[llvm] [AMDGPULowerBufferFatPointers] Fix offset-only ptrtoint (PR #95543)
Nikita Popov via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 14 06:25:27 PDT 2024
https://github.com/nikic created https://github.com/llvm/llvm-project/pull/95543
For a ptrtoint that truncates to the offset only, the expansion generated a left shift by the full bit width of the result type, which produces poison. Instead, we should return the offset directly.
(The same problem exists for the constant expression case, but I plan to address that separately, and more comprehensively.)
>From edeb965130f285d12196cc865f8d4e745ef49940 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 14 Jun 2024 15:18:00 +0200
Subject: [PATCH] [AMDGPULowerBufferFatPointers] Fix offset-only ptrtoint
For ptrtoint that truncates to the offset only, the expansion
generated a shift by the bit width, which is poison. Instead, we
should return the offset directly.
(The same problem exists for the constant expression case, but I
plan to address that differently.)
---
.../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 30 +-
.../buffer-fat-pointer-atomicrmw-fadd.ll | 1203 +++++++++-------
.../buffer-fat-pointer-atomicrmw-fmax.ll | 1235 ++++++++++-------
.../buffer-fat-pointer-atomicrmw-fmin.ll | 1235 ++++++++++-------
.../lower-buffer-fat-pointers-pointer-ops.ll | 3 +-
5 files changed, 2165 insertions(+), 1541 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 0b261d8e33907..dfe0583767313 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1435,20 +1435,22 @@ PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) {
const DataLayout &DL = PI.getModule()->getDataLayout();
unsigned FatPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER);
- Value *RsrcInt;
- if (Width <= BufferOffsetWidth)
- RsrcInt = ConstantExpr::getIntegerValue(ResTy, APInt::getZero(Width));
- else
- RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc");
- copyMetadata(RsrcInt, &PI);
-
- Value *Shl = IRB.CreateShl(
- RsrcInt,
- ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)), "",
- Width >= FatPtrWidth, Width > FatPtrWidth);
- Value *OffCast =
- IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, PI.getName() + ".off");
- Value *Res = IRB.CreateOr(Shl, OffCast);
+ Value *Res;
+ if (Width <= BufferOffsetWidth) {
+ Res = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false,
+ PI.getName() + ".off");
+ } else {
+ Value *RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc");
+ Value *Shl = IRB.CreateShl(
+ RsrcInt,
+ ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)),
+ "", Width >= FatPtrWidth, Width > FatPtrWidth);
+ Value *OffCast = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false,
+ PI.getName() + ".off");
+ Res = IRB.CreateOr(Shl, OffCast);
+ }
+
+ copyMetadata(Res, &PI);
Res->takeName(&PI);
SplitUsers.insert(&PI);
PI.replaceAllUsesWith(Res);
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 2f4606035376d..b81730803d4a9 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -1814,22 +1814,27 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s4, 0x200
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s4, s4, -4
-; GFX12-NEXT: v_mov_b32_e32 v5, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v5, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, 24, v2
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX12-NEXT: v_and_or_b32 v1, 0xffffff, v2, v1
+; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1837,31 +1842,36 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v3
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s4, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
-; GFX940-NEXT: s_mov_b32 s6, 0xffffff
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3
+; GFX940-NEXT: v_add_f16_e32 v2, v2, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
@@ -1872,30 +1882,34 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX940-NEXT: s_cbranch_execnz .LBB6_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s4, 0x200
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s4, s4, -4
-; GFX11-NEXT: v_mov_b32_e32 v5, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v5, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_not_b32 s6, s5
; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 24, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f16_e32 v1, v1, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX11-NEXT: v_and_or_b32 v1, 0xffffff, v2, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
@@ -1904,31 +1918,35 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB6_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v3
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s8, 0x200
-; GFX10-NEXT: s_and_b32 s8, s8, -4
-; GFX10-NEXT: v_mov_b32_e32 v5, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v5, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
; GFX10-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, s8, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX10-NEXT: v_and_or_b32 v1, 0xffffff, v2, v1
+; GFX10-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v1, v2, s10, v1
; GFX10-NEXT: v_mov_b32_e32 v4, v2
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc
@@ -1937,28 +1955,33 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX10-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_cbranch_execnz .LBB6_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s8, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s9
; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b32 s10, 0xffffff
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s10, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s10, v3
+; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s10, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s11, v2
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1970,23 +1993,28 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s8, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v5, s8
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v5, s9
; GFX908-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b32 s10, 0xffffff
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
; GFX908-NEXT: s_mov_b64 s[8:9], 0
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX908-NEXT: v_and_or_b32 v1, v2, s10, v1
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, s10, v2
+; GFX908-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, s10, v1
+; GFX908-NEXT: v_and_or_b32 v1, v2, s11, v1
; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc
@@ -1999,22 +2027,28 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX908-NEXT: s_cbranch_execnz .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s8, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v5, s8
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, s10, v2
+; GFX8-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX8-NEXT: v_and_b32_e32 v3, s11, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, s10, v1
; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: v_mov_b32_e32 v3, v1
@@ -2028,28 +2062,32 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX8-NEXT: s_cbranch_execnz .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s8, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s9
; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: s_not_b32 s11, s8
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffffff, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s11, v1
; GFX7-NEXT: v_add_f32_e32 v0, v0, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -2063,7 +2101,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX7-NEXT: s_cbranch_execnz .LBB6_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -2071,22 +2109,26 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s8, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s8
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s9
; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: s_not_b32 s11, s8
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s11, v1
; GFX6-NEXT: v_add_f32_e32 v0, v0, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: v_mov_b32_e32 v2, v0
@@ -2100,7 +2142,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX6-NEXT: s_cbranch_execnz .LBB6_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2118,22 +2160,27 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s4, 0x200
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s4, s4, -4
-; GFX12-NEXT: v_mov_b32_e32 v3, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v3, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, 24, v2
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX12-NEXT: v_and_or_b32 v1, 0xffffff, v2, v1
+; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -2141,30 +2188,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s4, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
-; GFX940-NEXT: s_mov_b32 s6, 0xffffff
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3
+; GFX940-NEXT: v_add_f16_e32 v2, v2, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
@@ -2181,23 +2233,27 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s4, 0x200
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s4, s4, -4
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v3, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_not_b32 s6, s5
; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 24, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f16_e32 v1, v1, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX11-NEXT: v_and_or_b32 v1, 0xffffff, v2, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
@@ -2206,30 +2262,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX11-NEXT: v_mov_b32_e32 v2, v4
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s8, 0x200
-; GFX10-NEXT: s_and_b32 s8, s8, -4
-; GFX10-NEXT: v_mov_b32_e32 v3, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v3, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
; GFX10-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, s8, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX10-NEXT: v_and_or_b32 v1, 0xffffff, v2, v1
+; GFX10-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v1, v2, s10, v1
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
@@ -2238,27 +2298,32 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX10-NEXT: v_mov_b32_e32 v2, v4
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_cbranch_execnz .LBB7_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s8, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s9
; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b32 s10, 0xffffff
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f16_sdwa v2, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s10, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s10, v3
+; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s10, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s11, v2
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2276,16 +2341,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7)
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s8, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v3, s8
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v3, s9
; GFX908-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b32 s10, 0xffffff
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
; GFX908-NEXT: s_mov_b64 s[8:9], 0
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX908-NEXT: v_and_or_b32 v1, v2, s10, v1
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, s10, v2
+; GFX908-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, s10, v1
+; GFX908-NEXT: v_and_or_b32 v1, v2, s11, v1
; GFX908-NEXT: v_mov_b32_e32 v5, v2
; GFX908-NEXT: v_mov_b32_e32 v4, v1
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
@@ -2304,15 +2374,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s8, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v3, s8
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, s10, v2
+; GFX8-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX8-NEXT: v_and_b32_e32 v4, s11, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, s10, v1
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v1
@@ -2332,21 +2408,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s8, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: s_not_b32 s11, s8
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffffff, v1
+; GFX7-NEXT: v_and_b32_e32 v4, s11, v1
; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v1
; GFX7-NEXT: v_mov_b32_e32 v4, v0
@@ -2366,22 +2446,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s8, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s9
; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: s_not_b32 s11, s8
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v1
+; GFX6-NEXT: v_and_b32_e32 v4, s11, v1
; GFX6-NEXT: v_add_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v1
; GFX6-NEXT: v_mov_b32_e32 v4, v0
@@ -2410,10 +2494,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v4, -4, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX12-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX12-NEXT: v_not_b32_e32 v11, v7
; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -2425,7 +2514,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen
+; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2:
@@ -2435,15 +2524,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB8_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v6, 24, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_f16_e32 v6, v6, v5
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX12-NEXT: v_and_or_b32 v6, 0xffffff, v7, v6
+; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -2459,7 +2548,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v4, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -2474,15 +2563,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_cbranch_execnz .LBB8_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v8
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX940-NEXT: v_and_b32_e32 v4, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v10, 0xffffff
+; GFX940-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v11, v6
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -2494,7 +2587,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen
+; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB8_1
; GFX940-NEXT: ; %bb.2:
@@ -2504,9 +2597,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB8_4 Depth 2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_add_f16_sdwa v6, v7, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX940-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6
; GFX940-NEXT: s_mov_b64 s[8:9], exec
-; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v6
; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -2521,7 +2616,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB8_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -2535,17 +2630,22 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX940-NEXT: s_cbranch_execnz .LBB8_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v8
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v4, -4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX11-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX11-NEXT: v_not_b32_e32 v11, v7
; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -2557,7 +2657,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen
+; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB8_1
; GFX11-NEXT: ; %bb.2:
@@ -2567,15 +2667,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB8_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f16_e32 v6, v6, v5
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX11-NEXT: v_and_or_b32 v6, 0xffffff, v7, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -2591,7 +2691,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v4, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB8_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -2607,16 +2707,20 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX11-NEXT: s_cbranch_execnz .LBB8_3
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: v_and_b32_e32 v4, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX10-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v11, v7
; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -2626,7 +2730,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB8_1
@@ -2636,12 +2740,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB8_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f16_sdwa v6, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT: v_and_or_b32 v6, 0xffffff, v7, v6
+; GFX10-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6
; GFX10-NEXT: v_mov_b32_e32 v9, v7
; GFX10-NEXT: v_mov_b32_e32 v8, v6
; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -2655,7 +2759,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB8_4
@@ -2672,15 +2776,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX10-NEXT: s_cbranch_execnz .LBB8_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX90A-NEXT: v_and_b32_e32 v4, -4, v4
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0xffffff
+; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v11, v6
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -2692,7 +2800,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
; GFX90A-NEXT: ; %bb.2:
@@ -2702,8 +2810,10 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f16_sdwa v6, v7, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v6
+; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX90A-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -2717,7 +2827,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB8_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -2731,15 +2841,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX90A-NEXT: s_cbranch_execnz .LBB8_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v8
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX908-NEXT: v_and_b32_e32 v4, -4, v4
-; GFX908-NEXT: v_mov_b32_e32 v10, 0xffffff
+; GFX908-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v11, v6
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -2751,7 +2865,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB8_1
; GFX908-NEXT: ; %bb.2:
@@ -2761,8 +2875,10 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB8_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f16_sdwa v6, v7, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX908-NEXT: v_and_or_b32 v6, v7, v10, v6
+; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX908-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6
; GFX908-NEXT: v_mov_b32_e32 v9, v7
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_mov_b32_e32 v8, v6
@@ -2777,7 +2893,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB8_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -2791,14 +2907,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX908-NEXT: s_cbranch_execnz .LBB8_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v8
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
-; GFX8-NEXT: v_and_b32_e32 v4, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v11, v6
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -2810,7 +2931,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB8_1
; GFX8-NEXT: ; %bb.2:
@@ -2820,8 +2941,10 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB8_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v6, v7, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v8, 0xffffff, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX8-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX8-NEXT: v_and_b32_e32 v8, v7, v11
; GFX8-NEXT: v_or_b32_e32 v6, v8, v6
; GFX8-NEXT: v_mov_b32_e32 v9, v7
; GFX8-NEXT: s_mov_b64 s[12:13], exec
@@ -2837,7 +2960,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB8_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -2851,14 +2974,18 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX8-NEXT: s_cbranch_execnz .LBB8_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX7-NEXT: v_not_b32_e32 v9, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -2869,25 +2996,25 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB8_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4
; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB8_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v10
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: v_mov_b32_e32 v5, v6
@@ -2902,7 +3029,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB8_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -2916,7 +3043,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX7-NEXT: s_cbranch_execnz .LBB8_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -2924,7 +3051,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT: v_not_b32_e32 v9, v4
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -2935,25 +3066,25 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB8_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4
; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB8_4 Depth 2
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
; GFX6-NEXT: s_mov_b64 s[12:13], exec
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX6-NEXT: v_add_f32_e32 v4, v4, v10
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: v_mov_b32_e32 v5, v6
@@ -2968,7 +3099,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB8_4
; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -2982,7 +3113,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX6-NEXT: s_cbranch_execnz .LBB8_3
; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -3005,15 +3136,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s4, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX12-NEXT: s_and_b32 s4, s4, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v4, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -3027,8 +3162,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX12-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -3036,42 +3171,43 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s4, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v4, s5
; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
-; GFX940-NEXT: s_mov_b32 s6, 0xffffff
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX940-NEXT: s_movk_i32 s7, 0x7fff
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: v_add_f32_e32 v0, v0, v5
; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v2, v2, v0, s7
+; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX940-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -3083,7 +3219,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX940-NEXT: s_cbranch_execnz .LBB9_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
@@ -3091,16 +3227,20 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s4, 0x200
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT: s_and_b32 s4, s4, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v4, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_not_b32 s6, s5
; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -3114,8 +3254,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
@@ -3124,13 +3264,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB9_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
@@ -3138,25 +3278,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s8, 0x200
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: s_and_b32 s8, s8, -4
-; GFX10-NEXT: v_mov_b32_e32 v4, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v4, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v5
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
@@ -3165,39 +3307,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_cbranch_execnz .LBB9_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s8, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v4, s8
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s9
; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b32 s10, 0xffffff
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX90A-NEXT: s_movk_i32 s11, 0x7fff
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5
; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s11
+; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -3209,34 +3352,35 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s8, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v4, s8
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v4, s9
; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b32 s10, 0xffffff
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
; GFX908-NEXT: s_mov_b64 s[8:9], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX908-NEXT: s_movk_i32 s11, 0x7fff
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_add_f32_e32 v0, v0, v5
; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v2, v2, v0, s11
+; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX908-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
@@ -3249,33 +3393,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX908-NEXT: s_cbranch_execnz .LBB9_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s8, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, v0
@@ -3289,28 +3436,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX8-NEXT: s_cbranch_execnz .LBB9_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s8, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s9
; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_not_b32 s11, s8
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v0, v0, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffffff, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -3324,7 +3475,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX7-NEXT: s_cbranch_execnz .LBB9_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -3332,22 +3483,26 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s8, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s8
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s9
; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: s_not_b32 s11, s8
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_add_f32_e32 v0, v0, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX6-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: v_mov_b32_e32 v2, v0
@@ -3361,7 +3516,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX6-NEXT: s_cbranch_execnz .LBB9_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -3380,15 +3535,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s4, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-NEXT: s_and_b32 s4, s4, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v2, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -3402,8 +3561,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX12-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -3411,41 +3570,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s4, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v2, s5
; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
-; GFX940-NEXT: s_mov_b32 s6, 0xffffff
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX940-NEXT: s_movk_i32 s7, 0x7fff
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: v_add_f32_e32 v0, v0, v3
; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v4, v4, v0, s7
+; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX940-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -3464,16 +3624,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s4, 0x200
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: s_and_b32 s4, s4, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v2, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_not_b32 s6, s5
; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -3487,8 +3651,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
@@ -3497,12 +3661,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB10_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset:
@@ -3510,25 +3674,27 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s8, 0x200
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: s_and_b32 s8, s8, -4
-; GFX10-NEXT: v_mov_b32_e32 v2, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v2, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v3
; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
@@ -3537,38 +3703,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s8, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s9
; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b32 s10, 0xffffff
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX90A-NEXT: s_movk_i32 s11, 0x7fff
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3
; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s11
+; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -3586,27 +3753,28 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s8, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v2, s8
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b32 s10, 0xffffff
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
; GFX908-NEXT: s_mov_b64 s[8:9], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX908-NEXT: s_movk_i32 s11, 0x7fff
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_add_f32_e32 v0, v0, v3
; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v4, v4, v0, s11
+; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX908-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: v_mov_b32_e32 v4, v0
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
@@ -3625,26 +3793,29 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s8, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v4, v0
@@ -3664,21 +3835,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s8, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_not_b32 s11, s8
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffffff, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v1
; GFX7-NEXT: v_mov_b32_e32 v4, v0
@@ -3698,22 +3873,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s8, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s9
; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: s_not_b32 s11, s8
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_add_f32_e32 v0, v0, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX6-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v1
; GFX6-NEXT: v_mov_b32_e32 v4, v0
@@ -3744,8 +3923,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-NEXT: v_not_b32_e32 v9, v6
; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -3757,34 +3941,34 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: buffer_load_b32 v6, v7, s[4:7], null offen
+; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB11_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX12-NEXT: v_add_f32_e32 v4, v4, v10
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX12-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
@@ -3801,7 +3985,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -3816,15 +4000,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_cbranch_execnz .LBB11_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX940-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v9, 0xffffff
+; GFX940-NEXT: v_and_b32_e32 v9, -4, v4
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0
+; GFX940-NEXT: v_not_b32_e32 v10, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -3836,31 +4024,30 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v7, v8, s[4:7], 0 offen
+; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB11_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5
; GFX940-NEXT: s_movk_i32 s10, 0x7fff
; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB11_4 Depth 2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, 24, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX940-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_mov_b64 s[8:9], exec
+; GFX940-NEXT: v_add_f32_e32 v4, v4, v11
; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10
; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: s_mov_b64 s[8:9], exec
; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX940-NEXT: v_and_or_b32 v6, v7, v9, v4
+; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
@@ -3874,7 +4061,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB11_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -3888,7 +4075,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX940-NEXT: s_cbranch_execnz .LBB11_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
@@ -3897,8 +4084,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-NEXT: v_not_b32_e32 v9, v6
; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -3910,35 +4102,35 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v7, s[4:7], 0 offen
+; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB11_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB11_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX11-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v4, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v6
@@ -3955,7 +4147,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB11_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -3972,7 +4164,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
@@ -3981,7 +4173,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX10-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v9, v6
; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -3991,30 +4187,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB11_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v10
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX10-NEXT: v_mov_b32_e32 v4, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v6
; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
@@ -4028,7 +4222,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_4
@@ -4045,15 +4239,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX10-NEXT: s_cbranch_execnz .LBB11_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX90A-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, 0xffffff
+; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4
+; GFX90A-NEXT: v_not_b32_e32 v10, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -4065,29 +4263,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v8, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5
; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, 24, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX90A-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, v4, v11
; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v9, v4
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
@@ -4101,7 +4297,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4115,15 +4311,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX90A-NEXT: s_cbranch_execnz .LBB11_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX908-NEXT: v_and_b32_e32 v7, -4, v4
-; GFX908-NEXT: v_mov_b32_e32 v8, 0xffffff
+; GFX908-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX908-NEXT: v_not_b32_e32 v9, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -4135,29 +4335,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX908-NEXT: s_movk_i32 s14, 0x7fff
; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB11_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, 24, v6
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX908-NEXT: v_add_f32_e32 v4, v4, v9
+; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v4, v4, v10
; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX908-NEXT: v_and_or_b32 v5, v6, v8, v4
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_mov_b32_e32 v5, v6
@@ -4172,7 +4370,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4186,14 +4384,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX908-NEXT: s_cbranch_execnz .LBB11_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
-; GFX8-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX8-NEXT: v_not_b32_e32 v9, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -4205,29 +4408,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB11_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v4, v4, v10
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
@@ -4243,7 +4444,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4257,14 +4458,18 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX8-NEXT: s_cbranch_execnz .LBB11_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX7-NEXT: v_not_b32_e32 v9, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -4275,24 +4480,24 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB11_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v10
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
@@ -4308,7 +4513,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4322,7 +4527,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX7-NEXT: s_cbranch_execnz .LBB11_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -4330,7 +4535,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT: v_not_b32_e32 v9, v4
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -4341,24 +4550,24 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB11_4 Depth 2
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX6-NEXT: v_add_f32_e32 v4, v4, v10
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
@@ -4374,7 +4583,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_4
; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4388,7 +4597,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX6-NEXT: s_cbranch_execnz .LBB11_3
; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 684fa2d7df60a..fb068e35fc597 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -2354,57 +2354,66 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s4, 0x200
; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0
-; GFX12-NEXT: s_and_b32 s4, s4, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v4, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s4, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v4, s5
; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
-; GFX940-NEXT: s_mov_b32 s6, 0xffffff
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: v_max_f16_e32 v5, v0, v0
; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX940-NEXT: v_max_f16_sdwa v0, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX940-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX940-NEXT: v_max_f16_e32 v0, v0, v5
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
@@ -2415,7 +2424,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX940-NEXT: s_cbranch_execnz .LBB6_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
@@ -2423,25 +2432,29 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s4, 0x200
; GFX11-NEXT: v_max_f16_e32 v5, v0, v0
-; GFX11-NEXT: s_and_b32 s4, s4, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v4, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_not_b32 s6, s5
; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_max_f16_e32 v0, v0, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2449,13 +2462,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB6_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
@@ -2463,19 +2476,23 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s8, 0x200
; GFX10-NEXT: v_max_f16_e32 v5, v0, v0
-; GFX10-NEXT: s_and_b32 s8, s8, -4
-; GFX10-NEXT: v_mov_b32_e32 v4, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v4, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_max_f16_e32 v0, v0, v5
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
@@ -2484,30 +2501,35 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_cbranch_execnz .LBB6_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s8, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v4, s8
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s9
; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b32 s10, 0xffffff
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX90A-NEXT: v_max_f16_sdwa v0, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2519,25 +2541,30 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s8, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v4, s8
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v4, s9
; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b32 s10, 0xffffff
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
; GFX908-NEXT: s_mov_b64 s[8:9], 0
; GFX908-NEXT: v_max_f16_e32 v5, v0, v0
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX908-NEXT: v_max_f16_sdwa v0, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
@@ -2550,24 +2577,30 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX908-NEXT: s_cbranch_execnz .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s8, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: v_max_f16_e32 v5, v0, v0
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v5
+; GFX8-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, v0
@@ -2581,28 +2614,32 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX8-NEXT: s_cbranch_execnz .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s8, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s9
; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: s_not_b32 s11, s8
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffffff, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s11, v1
; GFX7-NEXT: v_max_f32_e32 v0, v0, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -2616,7 +2653,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX7-NEXT: s_cbranch_execnz .LBB6_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -2624,22 +2661,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s8, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s8
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s9
; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: s_not_b32 s11, s8
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s11, v1
; GFX6-NEXT: v_max_f32_e32 v0, v0, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: v_mov_b32_e32 v2, v0
@@ -2653,7 +2694,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX6-NEXT: s_cbranch_execnz .LBB6_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2672,56 +2713,65 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s4, 0x200
; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
-; GFX12-NEXT: s_and_b32 s4, s4, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v2, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s4, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v2, s5
; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
-; GFX940-NEXT: s_mov_b32 s6, 0xffffff
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: v_max_f16_e32 v3, v0, v0
; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX940-NEXT: v_max_f16_sdwa v0, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX940-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX940-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
@@ -2739,25 +2789,29 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s4, 0x200
; GFX11-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX11-NEXT: s_and_b32 s4, s4, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v2, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_not_b32 s6, s5
; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_max_f16_e32 v0, v0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2765,12 +2819,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
@@ -2778,19 +2832,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s8, 0x200
; GFX10-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX10-NEXT: s_and_b32 s8, s8, -4
-; GFX10-NEXT: v_mov_b32_e32 v2, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v2, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_max_f16_e32 v0, v0, v3
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
@@ -2799,29 +2857,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_cbranch_execnz .LBB7_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s8, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s9
; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b32 s10, 0xffffff
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0
; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX90A-NEXT: v_max_f16_sdwa v0, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2839,18 +2902,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s8, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v2, s8
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b32 s10, 0xffffff
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
; GFX908-NEXT: s_mov_b64 s[8:9], 0
; GFX908-NEXT: v_max_f16_e32 v3, v0, v0
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX908-NEXT: v_max_f16_sdwa v0, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: v_mov_b32_e32 v4, v0
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
@@ -2869,17 +2937,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s8, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX8-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v4, v0
@@ -2899,21 +2973,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s8, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: s_not_b32 s11, s8
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffffff, v1
+; GFX7-NEXT: v_and_b32_e32 v4, s11, v1
; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v1
; GFX7-NEXT: v_mov_b32_e32 v4, v0
@@ -2933,22 +3011,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s8, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s9
; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: s_not_b32 s11, s8
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v1
+; GFX6-NEXT: v_and_b32_e32 v4, s11, v1
; GFX6-NEXT: v_max_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v1
; GFX6-NEXT: v_mov_b32_e32 v4, v0
@@ -2979,8 +3061,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-NEXT: v_not_b32_e32 v9, v6
; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -2992,28 +3079,28 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: buffer_load_b32 v6, v7, s[4:7], null offen
+; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_max_num_f16_e32 v8, v5, v5
+; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB8_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
-; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v8
+; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -3029,7 +3116,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3044,15 +3131,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_cbranch_execnz .LBB8_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX940-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v9, 0xffffff
+; GFX940-NEXT: v_and_b32_e32 v9, -4, v4
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0
+; GFX940-NEXT: v_not_b32_e32 v10, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -3064,21 +3155,23 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v7, v8, s[4:7], 0 offen
+; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB8_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_max_f16_e32 v10, v5, v5
+; GFX940-NEXT: v_max_f16_e32 v11, v5, v5
; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB8_4 Depth 2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX940-NEXT: v_max_f16_sdwa v4, v4, v10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7
+; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX940-NEXT: v_max_f16_e32 v4, v4, v11
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX940-NEXT: s_mov_b64 s[8:9], exec
-; GFX940-NEXT: v_and_or_b32 v6, v7, v9, v4
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -3093,7 +3186,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB8_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3107,7 +3200,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX940-NEXT: s_cbranch_execnz .LBB8_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
@@ -3116,8 +3209,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-NEXT: v_not_b32_e32 v9, v6
; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -3129,28 +3227,28 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v7, s[4:7], 0 offen
+; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB8_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_max_f16_e32 v8, v5, v5
+; GFX11-NEXT: v_max_f16_e32 v10, v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB8_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX11-NEXT: v_max_f16_e32 v4, v4, v8
+; GFX11-NEXT: v_max_f16_e32 v4, v4, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX11-NEXT: v_mov_b32_e32 v4, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v6
; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -3166,7 +3264,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB8_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3182,7 +3280,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX11-NEXT: s_cbranch_execnz .LBB8_3
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
@@ -3191,7 +3289,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX10-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v9, v6
; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -3201,24 +3303,24 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB8_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
-; GFX10-NEXT: v_max_f16_e32 v8, v5, v5
+; GFX10-NEXT: v_max_f16_e32 v10, v5, v5
; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB8_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f16_e32 v4, v4, v8
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX10-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f16_e32 v4, v4, v10
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX10-NEXT: v_mov_b32_e32 v4, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v6
; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -3232,7 +3334,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB8_4
@@ -3249,15 +3351,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX10-NEXT: s_cbranch_execnz .LBB8_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX90A-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, 0xffffff
+; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4
+; GFX90A-NEXT: v_not_b32_e32 v10, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -3269,20 +3375,22 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v8, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_max_f16_e32 v10, v5, v5
+; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5
; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX90A-NEXT: v_max_f16_sdwa v4, v4, v10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v9, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v11
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -3296,7 +3404,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB8_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3310,15 +3418,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX90A-NEXT: s_cbranch_execnz .LBB8_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX908-NEXT: v_and_b32_e32 v7, -4, v4
-; GFX908-NEXT: v_mov_b32_e32 v8, 0xffffff
+; GFX908-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX908-NEXT: v_not_b32_e32 v9, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -3330,20 +3442,22 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB8_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_max_f16_e32 v9, v5, v5
+; GFX908-NEXT: v_max_f16_e32 v10, v5, v5
; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB8_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX908-NEXT: v_max_f16_sdwa v4, v4, v9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_and_or_b32 v5, v6, v8, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX908-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX908-NEXT: v_max_f16_e32 v4, v4, v10
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_mov_b32_e32 v5, v6
@@ -3358,7 +3472,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB8_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3372,14 +3486,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX908-NEXT: s_cbranch_execnz .LBB8_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
-; GFX8-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX8-NEXT: v_not_b32_e32 v9, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -3391,20 +3510,22 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB8_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_max_f16_e32 v8, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v10, v5, v5
; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB8_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX8-NEXT: v_max_f16_sdwa v4, v4, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
@@ -3420,7 +3541,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB8_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3434,14 +3555,18 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX8-NEXT: s_cbranch_execnz .LBB8_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX7-NEXT: v_not_b32_e32 v9, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -3452,25 +3577,25 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB8_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4
; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB8_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v8
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v10
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: v_mov_b32_e32 v5, v6
@@ -3485,7 +3610,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB8_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3499,7 +3624,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX7-NEXT: s_cbranch_execnz .LBB8_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -3507,7 +3632,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT: v_not_b32_e32 v9, v4
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -3518,25 +3647,25 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB8_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4
; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB8_4 Depth 2
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
; GFX6-NEXT: s_mov_b64 s[12:13], exec
-; GFX6-NEXT: v_max_f32_e32 v4, v4, v8
+; GFX6-NEXT: v_max_f32_e32 v4, v4, v10
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: v_mov_b32_e32 v5, v6
@@ -3551,7 +3680,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB8_4
; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3565,7 +3694,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX6-NEXT: s_cbranch_execnz .LBB8_3
; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -3588,15 +3717,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s4, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX12-NEXT: s_and_b32 s4, s4, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v4, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -3610,8 +3743,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX12-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -3619,42 +3752,43 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s4, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v4, s5
; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
-; GFX940-NEXT: s_mov_b32 s6, 0xffffff
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX940-NEXT: s_movk_i32 s7, 0x7fff
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: v_max_f32_e32 v0, v0, v5
; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v2, v2, v0, s7
+; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX940-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -3666,7 +3800,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX940-NEXT: s_cbranch_execnz .LBB9_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
@@ -3674,16 +3808,20 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s4, 0x200
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT: s_and_b32 s4, s4, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v4, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_not_b32 s6, s5
; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -3697,8 +3835,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
@@ -3707,13 +3845,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB9_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
@@ -3721,25 +3859,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s8, 0x200
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: s_and_b32 s8, s8, -4
-; GFX10-NEXT: v_mov_b32_e32 v4, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v4, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_max_f32_e32 v0, v0, v5
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
@@ -3748,39 +3888,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_cbranch_execnz .LBB9_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s8, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v4, s8
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s9
; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b32 s10, 0xffffff
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX90A-NEXT: s_movk_i32 s11, 0x7fff
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_max_f32_e32 v0, v0, v5
; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s11
+; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -3792,34 +3933,35 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s8, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v4, s8
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v4, s9
; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b32 s10, 0xffffff
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
; GFX908-NEXT: s_mov_b64 s[8:9], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX908-NEXT: s_movk_i32 s11, 0x7fff
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_max_f32_e32 v0, v0, v5
; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v2, v2, v0, s11
+; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX908-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
@@ -3832,33 +3974,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX908-NEXT: s_cbranch_execnz .LBB9_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s8, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, v0
@@ -3872,29 +4017,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX8-NEXT: s_cbranch_execnz .LBB9_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s8, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s9
; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_not_b32 s11, s8
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffffff, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -3908,7 +4057,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX7-NEXT: s_cbranch_execnz .LBB9_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -3916,23 +4065,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s8, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s8
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s9
; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: s_not_b32 s11, s8
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_max_f32_e32 v0, v0, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX6-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: v_mov_b32_e32 v2, v0
@@ -3946,7 +4099,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX6-NEXT: s_cbranch_execnz .LBB9_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -3965,15 +4118,19 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s4, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-NEXT: s_and_b32 s4, s4, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v2, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -3987,8 +4144,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX12-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -3996,41 +4153,42 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s4, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v2, s5
; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
-; GFX940-NEXT: s_mov_b32 s6, 0xffffff
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX940-NEXT: s_movk_i32 s7, 0x7fff
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: v_max_f32_e32 v0, v0, v3
; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v4, v4, v0, s7
+; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX940-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4049,16 +4207,20 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s4, 0x200
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: s_and_b32 s4, s4, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v2, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_not_b32 s6, s5
; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -4072,8 +4234,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
@@ -4082,12 +4244,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB10_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
@@ -4095,25 +4257,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s8, 0x200
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: s_and_b32 s8, s8, -4
-; GFX10-NEXT: v_mov_b32_e32 v2, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v2, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_max_f32_e32 v0, v0, v3
; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
@@ -4122,38 +4286,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s8, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s9
; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b32 s10, 0xffffff
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX90A-NEXT: s_movk_i32 s11, 0x7fff
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3
; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s11
+; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4171,27 +4336,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s8, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v2, s8
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b32 s10, 0xffffff
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
; GFX908-NEXT: s_mov_b64 s[8:9], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX908-NEXT: s_movk_i32 s11, 0x7fff
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_max_f32_e32 v0, v0, v3
; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v4, v4, v0, s11
+; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX908-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: v_mov_b32_e32 v4, v0
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
@@ -4210,26 +4376,29 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s8, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v4, v0
@@ -4249,22 +4418,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s8, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_not_b32 s11, s8
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffffff, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v1
; GFX7-NEXT: v_mov_b32_e32 v4, v0
@@ -4284,23 +4457,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s8, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s9
; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: s_not_b32 s11, s8
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_max_f32_e32 v0, v0, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX6-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v1
; GFX6-NEXT: v_mov_b32_e32 v4, v0
@@ -4331,8 +4508,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-NEXT: v_not_b32_e32 v9, v6
; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -4344,34 +4526,34 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: buffer_load_b32 v6, v7, s[4:7], null offen
+; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB11_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v8
+; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v10
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX12-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
@@ -4388,7 +4570,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4403,15 +4585,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_cbranch_execnz .LBB11_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX940-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v9, 0xffffff
+; GFX940-NEXT: v_and_b32_e32 v9, -4, v4
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0
+; GFX940-NEXT: v_not_b32_e32 v10, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -4423,31 +4609,30 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v7, v8, s[4:7], 0 offen
+; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB11_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5
; GFX940-NEXT: s_movk_i32 s10, 0x7fff
; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB11_4 Depth 2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, 24, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v10
+; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_mov_b64 s[8:9], exec
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v11
; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10
; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: s_mov_b64 s[8:9], exec
; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX940-NEXT: v_and_or_b32 v6, v7, v9, v4
+; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
@@ -4461,7 +4646,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB11_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4475,7 +4660,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX940-NEXT: s_cbranch_execnz .LBB11_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
@@ -4484,8 +4669,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-NEXT: v_not_b32_e32 v9, v6
; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -4497,35 +4687,35 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v7, s[4:7], 0 offen
+; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB11_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB11_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_max_f32_e32 v4, v4, v8
+; GFX11-NEXT: v_max_f32_e32 v4, v4, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX11-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v4, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v6
@@ -4542,7 +4732,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB11_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4559,7 +4749,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
@@ -4568,7 +4758,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX10-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v9, v6
; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -4578,30 +4772,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB11_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v8
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v10
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX10-NEXT: v_mov_b32_e32 v4, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v6
; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
@@ -4615,7 +4807,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_4
@@ -4632,15 +4824,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX10-NEXT: s_cbranch_execnz .LBB11_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX90A-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, 0xffffff
+; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4
+; GFX90A-NEXT: v_not_b32_e32 v10, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -4652,29 +4848,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v8, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5
; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, 24, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v10
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v11
; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v9, v4
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
@@ -4688,7 +4882,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4702,15 +4896,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX90A-NEXT: s_cbranch_execnz .LBB11_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX908-NEXT: v_and_b32_e32 v7, -4, v4
-; GFX908-NEXT: v_mov_b32_e32 v8, 0xffffff
+; GFX908-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX908-NEXT: v_not_b32_e32 v9, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -4722,29 +4920,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX908-NEXT: s_movk_i32 s14, 0x7fff
; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB11_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, 24, v6
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX908-NEXT: v_max_f32_e32 v4, v4, v9
+; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_max_f32_e32 v4, v4, v10
; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX908-NEXT: v_and_or_b32 v5, v6, v8, v4
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_mov_b32_e32 v5, v6
@@ -4759,7 +4955,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4773,14 +4969,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX908-NEXT: s_cbranch_execnz .LBB11_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
-; GFX8-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX8-NEXT: v_not_b32_e32 v9, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -4792,29 +4993,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB11_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_max_f32_e32 v4, v4, v8
+; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f32_e32 v4, v4, v10
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
@@ -4830,7 +5029,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4844,14 +5043,18 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX8-NEXT: s_cbranch_execnz .LBB11_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX7-NEXT: v_not_b32_e32 v9, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -4862,25 +5065,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB11_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v8
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v10
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
@@ -4896,7 +5099,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4910,7 +5113,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX7-NEXT: s_cbranch_execnz .LBB11_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -4918,7 +5121,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT: v_not_b32_e32 v9, v4
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -4929,25 +5136,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB11_4 Depth 2
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX6-NEXT: v_max_f32_e32 v4, v4, v8
+; GFX6-NEXT: v_max_f32_e32 v4, v4, v10
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
@@ -4963,7 +5170,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_4
; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4977,7 +5184,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX6-NEXT: s_cbranch_execnz .LBB11_3
; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 51f63c93af57b..89289c15dcae5 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -2354,57 +2354,66 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s4, 0x200
; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0
-; GFX12-NEXT: s_and_b32 s4, s4, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v4, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s4, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v4, s5
; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
-; GFX940-NEXT: s_mov_b32 s6, 0xffffff
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: v_max_f16_e32 v5, v0, v0
; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX940-NEXT: v_min_f16_sdwa v0, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX940-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX940-NEXT: v_min_f16_e32 v0, v0, v5
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
@@ -2415,7 +2424,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX940-NEXT: s_cbranch_execnz .LBB6_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
@@ -2423,25 +2432,29 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s4, 0x200
; GFX11-NEXT: v_max_f16_e32 v5, v0, v0
-; GFX11-NEXT: s_and_b32 s4, s4, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v4, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_not_b32 s6, s5
; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_min_f16_e32 v0, v0, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2449,13 +2462,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB6_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
@@ -2463,19 +2476,23 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s8, 0x200
; GFX10-NEXT: v_max_f16_e32 v5, v0, v0
-; GFX10-NEXT: s_and_b32 s8, s8, -4
-; GFX10-NEXT: v_mov_b32_e32 v4, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v4, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_min_f16_e32 v0, v0, v5
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
@@ -2484,30 +2501,35 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_cbranch_execnz .LBB6_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s8, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v4, s8
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s9
; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b32 s10, 0xffffff
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX90A-NEXT: v_min_f16_sdwa v0, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT: v_min_f16_e32 v0, v0, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2519,25 +2541,30 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s8, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v4, s8
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v4, s9
; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b32 s10, 0xffffff
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
; GFX908-NEXT: s_mov_b64 s[8:9], 0
; GFX908-NEXT: v_max_f16_e32 v5, v0, v0
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX908-NEXT: v_min_f16_sdwa v0, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT: v_min_f16_e32 v0, v0, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
@@ -2550,24 +2577,30 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX908-NEXT: s_cbranch_execnz .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s8, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: v_max_f16_e32 v5, v0, v0
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v1
-; GFX8-NEXT: v_min_f16_sdwa v0, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v5
+; GFX8-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, v0
@@ -2581,28 +2614,32 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX8-NEXT: s_cbranch_execnz .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s8, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s9
; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: s_not_b32 s11, s8
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffffff, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s11, v1
; GFX7-NEXT: v_min_f32_e32 v0, v0, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -2616,7 +2653,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX7-NEXT: s_cbranch_execnz .LBB6_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -2624,22 +2661,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s8, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s8
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s9
; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: s_not_b32 s11, s8
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s11, v1
; GFX6-NEXT: v_min_f32_e32 v0, v0, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: v_mov_b32_e32 v2, v0
@@ -2653,7 +2694,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX6-NEXT: s_cbranch_execnz .LBB6_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2672,56 +2713,65 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s4, 0x200
; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
-; GFX12-NEXT: s_and_b32 s4, s4, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v2, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s4, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v2, s5
; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
-; GFX940-NEXT: s_mov_b32 s6, 0xffffff
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: v_max_f16_e32 v3, v0, v0
; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX940-NEXT: v_min_f16_sdwa v0, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX940-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX940-NEXT: v_min_f16_e32 v0, v0, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
@@ -2739,25 +2789,29 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s4, 0x200
; GFX11-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX11-NEXT: s_and_b32 s4, s4, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v2, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_not_b32 s6, s5
; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_min_f16_e32 v0, v0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2765,12 +2819,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
@@ -2778,19 +2832,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s8, 0x200
; GFX10-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX10-NEXT: s_and_b32 s8, s8, -4
-; GFX10-NEXT: v_mov_b32_e32 v2, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v2, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_min_f16_e32 v0, v0, v3
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
@@ -2799,29 +2857,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_cbranch_execnz .LBB7_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s8, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s9
; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b32 s10, 0xffffff
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0
; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX90A-NEXT: v_min_f16_sdwa v0, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT: v_min_f16_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2839,18 +2902,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s8, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v2, s8
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b32 s10, 0xffffff
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
; GFX908-NEXT: s_mov_b64 s[8:9], 0
; GFX908-NEXT: v_max_f16_e32 v3, v0, v0
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX908-NEXT: v_min_f16_sdwa v0, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT: v_min_f16_e32 v0, v0, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: v_mov_b32_e32 v4, v0
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
@@ -2869,17 +2937,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s8, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v1
-; GFX8-NEXT: v_min_f16_sdwa v0, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v3
+; GFX8-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v4, v0
@@ -2899,21 +2973,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s8, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: s_not_b32 s11, s8
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffffff, v1
+; GFX7-NEXT: v_and_b32_e32 v4, s11, v1
; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v1
; GFX7-NEXT: v_mov_b32_e32 v4, v0
@@ -2933,22 +3011,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s8, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s9
; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: s_not_b32 s11, s8
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v1
+; GFX6-NEXT: v_and_b32_e32 v4, s11, v1
; GFX6-NEXT: v_min_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v1
; GFX6-NEXT: v_mov_b32_e32 v4, v0
@@ -2979,8 +3061,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-NEXT: v_not_b32_e32 v9, v6
; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -2992,28 +3079,28 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: buffer_load_b32 v6, v7, s[4:7], null offen
+; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_max_num_f16_e32 v8, v5, v5
+; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB8_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
-; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v8
+; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -3029,7 +3116,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3044,15 +3131,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_cbranch_execnz .LBB8_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX940-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v9, 0xffffff
+; GFX940-NEXT: v_and_b32_e32 v9, -4, v4
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0
+; GFX940-NEXT: v_not_b32_e32 v10, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -3064,21 +3155,23 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v7, v8, s[4:7], 0 offen
+; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB8_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_max_f16_e32 v10, v5, v5
+; GFX940-NEXT: v_max_f16_e32 v11, v5, v5
; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB8_4 Depth 2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX940-NEXT: v_min_f16_sdwa v4, v4, v10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7
+; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX940-NEXT: v_min_f16_e32 v4, v4, v11
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX940-NEXT: s_mov_b64 s[8:9], exec
-; GFX940-NEXT: v_and_or_b32 v6, v7, v9, v4
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -3093,7 +3186,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB8_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3107,7 +3200,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX940-NEXT: s_cbranch_execnz .LBB8_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
@@ -3116,8 +3209,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-NEXT: v_not_b32_e32 v9, v6
; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -3129,28 +3227,28 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v7, s[4:7], 0 offen
+; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB8_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_max_f16_e32 v8, v5, v5
+; GFX11-NEXT: v_max_f16_e32 v10, v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB8_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX11-NEXT: v_min_f16_e32 v4, v4, v8
+; GFX11-NEXT: v_min_f16_e32 v4, v4, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX11-NEXT: v_mov_b32_e32 v4, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v6
; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -3166,7 +3264,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB8_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3182,7 +3280,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX11-NEXT: s_cbranch_execnz .LBB8_3
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
@@ -3191,7 +3289,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX10-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v9, v6
; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -3201,24 +3303,24 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB8_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
-; GFX10-NEXT: v_max_f16_e32 v8, v5, v5
+; GFX10-NEXT: v_max_f16_e32 v10, v5, v5
; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB8_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_f16_e32 v4, v4, v8
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX10-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX10-NEXT: v_min_f16_e32 v4, v4, v10
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX10-NEXT: v_mov_b32_e32 v4, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v6
; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -3232,7 +3334,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB8_4
@@ -3249,15 +3351,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX10-NEXT: s_cbranch_execnz .LBB8_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX90A-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, 0xffffff
+; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4
+; GFX90A-NEXT: v_not_b32_e32 v10, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -3269,20 +3375,22 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v8, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_max_f16_e32 v10, v5, v5
+; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5
; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX90A-NEXT: v_min_f16_sdwa v4, v4, v10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v9, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_min_f16_e32 v4, v4, v11
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
@@ -3296,7 +3404,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB8_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3310,15 +3418,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX90A-NEXT: s_cbranch_execnz .LBB8_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX908-NEXT: v_and_b32_e32 v7, -4, v4
-; GFX908-NEXT: v_mov_b32_e32 v8, 0xffffff
+; GFX908-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX908-NEXT: v_not_b32_e32 v9, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -3330,20 +3442,22 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB8_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_max_f16_e32 v9, v5, v5
+; GFX908-NEXT: v_max_f16_e32 v10, v5, v5
; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB8_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX908-NEXT: v_min_f16_sdwa v4, v4, v9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_and_or_b32 v5, v6, v8, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX908-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX908-NEXT: v_min_f16_e32 v4, v4, v10
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_mov_b32_e32 v5, v6
@@ -3358,7 +3472,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB8_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3372,14 +3486,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX908-NEXT: s_cbranch_execnz .LBB8_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
-; GFX8-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX8-NEXT: v_not_b32_e32 v9, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -3391,20 +3510,22 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB8_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_max_f16_e32 v8, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v10, v5, v5
; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB8_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX8-NEXT: v_min_f16_sdwa v4, v4, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_min_f16_e32 v4, v4, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
@@ -3420,7 +3541,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB8_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3434,14 +3555,18 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX8-NEXT: s_cbranch_execnz .LBB8_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX7-NEXT: v_not_b32_e32 v9, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -3452,25 +3577,25 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB8_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4
; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB8_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v8
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v10
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: v_mov_b32_e32 v5, v6
@@ -3485,7 +3610,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB8_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3499,7 +3624,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX7-NEXT: s_cbranch_execnz .LBB8_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -3507,7 +3632,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT: v_not_b32_e32 v9, v4
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -3518,25 +3647,25 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB8_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4
; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB8_4 Depth 2
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
; GFX6-NEXT: s_mov_b64 s[12:13], exec
-; GFX6-NEXT: v_min_f32_e32 v4, v4, v8
+; GFX6-NEXT: v_min_f32_e32 v4, v4, v10
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: v_mov_b32_e32 v5, v6
@@ -3551,7 +3680,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB8_4
; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -3565,7 +3694,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX6-NEXT: s_cbranch_execnz .LBB8_3
; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -3588,15 +3717,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s4, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX12-NEXT: s_and_b32 s4, s4, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v4, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -3610,8 +3743,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX12-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -3619,42 +3752,43 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s4, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v4, s5
; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
-; GFX940-NEXT: s_mov_b32 s6, 0xffffff
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX940-NEXT: s_movk_i32 s7, 0x7fff
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: v_min_f32_e32 v0, v0, v5
; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v2, v2, v0, s7
+; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX940-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -3666,7 +3800,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX940-NEXT: s_cbranch_execnz .LBB9_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
@@ -3674,16 +3808,20 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s4, 0x200
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT: s_and_b32 s4, s4, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v4, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_not_b32 s6, s5
; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -3697,8 +3835,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
@@ -3707,13 +3845,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB9_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
@@ -3721,25 +3859,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s8, 0x200
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: s_and_b32 s8, s8, -4
-; GFX10-NEXT: v_mov_b32_e32 v4, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v4, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_min_f32_e32 v0, v0, v5
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
@@ -3748,39 +3888,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_cbranch_execnz .LBB9_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s8, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v4, s8
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s9
; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b32 s10, 0xffffff
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX90A-NEXT: s_movk_i32 s11, 0x7fff
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_min_f32_e32 v0, v0, v5
; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s11
+; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -3792,34 +3933,35 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s8, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v4, s8
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v4, s9
; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b32 s10, 0xffffff
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
; GFX908-NEXT: s_mov_b64 s[8:9], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX908-NEXT: s_movk_i32 s11, 0x7fff
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_min_f32_e32 v0, v0, v5
; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v2, v2, v0, s11
+; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX908-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
@@ -3832,33 +3974,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX908-NEXT: s_cbranch_execnz .LBB9_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s8, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, v0
@@ -3872,29 +4017,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX8-NEXT: s_cbranch_execnz .LBB9_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s8, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s9
; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_not_b32 s11, s8
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffffff, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -3908,7 +4057,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX7-NEXT: s_cbranch_execnz .LBB9_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -3916,23 +4065,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s8, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s8
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s9
; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: s_not_b32 s11, s8
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_min_f32_e32 v0, v0, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX6-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: v_mov_b32_e32 v2, v0
@@ -3946,7 +4099,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX6-NEXT: s_cbranch_execnz .LBB9_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -3965,15 +4118,19 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s4, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-NEXT: s_and_b32 s4, s4, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v2, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -3987,8 +4144,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX12-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -3996,41 +4153,42 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s4, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v2, s5
; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
-; GFX940-NEXT: s_mov_b32 s6, 0xffffff
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX940-NEXT: s_movk_i32 s7, 0x7fff
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: v_min_f32_e32 v0, v0, v3
; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v4, v4, v0, s7
+; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX940-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4049,16 +4207,20 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s4, 0x200
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: s_and_b32 s4, s4, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v2, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_not_b32 s6, s5
; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -4072,8 +4234,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
@@ -4082,12 +4244,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB10_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
@@ -4095,25 +4257,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s8, 0x200
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: s_and_b32 s8, s8, -4
-; GFX10-NEXT: v_mov_b32_e32 v2, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v2, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_min_f32_e32 v0, v0, v3
; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_and_or_b32 v0, 0xffffff, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
@@ -4122,38 +4286,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s8, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s9
; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b32 s10, 0xffffff
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX90A-NEXT: s_movk_i32 s11, 0x7fff
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3
; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s11
+; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4171,27 +4336,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s8, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v2, s8
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b32 s10, 0xffffff
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
; GFX908-NEXT: s_mov_b64 s[8:9], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX908-NEXT: s_movk_i32 s11, 0x7fff
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_min_f32_e32 v0, v0, v3
; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v4, v4, v0, s11
+; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX908-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: v_mov_b32_e32 v4, v0
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
@@ -4210,26 +4376,29 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s8, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v4, v0
@@ -4249,22 +4418,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s8, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_not_b32 s11, s8
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffffff, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v1
; GFX7-NEXT: v_mov_b32_e32 v4, v0
@@ -4284,23 +4457,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s8, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s9
; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: s_not_b32 s11, s8
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_min_f32_e32 v0, v0, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX6-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v1
; GFX6-NEXT: v_mov_b32_e32 v4, v0
@@ -4331,8 +4508,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-NEXT: v_not_b32_e32 v9, v6
; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -4344,34 +4526,34 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: buffer_load_b32 v6, v7, s[4:7], null offen
+; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB11_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v8
+; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v10
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX12-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
@@ -4388,7 +4570,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4403,15 +4585,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_cbranch_execnz .LBB11_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX940-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v9, 0xffffff
+; GFX940-NEXT: v_and_b32_e32 v9, -4, v4
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0
+; GFX940-NEXT: v_not_b32_e32 v10, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -4423,31 +4609,30 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v7, v8, s[4:7], 0 offen
+; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB11_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5
; GFX940-NEXT: s_movk_i32 s10, 0x7fff
; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB11_4 Depth 2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, 24, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v10
+; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_mov_b64 s[8:9], exec
+; GFX940-NEXT: v_min_f32_e32 v4, v4, v11
; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10
; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: s_mov_b64 s[8:9], exec
; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX940-NEXT: v_and_or_b32 v6, v7, v9, v4
+; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
@@ -4461,7 +4646,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB11_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4475,7 +4660,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX940-NEXT: s_cbranch_execnz .LBB11_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
@@ -4484,8 +4669,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-NEXT: v_not_b32_e32 v9, v6
; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -4497,35 +4687,35 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v7, s[4:7], 0 offen
+; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB11_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB11_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_min_f32_e32 v4, v4, v8
+; GFX11-NEXT: v_min_f32_e32 v4, v4, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX11-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v4, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v6
@@ -4542,7 +4732,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB11_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4559,7 +4749,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
@@ -4568,7 +4758,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX10-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v9, v6
; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -4578,30 +4772,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB11_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_min_f32_e32 v4, v4, v8
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v10
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT: v_and_or_b32 v5, 0xffffff, v6, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX10-NEXT: v_mov_b32_e32 v4, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v6
; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
@@ -4615,7 +4807,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_4
@@ -4632,15 +4824,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX10-NEXT: s_cbranch_execnz .LBB11_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX90A-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, 0xffffff
+; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4
+; GFX90A-NEXT: v_not_b32_e32 v10, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -4652,29 +4848,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v8, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5
; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, 24, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v10
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v4, v4, v11
; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v9, v4
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
@@ -4688,7 +4882,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4702,15 +4896,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX90A-NEXT: s_cbranch_execnz .LBB11_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX908-NEXT: v_and_b32_e32 v7, -4, v4
-; GFX908-NEXT: v_mov_b32_e32 v8, 0xffffff
+; GFX908-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX908-NEXT: v_not_b32_e32 v9, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -4722,29 +4920,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX908-NEXT: s_movk_i32 s14, 0x7fff
; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB11_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, 24, v6
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX908-NEXT: v_min_f32_e32 v4, v4, v9
+; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_min_f32_e32 v4, v4, v10
; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX908-NEXT: v_and_or_b32 v5, v6, v8, v4
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_mov_b32_e32 v5, v6
@@ -4759,7 +4955,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4773,14 +4969,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX908-NEXT: s_cbranch_execnz .LBB11_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
-; GFX8-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX8-NEXT: v_not_b32_e32 v9, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -4792,29 +4993,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB11_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_min_f32_e32 v4, v4, v8
+; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f32_e32 v4, v4, v10
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
@@ -4830,7 +5029,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4844,14 +5043,18 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX8-NEXT: s_cbranch_execnz .LBB11_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX7-NEXT: v_not_b32_e32 v9, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -4862,25 +5065,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB11_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v8
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v10
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
@@ -4896,7 +5099,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4910,7 +5113,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX7-NEXT: s_cbranch_execnz .LBB11_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -4918,7 +5121,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v7, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT: v_not_b32_e32 v9, v4
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -4929,25 +5136,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v7, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB11_4 Depth 2
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX6-NEXT: v_min_f32_e32 v4, v4, v8
+; GFX6-NEXT: v_min_f32_e32 v4, v4, v10
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
@@ -4963,7 +5170,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_4
; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4977,7 +5184,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX6-NEXT: s_cbranch_execnz .LBB11_3
; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll
index 35999842d6d71..cc98b5333c5bb 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll
@@ -196,8 +196,7 @@ define i32 @ptrtoint_offset(ptr addrspace(7) %ptr) {
; CHECK-LABEL: define i32 @ptrtoint_offset
; CHECK-SAME: ({ ptr addrspace(8), i32 } [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
-; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: [[RET:%.*]] = or i32 poison, [[PTR_OFF]]
+; CHECK-NEXT: [[RET:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
; CHECK-NEXT: ret i32 [[RET]]
;
%ret = ptrtoint ptr addrspace(7) %ptr to i32
More information about the llvm-commits
mailing list