[llvm] [GlobalIsel] Combine G_PTR_ADD. (PR #95647)
Thorsten Schütt via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 15 13:29:35 PDT 2024
https://github.com/tschuett updated https://github.com/llvm/llvm-project/pull/95647
>From b4fb3d6c5c154ab80c9c7bf51c91c2eb58e2fd85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Thu, 6 Jun 2024 15:09:55 +0200
Subject: [PATCH 1/3] [GlobalIsel] Combine G_PTR_ADD.
Hints from https://reviews.llvm.org/D109528
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 10 +
.../include/llvm/Target/GlobalISel/Combine.td | 71 ++++-
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 209 ++++++++++++++
.../GlobalISel/CombinerHelperVectorOps.cpp | 49 ++++
.../AArch64/GlobalISel/combine-ptradd.mir | 273 ++++++++++++++++++
.../AArch64/GlobalISel/combine-vscale.mir | 48 +++
6 files changed, 659 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-ptradd.mir
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 43659564d5ace..2bce6ba470f67 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -879,6 +879,16 @@ class CombinerHelper {
bool matchShlOfVScale(const MachineOperand &MO, BuildFnTy &MatchInfo);
+ /// Match ptradd(base, sub(x, C)) where the sub has a single non-debug use
+ /// and rewrite it as ptradd(ptradd(base, x), -C).
+ bool matchPtrAddWithSub(const MachineOperand &MO, BuildFnTy &MatchInfo);
+ /// Match ptradd(base, add(x, C)) where the add has a single non-debug use
+ /// and rewrite it as ptradd(ptradd(base, x), C).
+ bool matchPtrAddWithAdd(const MachineOperand &MO, BuildFnTy &MatchInfo);
+ /// Match ptradd(ptradd(base, C2), C1) and rewrite it as
+ /// ptradd(base, C1 + C2). Only fires when the result has a load/store user
+ /// and the target accepts base + (C1 + C2) as a legal addressing mode for
+ /// that access.
+ bool matchPtrAddsFoldConstants(MachineOperand &MO, BuildFnTy &MatchInfo);
+ /// Match ptradd(ptradd(ptradd(base, C2), x), C1) and rewrite it as
+ /// ptradd(ptradd(base, x), C1 + C2), under the same load/store-user and
+ /// addressing-mode conditions as matchPtrAddsFoldConstants.
+ bool matchPtrAddWFoldDistributedConstants(const MachineOperand &MO,
+ BuildFnTy &MatchInfo);
+ /// Match ptradd(ptradd(base, C), x) where the inner ptradd has a single
+ /// non-debug use and rewrite it as ptradd(ptradd(base, x), C).
+ bool matchPtrAddMoveInner(MachineOperand &MO, BuildFnTy &MatchInfo);
+
+ /// Match ptradd(base, add(x, vscale)) where both the add and the vscale have
+ /// a single non-debug use and rewrite it as ptradd(ptradd(base, x), vscale).
+ bool matchPtrAddWithAddVScale(const MachineOperand &MO, BuildFnTy &MatchInfo);
+ /// Match ptradd(base, sub(x, vscale(N))) where both the sub and the vscale
+ /// have a single non-debug use and rewrite it as
+ /// ptradd(ptradd(base, x), vscale(-N)).
+ bool matchPtrAddWithSubVScale(const MachineOperand &MO, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index bd43b95899030..a35200de105b3 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1675,6 +1675,75 @@ shl_of_vscale,
sub_of_vscale,
]>;
+// fold ptradd(base, sub(left, C)) -> ptradd(ptradd(base, left), -C)
+// The remaining checks and the rewrite live in
+// CombinerHelper::matchPtrAddWithSub.
+def PtrAddWithSub : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_CONSTANT $right, $imm),
+ (G_SUB $offset, $left, $right),
+ (G_PTR_ADD $root, $base, $offset),
+ [{ return Helper.matchPtrAddWithSub(${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
+
+// fold ptradd(base, add(left, C)) -> ptradd(ptradd(base, left), C)
+// The remaining checks and the rewrite live in
+// CombinerHelper::matchPtrAddWithAdd.
+def PtrAddWithAdd : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_CONSTANT $right, $imm),
+ (G_ADD $offset, $left, $right),
+ (G_PTR_ADD $root, $base, $offset),
+ [{ return Helper.matchPtrAddWithAdd(${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
+
+// fold ptradd(ptradd(ptradd(base, C2), nonconst), C1)
+//   -> ptradd(ptradd(base, nonconst), C1 + C2)
+// The two constant offsets are separated by one intermediate G_PTR_ADD.
+// Legality of the merged offset is checked in
+// CombinerHelper::matchPtrAddWFoldDistributedConstants.
+def PtrAdd2DistributedConstOffsets : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_CONSTANT $const2, $imm2),
+ (G_CONSTANT $const1, $imm1),
+ (G_PTR_ADD $pointer1, $base, $const2),
+ (G_PTR_ADD $pointer, $pointer1, $nonconst),
+ (G_PTR_ADD $root, $pointer, $const1),
+ [{ return Helper.matchPtrAddWFoldDistributedConstants(${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
+
+// fold ptradd(ptradd(base, C2), C1) -> ptradd(base, C1 + C2)
+// Legality of the merged offset is checked in
+// CombinerHelper::matchPtrAddsFoldConstants.
+def PtrAdd2ConstOffsets : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_CONSTANT $const2, $imm2),
+ (G_CONSTANT $const1, $imm1),
+ (G_PTR_ADD $pointer, $base, $const2),
+ (G_PTR_ADD $root, $pointer, $const1),
+ [{ return Helper.matchPtrAddsFoldConstants(${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
+
+// fold ptradd(ptradd(base, C), nonconst) -> ptradd(ptradd(base, nonconst), C)
+// NOTE(review): the pattern itself does not constrain $nonconst to be
+// non-constant; any such restriction has to come from
+// CombinerHelper::matchPtrAddMoveInner.
+def PtrAddMoveInner : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_CONSTANT $const, $imm),
+ (G_PTR_ADD $pointer, $base, $const),
+ (G_PTR_ADD $root, $pointer, $nonconst),
+ [{ return Helper.matchPtrAddMoveInner(${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
+
+// fold ptradd(base, add(left, vscale)) -> ptradd(ptradd(base, left), vscale)
+// The remaining checks and the rewrite live in
+// CombinerHelper::matchPtrAddWithAddVScale.
+def PtrAddWithAddVScale : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_VSCALE $right, $imm),
+ (G_ADD $offset, $left, $right),
+ (G_PTR_ADD $root, $base, $offset),
+ [{ return Helper.matchPtrAddWithAddVScale(${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
+
+// fold ptradd(base, sub(left, vscale(N)))
+//   -> ptradd(ptradd(base, left), vscale(-N))
+// NOTE(review): the rule is named PtrSub... while the helper and the sibling
+// rule use the PtrAdd... prefix; the root is a G_PTR_ADD. Consider renaming
+// for consistency.
+def PtrSubWithSubVScale : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_VSCALE $right, $imm),
+ (G_SUB $offset, $left, $right),
+ (G_PTR_ADD $root, $base, $offset),
+ [{ return Helper.matchPtrAddWithSubVScale(${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
+
+
+// Group of the G_PTR_ADD rules: offset add/sub splitting, constant-offset
+// folding, moving a constant offset outwards, and the G_VSCALE variants.
+def ptradd_combines: GICombineGroup<[
+PtrAddWithSub,
+PtrAddWithAdd,
+PtrAdd2ConstOffsets,
+PtrAdd2DistributedConstOffsets,
+PtrAddMoveInner,
+PtrAddWithAddVScale,
+PtrSubWithSubVScale
+]>;
// fold ((0-A) + B) -> B-A
def ZeroMinusAPlusB : GICombineRule<
@@ -1818,7 +1887,7 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop,
def prefer_sign_combines : GICombineGroup<[nneg_zext]>;
def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
- vector_ops_combines, freeze_combines,
+ vector_ops_combines, freeze_combines, ptradd_combines,
insert_vec_elt_combines, extract_vec_elt_combines, combines_for_extload,
combine_extracted_vector_load,
undef_combines, identity_combines, phi_combines,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 31030accd43f7..7c195afcfb6f7 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -7437,3 +7437,212 @@ bool CombinerHelper::matchNonNegZext(const MachineOperand &MO,
return false;
}
+
+/// Rewrite ptradd(base, sub(x, C)) as ptradd(ptradd(base, x), -C).
+///
+/// \p MO is the def operand of the root G_PTR_ADD. On success, \p MatchInfo
+/// receives the callback that builds the replacement instructions.
+/// \returns true if the rewrite is applicable.
+bool CombinerHelper::matchPtrAddWithSub(const MachineOperand &MO,
+                                        BuildFnTy &MatchInfo) {
+  const Register Dst = MO.getReg();
+  GPtrAdd *Root = cast<GPtrAdd>(MRI.getVRegDef(Dst));
+  const Register OffsetReg = Root->getOffsetReg();
+  GSub *OffsetSub = cast<GSub>(MRI.getVRegDef(OffsetReg));
+
+  // Only rewrite when the root is the sole non-debug user of the G_SUB.
+  if (!MRI.hasOneNonDBGUse(OffsetSub->getReg(0)))
+    return false;
+
+  // The MIR pattern already requires the RHS to be a G_CONSTANT, so this
+  // lookup is expected to succeed.
+  std::optional<APInt> SubImm =
+      getIConstantVRegVal(OffsetSub->getRHSReg(), MRI);
+  if (!SubImm)
+    return false;
+
+  // The rewrite materializes a new constant of the offset type.
+  const LLT OffsetTy = MRI.getType(OffsetReg);
+  if (!isConstantLegalOrBeforeLegalizer(OffsetTy))
+    return false;
+
+  const LLT PtrTy = MRI.getType(Dst);
+  const Register BaseReg = Root->getBaseReg();
+  const Register LHSReg = OffsetSub->getLHSReg();
+  // sub(x, c) -> add(x, -c)
+  const APInt NegatedImm = -(*SubImm);
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto NegatedCst = B.buildConstant(OffsetTy, NegatedImm);
+    auto BasePlusLHS = B.buildPtrAdd(PtrTy, BaseReg, LHSReg);
+    B.buildPtrAdd(Dst, BasePlusLHS, NegatedCst);
+  };
+  return true;
+}
+
+/// Rewrite ptradd(base, add(x, C)) as ptradd(ptradd(base, x), C).
+///
+/// \p MO is the def operand of the root G_PTR_ADD. On success, \p MatchInfo
+/// receives the callback that builds the replacement instructions.
+/// \returns true if the rewrite is applicable.
+bool CombinerHelper::matchPtrAddWithAdd(const MachineOperand &MO,
+                                        BuildFnTy &MatchInfo) {
+  const Register Dst = MO.getReg();
+  GPtrAdd *Root = cast<GPtrAdd>(MRI.getVRegDef(Dst));
+  GAdd *OffsetAdd = cast<GAdd>(MRI.getVRegDef(Root->getOffsetReg()));
+
+  // Only rewrite when the root is the sole non-debug user of the G_ADD.
+  if (!MRI.hasOneNonDBGUse(OffsetAdd->getReg(0)))
+    return false;
+
+  const LLT PtrTy = MRI.getType(Dst);
+  const Register BaseReg = Root->getBaseReg();
+  const Register LHSReg = OffsetAdd->getLHSReg();
+  const Register RHSReg = OffsetAdd->getRHSReg();
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto BasePlusLHS = B.buildPtrAdd(PtrTy, BaseReg, LHSReg);
+    B.buildPtrAdd(Dst, BasePlusLHS, RHSReg);
+  };
+  return true;
+}
+
+/// Rewrite ptradd(ptradd(base, C2), C1) as ptradd(base, C1 + C2).
+///
+/// The fold is only done when the result feeds a load/store and the target
+/// reports base + (C1 + C2) as a legal addressing mode for that access.
+///
+/// \p MO is the def operand of the root G_PTR_ADD. On success, \p MatchInfo
+/// receives the callback that builds the replacement instructions.
+/// \returns true if the rewrite is applicable.
+bool CombinerHelper::matchPtrAddsFoldConstants(MachineOperand &MO,
+                                               BuildFnTy &MatchInfo) {
+  const Register Dst = MO.getReg();
+  GPtrAdd *Outer = cast<GPtrAdd>(MRI.getVRegDef(Dst));
+  GPtrAdd *Nested = cast<GPtrAdd>(MRI.getVRegDef(Outer->getBaseReg()));
+
+  // Only rewrite when the root is the sole non-debug user of the nested
+  // G_PTR_ADD.
+  if (!MRI.hasOneNonDBGUse(Nested->getReg(0)))
+    return false;
+
+  // Both lookups are expected to succeed: the MIR pattern requires both
+  // offsets to be G_CONSTANTs.
+  std::optional<APInt> OuterImm =
+      getIConstantVRegVal(Outer->getOffsetReg(), MRI);
+  if (!OuterImm)
+    return false;
+
+  std::optional<APInt> NestedImm =
+      getIConstantVRegVal(Nested->getOffsetReg(), MRI);
+  if (!NestedImm)
+    return false;
+
+  // To ask the target whether the merged offset forms a legal addressing
+  // mode we need the access type, which comes from the first load/store
+  // user of the root pointer. The memory access cannot be part of the MIR
+  // pattern.
+  auto &MF = *MO.getParent()->getMF();
+  Type *AccessTy = nullptr;
+  for (MachineInstr &User : MRI.use_nodbg_instructions(Dst)) {
+    auto *MemOp = dyn_cast<GLoadStore>(&User);
+    if (!MemOp)
+      continue;
+    AccessTy = getTypeForLLT(MemOp->getMMO().getMemoryType(),
+                             MF.getFunction().getContext());
+    break;
+  }
+
+  // Did we find a memory access?
+  if (!AccessTy)
+    return false;
+
+  const int64_t CombinedOffs = (*OuterImm + *NestedImm).getSExtValue();
+
+  TargetLoweringBase::AddrMode AM;
+  AM.HasBaseReg = true;
+  AM.BaseOffs = CombinedOffs;
+
+  const unsigned AddrSpace = MRI.getType(Dst).getAddressSpace();
+  const auto &TLI = getTargetLowering();
+
+  // Can the two offsets be merged into one addressing mode?
+  if (!TLI.isLegalAddressingMode(MF.getDataLayout(), AM, AccessTy, AddrSpace))
+    return false;
+
+  // The rewrite materializes a new constant of the offset type.
+  const LLT OffsetTy = MRI.getType(Nested->getOffsetReg());
+  if (!isConstantLegalOrBeforeLegalizer(OffsetTy))
+    return false;
+
+  const Register BaseReg = Nested->getBaseReg();
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto MergedCst = B.buildConstant(OffsetTy, CombinedOffs);
+    B.buildPtrAdd(Dst, BaseReg, MergedCst);
+  };
+  return true;
+}
+
+/// Rewrite ptradd(ptradd(ptradd(base, C2), x), C1) as
+/// ptradd(ptradd(base, x), C1 + C2).
+///
+/// The fold is only done when the result feeds a load/store and the target
+/// reports base + (C1 + C2) as a legal addressing mode for that access.
+///
+/// \p MO is the def operand of the root G_PTR_ADD. On success, \p MatchInfo
+/// receives the callback that builds the replacement instructions.
+/// \returns true if the rewrite is applicable.
+bool CombinerHelper::matchPtrAddWFoldDistributedConstants(
+    const MachineOperand &MO, BuildFnTy &MatchInfo) {
+  const Register Dst = MO.getReg();
+  GPtrAdd *Outer = cast<GPtrAdd>(MRI.getVRegDef(Dst));
+  GPtrAdd *Middle = cast<GPtrAdd>(MRI.getVRegDef(Outer->getBaseReg()));
+  GPtrAdd *Innermost = cast<GPtrAdd>(MRI.getVRegDef(Middle->getBaseReg()));
+
+  // Both intermediate pointers must have exactly one non-debug user.
+  if (!(MRI.hasOneNonDBGUse(Middle->getReg(0)) &&
+        MRI.hasOneNonDBGUse(Innermost->getReg(0))))
+    return false;
+
+  // Both lookups are expected to succeed: the MIR pattern requires these two
+  // offsets to be G_CONSTANTs.
+  std::optional<APInt> OuterImm =
+      getIConstantVRegVal(Outer->getOffsetReg(), MRI);
+  if (!OuterImm)
+    return false;
+
+  std::optional<APInt> InnermostImm =
+      getIConstantVRegVal(Innermost->getOffsetReg(), MRI);
+  if (!InnermostImm)
+    return false;
+
+  // To ask the target whether the merged offset forms a legal addressing
+  // mode we need the access type, which comes from the first load/store
+  // user of the root pointer. The memory access cannot be part of the MIR
+  // pattern.
+  auto &MF = *MO.getParent()->getMF();
+  Type *AccessTy = nullptr;
+  for (MachineInstr &User : MRI.use_nodbg_instructions(Dst)) {
+    auto *MemOp = dyn_cast<GLoadStore>(&User);
+    if (!MemOp)
+      continue;
+    AccessTy = getTypeForLLT(MemOp->getMMO().getMemoryType(),
+                             MF.getFunction().getContext());
+    break;
+  }
+
+  // Did we find a memory access?
+  if (!AccessTy)
+    return false;
+
+  const int64_t CombinedOffs = (*OuterImm + *InnermostImm).getSExtValue();
+
+  TargetLoweringBase::AddrMode AM;
+  AM.HasBaseReg = true;
+  AM.BaseOffs = CombinedOffs;
+
+  const LLT PtrTy = MRI.getType(Dst);
+  const unsigned AddrSpace = PtrTy.getAddressSpace();
+  const auto &TLI = getTargetLowering();
+
+  // Can the two offsets be merged into one addressing mode?
+  if (!TLI.isLegalAddressingMode(MF.getDataLayout(), AM, AccessTy, AddrSpace))
+    return false;
+
+  // The rewrite materializes a new constant of the offset type.
+  const LLT OffsetTy = MRI.getType(Innermost->getOffsetReg());
+  if (!isConstantLegalOrBeforeLegalizer(OffsetTy))
+    return false;
+
+  const Register BaseReg = Innermost->getBaseReg();
+  const Register VarOffsetReg = Middle->getOffsetReg();
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto MergedCst = B.buildConstant(OffsetTy, CombinedOffs);
+    auto BasePlusVar = B.buildPtrAdd(PtrTy, BaseReg, VarOffsetReg);
+    B.buildPtrAdd(Dst, BasePlusVar, MergedCst);
+  };
+  return true;
+}
+
+/// Rewrite ptradd(ptradd(base, C), x) as ptradd(ptradd(base, x), C), i.e.
+/// move the constant offset to the outermost G_PTR_ADD.
+///
+/// \p MO is the def operand of the root G_PTR_ADD. On success, \p MatchInfo
+/// receives the callback that builds the replacement instructions.
+/// \returns true if the rewrite is applicable.
+bool CombinerHelper::matchPtrAddMoveInner(MachineOperand &MO,
+                                          BuildFnTy &MatchInfo) {
+  GPtrAdd *Inner = cast<GPtrAdd>(MRI.getVRegDef(MO.getReg()));
+  GPtrAdd *Second = cast<GPtrAdd>(MRI.getVRegDef(Inner->getBaseReg()));
+
+  // Only rewrite when the root is the sole non-debug user of the nested
+  // G_PTR_ADD.
+  if (!MRI.hasOneNonDBGUse(Second->getReg(0)))
+    return false;
+
+  // The MIR pattern does not constrain the root's offset. If it is itself an
+  // integer constant, the rewrite would yield ptradd(ptradd(base, C'), C),
+  // which this very rule matches again: the two constants would be swapped
+  // back and forth without ever reaching a fixed point. Leave
+  // constant/constant chains to the constant-folding rules.
+  if (getIConstantVRegVal(Inner->getOffsetReg(), MRI))
+    return false;
+
+  Register Dst = MO.getReg();
+  LLT DstTy = MRI.getType(Dst);
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto PtrAdd =
+        B.buildPtrAdd(DstTy, Second->getBaseReg(), Inner->getOffsetReg());
+    B.buildPtrAdd(Dst, PtrAdd, Second->getOffsetReg());
+  };
+
+  return true;
+}
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
index 66b1c5f8ca82c..4c962342e631a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
@@ -484,3 +484,52 @@ bool CombinerHelper::matchShlOfVScale(const MachineOperand &MO,
return true;
}
+
+/// Rewrite ptradd(base, add(x, vscale)) as ptradd(ptradd(base, x), vscale).
+///
+/// \p MO is the def operand of the root G_PTR_ADD. On success, \p MatchInfo
+/// receives the callback that builds the replacement instructions.
+/// \returns true if the rewrite is applicable.
+bool CombinerHelper::matchPtrAddWithAddVScale(const MachineOperand &MO,
+                                              BuildFnTy &MatchInfo) {
+  const Register Dst = MO.getReg();
+  GPtrAdd *Root = cast<GPtrAdd>(MRI.getVRegDef(Dst));
+  GAdd *OffsetAdd = cast<GAdd>(MRI.getVRegDef(Root->getOffsetReg()));
+  GVScale *VScaleMI = cast<GVScale>(MRI.getVRegDef(OffsetAdd->getRHSReg()));
+
+  // Both the G_ADD and the G_VSCALE must have exactly one non-debug user.
+  if (!(MRI.hasOneNonDBGUse(OffsetAdd->getReg(0)) &&
+        MRI.hasOneNonDBGUse(VScaleMI->getReg(0))))
+    return false;
+
+  const LLT PtrTy = MRI.getType(Dst);
+  const Register BaseReg = Root->getBaseReg();
+  const Register LHSReg = OffsetAdd->getLHSReg();
+  const Register RHSReg = OffsetAdd->getRHSReg();
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto BasePlusLHS = B.buildPtrAdd(PtrTy, BaseReg, LHSReg);
+    B.buildPtrAdd(Dst, BasePlusLHS, RHSReg);
+  };
+  return true;
+}
+
+/// Rewrite ptradd(base, sub(x, vscale(N))) as
+/// ptradd(ptradd(base, x), vscale(-N)).
+///
+/// \p MO is the def operand of the root G_PTR_ADD. On success, \p MatchInfo
+/// receives the callback that builds the replacement instructions.
+/// \returns true if the rewrite is applicable.
+bool CombinerHelper::matchPtrAddWithSubVScale(const MachineOperand &MO,
+                                              BuildFnTy &MatchInfo) {
+  const Register Dst = MO.getReg();
+  GPtrAdd *Root = cast<GPtrAdd>(MRI.getVRegDef(Dst));
+  const Register OffsetReg = Root->getOffsetReg();
+  GSub *OffsetSub = cast<GSub>(MRI.getVRegDef(OffsetReg));
+  GVScale *OldVScale = cast<GVScale>(MRI.getVRegDef(OffsetSub->getRHSReg()));
+
+  // Both the G_SUB and the G_VSCALE must have exactly one non-debug user.
+  if (!(MRI.hasOneNonDBGUse(OffsetSub->getReg(0)) &&
+        MRI.hasOneNonDBGUse(OldVScale->getReg(0))))
+    return false;
+
+  // The rewrite emits a fresh G_VSCALE of the offset type.
+  const LLT OffsetTy = MRI.getType(OffsetReg);
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_VSCALE, OffsetTy}))
+    return false;
+
+  const LLT PtrTy = MRI.getType(Dst);
+  const Register BaseReg = Root->getBaseReg();
+  const Register LHSReg = OffsetSub->getLHSReg();
+  // sub(x, vscale(N)) -> add(x, vscale(-N))
+  const APInt NegatedScale = -OldVScale->getSrc();
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto NegVScale = B.buildVScale(OffsetTy, NegatedScale);
+    auto BasePlusLHS = B.buildPtrAdd(PtrTy, BaseReg, LHSReg);
+    B.buildPtrAdd(Dst, BasePlusLHS, NegVScale);
+  };
+  return true;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-ptradd.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-ptradd.mir
new file mode 100644
index 0000000000000..31e975f66d07b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-ptradd.mir
@@ -0,0 +1,273 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s
+
+---
+name: ptradd_with_sub
+body: |
+ bb.1:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: ptradd_with_sub
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1600
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD]], [[C]](s64)
+ ; CHECK-NEXT: $x0 = COPY [[PTR_ADD1]](p0)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(p0) = COPY $x0
+ %1:_(s64) = COPY $x0
+ %2:_(s64) = G_CONSTANT i64 1600
+ %10:_(s64) = G_SUB %1, %2(s64)
+ %11:_(p0) = G_PTR_ADD %0, %10(s64)
+ $x0 = COPY %11(p0)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: ptradd_with_sub_multiple_use
+body: |
+ bb.1:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: ptradd_with_sub_multiple_use
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1600
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY1]], [[C]]
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[SUB]](s64)
+ ; CHECK-NEXT: $x0 = COPY [[PTR_ADD]](p0)
+ ; CHECK-NEXT: $x1 = COPY [[SUB]](s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(p0) = COPY $x0
+ %1:_(s64) = COPY $x0
+ %2:_(s64) = G_CONSTANT i64 1600
+ %10:_(s64) = G_SUB %1, %2(s64)
+ %11:_(p0) = G_PTR_ADD %0, %10(s64)
+ $x0 = COPY %11(p0)
+ $x1 = COPY %10(s64)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: ptradd_with_add
+body: |
+ bb.1:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: ptradd_with_add
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1600
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD]], [[C]](s64)
+ ; CHECK-NEXT: $x0 = COPY [[PTR_ADD1]](p0)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(p0) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = G_CONSTANT i64 1600
+ %10:_(s64) = G_ADD %1, %2(s64)
+ %11:_(p0) = G_PTR_ADD %0, %10(s64)
+ $x0 = COPY %11(p0)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: load_with_two_constants
+alignment: 4
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0' }
+body: |
+ bb.1:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: load_with_two_constants
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 17
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nsw G_PTR_ADD [[COPY]], [[C]](s64)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64))
+ ; CHECK-NEXT: $x0 = COPY [[LOAD]](s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(p0) = COPY $x0
+ %1:_(s64) = COPY $x0
+ %3:_(s64) = G_CONSTANT i64 16
+ %4:_(s64) = G_CONSTANT i64 1
+ %13:_(p0) = G_PTR_ADD %0, %3(s64)
+ %12:_(p0) = nsw G_PTR_ADD %13, %4(s64)
+ %14:_(s64) = G_LOAD %12(p0) :: (load (s64))
+ $x0 = COPY %14(s64)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: load_with_two_constants_large
+body: |
+ bb.1:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: load_with_two_constants_large
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3000
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nsw G_PTR_ADD [[COPY]], [[C]](s64)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64))
+ ; CHECK-NEXT: $x0 = COPY [[LOAD]](s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(p0) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %3:_(s64) = G_CONSTANT i64 1000
+ %4:_(s64) = G_CONSTANT i64 2000
+ %13:_(p0) = G_PTR_ADD %0, %3(s64)
+ %12:_(p0) = nsw G_PTR_ADD %13, %4(s64)
+ %14:_(s64) = G_LOAD %12(p0) :: (load (s64))
+ $x0 = COPY %14(s64)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: load_with_two_distributed_constants
+body: |
+ bb.1:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: load_with_two_distributed_constants
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 17
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nsw G_PTR_ADD [[PTR_ADD]], [[C]](s64)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p0) :: (load (s64))
+ ; CHECK-NEXT: $x0 = COPY [[LOAD]](s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(p0) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %3:_(s64) = G_CONSTANT i64 16
+ %4:_(s64) = G_CONSTANT i64 1
+ %14:_(p0) = G_PTR_ADD %0, %3(s64)
+ %13:_(p0) = G_PTR_ADD %14, %1(s64)
+ %12:_(p0) = nsw G_PTR_ADD %13, %4(s64)
+ %15:_(s64) = G_LOAD %12(p0) :: (load (s64))
+ $x0 = COPY %15(s64)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: move_inner
+body: |
+ bb.1:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: move_inner
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nsw G_PTR_ADD [[PTR_ADD]], [[C]](s64)
+ ; CHECK-NEXT: $x0 = COPY [[PTR_ADD1]](p0)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(p0) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %3:_(s64) = G_CONSTANT i64 16
+ %13:_(p0) = G_PTR_ADD %0, %3(s64)
+ %12:_(p0) = nsw G_PTR_ADD %13, %1(s64)
+ $x0 = COPY %12(p0)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: move_inner_rotate
+body: |
+ bb.1:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: move_inner_rotate
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+ ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s64) = G_CONSTANT_FOLD_BARRIER [[C1]]
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[CONSTANT_FOLD_BARRIER]](s64)
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nsw G_PTR_ADD [[PTR_ADD]], [[C]](s64)
+ ; CHECK-NEXT: $x0 = COPY [[PTR_ADD1]](p0)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(p0) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %3:_(s64) = G_CONSTANT i64 16
+ %4:_(s64) = G_CONSTANT i64 32
+ %5:_(s64) = G_CONSTANT_FOLD_BARRIER %4
+ %13:_(p0) = G_PTR_ADD %0, %3(s64)
+ %12:_(p0) = nsw G_PTR_ADD %13, %5(s64)
+ $x0 = COPY %12(p0)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: load_with_three_constants
+body: |
+ bb.1:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: load_with_three_constants
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 60
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nsw G_PTR_ADD [[COPY]], [[C]](s64)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64))
+ ; CHECK-NEXT: $x0 = COPY [[LOAD]](s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(p0) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = G_CONSTANT i64 10
+ %3:_(s64) = G_CONSTANT i64 20
+ %4:_(s64) = G_CONSTANT i64 30
+ %12:_(p0) = G_PTR_ADD %0, %2(s64)
+ %13:_(p0) = G_PTR_ADD %12, %3(s64)
+ %14:_(p0) = nsw G_PTR_ADD %13, %4(s64)
+ %15:_(s64) = G_LOAD %14(p0) :: (load (s64))
+ $x0 = COPY %15(s64)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: load_with_two_constants_and_sub
+body: |
+ bb.1:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: load_with_two_constants_and_sub
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 20
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nsw G_PTR_ADD [[PTR_ADD]], [[C]](s64)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p0) :: (load (s64))
+ ; CHECK-NEXT: $x0 = COPY [[LOAD]](s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(p0) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = G_CONSTANT i64 10
+ %4:_(s64) = G_CONSTANT i64 30
+ %12:_(s64) = G_SUB %1, %2(s64)
+ %13:_(p0) = G_PTR_ADD %0, %12(s64)
+ %14:_(p0) = nsw G_PTR_ADD %13, %4(s64)
+ %15:_(s64) = G_LOAD %14(p0) :: (load (s64))
+ $x0 = COPY %15(s64)
+ RET_ReallyLR implicit $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-vscale.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-vscale.mir
index 9b7a44954afdb..6ae93d6dbea5e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-vscale.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-vscale.mir
@@ -111,3 +111,51 @@ body: |
%shl:_(s64) = nuw G_SHL %lhs(s64), %rhs(s64)
$x0 = COPY %shl(s64)
RET_ReallyLR implicit $x0
+...
+---
+name: ptradd_add_vscale
+body: |
+ bb.1:
+ liveins: $x0, $x1
+ ; CHECK-LABEL: name: ptradd_add_vscale
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %p:_(p0) = COPY $x0
+ ; CHECK-NEXT: %opaque:_(s64) = COPY $x0
+ ; CHECK-NEXT: %vs:_(s64) = G_VSCALE i64 11
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD %p, %opaque(s64)
+ ; CHECK-NEXT: %ptradd:_(p0) = G_PTR_ADD [[PTR_ADD]], %vs(s64)
+ ; CHECK-NEXT: $x0 = COPY %ptradd(p0)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %p:_(p0) = COPY $x0
+ %opaque:_(s64) = COPY $x0
+ %cons:_(s64) = G_CONSTANT i64 2
+ %vs:_(s64) = G_VSCALE i64 11
+ %rhs:_(s64) = G_ADD %opaque(s64), %vs(s64)
+ %ptradd:_(p0) = nuw G_PTR_ADD %p(p0), %rhs(s64)
+ $x0 = COPY %ptradd(p0)
+ RET_ReallyLR implicit $x0
+...
+---
+name: ptradd_sub_vscale
+body: |
+ bb.1:
+ liveins: $x0, $x1
+ ; CHECK-LABEL: name: ptradd_sub_vscale
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %p:_(p0) = COPY $x0
+ ; CHECK-NEXT: %opaque:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[VSCALE:%[0-9]+]]:_(s64) = G_VSCALE i64 -11
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD %p, %opaque(s64)
+ ; CHECK-NEXT: %ptradd:_(p0) = G_PTR_ADD [[PTR_ADD]], [[VSCALE]](s64)
+ ; CHECK-NEXT: $x0 = COPY %ptradd(p0)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %p:_(p0) = COPY $x0
+ %opaque:_(s64) = COPY $x0
+ %cons:_(s64) = G_CONSTANT i64 2
+ %vs:_(s64) = G_VSCALE i64 11
+ %rhs:_(s64) = G_SUB %opaque(s64), %vs(s64)
+ %ptradd:_(p0) = nuw G_PTR_ADD %p(p0), %rhs(s64)
+ $x0 = COPY %ptradd(p0)
+ RET_ReallyLR implicit $x0
>From 51a56afd94376de9005624923cb2114de0205354 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Sat, 15 Jun 2024 20:37:48 +0200
Subject: [PATCH 2/3] fix AMDGPU
---
.../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 874 +---------------
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 972 +-----------------
2 files changed, 88 insertions(+), 1758 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index cdfb71b9bf6b6..bc2a23966c9ca 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -10293,49 +10293,19 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB42_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB42_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16:
@@ -10510,49 +10480,19 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB43_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB43_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -10730,56 +10670,19 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB44_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
-; GFX940-NEXT: flat_load_dword v0, v[0:1]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: v_pk_add_f16 v0, v1, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:63488 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB44_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg:
@@ -10968,46 +10871,19 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB45_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB45_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16:
@@ -11174,46 +11050,19 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB46_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB46_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -11387,52 +11236,19 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB47_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dword v5, v[4:5]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:63488
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB47_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg:
@@ -11618,49 +11434,19 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB48_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB48_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -11840,46 +11626,19 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB49_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB49_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -12059,88 +11818,19 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB50_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1]
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB50_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
@@ -12403,88 +12093,19 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB51_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB51_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
@@ -12750,95 +12371,19 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB52_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
-; GFX940-NEXT: flat_load_dword v0, v[0:1]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX940-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:63488 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB52_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
@@ -13115,85 +12660,19 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB53_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1]
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB53_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
@@ -13448,85 +12927,19 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB54_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB54_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
@@ -13788,91 +13201,19 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB55_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dword v3, v[4:5]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:63488
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB55_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
@@ -14146,88 +13487,19 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB56_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB56_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos:
@@ -14495,85 +13767,19 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB57_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB57_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 5f60c8ac2b3b7..77c8e034e68a0 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -11207,49 +11207,19 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB42_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB42_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16:
@@ -11305,23 +11275,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB42_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16:
@@ -11481,49 +11437,19 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB43_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB43_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -11579,23 +11505,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspa
; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB43_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -11757,49 +11669,19 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr addrspa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB44_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB44_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
@@ -11855,23 +11737,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr addrspa
; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
@@ -12037,46 +11905,19 @@ define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB45_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB45_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16:
@@ -12129,22 +11970,9 @@ define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB45_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16:
@@ -12298,46 +12126,19 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB46_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB46_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -12390,22 +12191,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB46_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -12562,46 +12350,19 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB47_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB47_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
@@ -12654,22 +12415,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
@@ -12834,49 +12582,19 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB48_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB48_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -12932,25 +12650,11 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp
; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -13112,46 +12816,19 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB49_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB49_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -13204,24 +12881,11 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -13382,88 +13046,19 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB50_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0
+; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB50_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16:
@@ -13779,88 +13374,19 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB51_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB51_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
@@ -14178,88 +13704,19 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB52_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB52_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
@@ -14581,85 +14038,19 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB53_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB53_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16:
@@ -14965,85 +14356,19 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB54_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB54_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
@@ -15352,85 +14677,19 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB55_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB55_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
@@ -15747,88 +15006,19 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB56_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB56_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
@@ -16148,85 +15338,19 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB57_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s4, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX940-NEXT: s_mov_b32 s5, 0x7060302
-; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB57_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
>From 11a6409f49bc1722db61a11551904f602f567bd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Sat, 15 Jun 2024 21:31:51 +0200
Subject: [PATCH 3/3] next AMDGPU fix
---
.../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 874 +++++++++++++++-
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 972 +++++++++++++++++-
2 files changed, 1758 insertions(+), 88 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index bc2a23966c9ca..cdfb71b9bf6b6 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -10293,19 +10293,49 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB42_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16:
@@ -10480,19 +10510,49 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB43_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -10670,19 +10730,56 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v4, v0
+; GFX940-NEXT: v_mov_b32_e32 v5, v1
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
+; GFX940-NEXT: s_movk_i32 s0, 0xf800
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
+; GFX940-NEXT: flat_load_dword v0, v[0:1]
+; GFX940-NEXT: s_mov_b32 s1, -1
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
+; GFX940-NEXT: v_pk_add_f16 v0, v1, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:63488 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB44_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg:
@@ -10871,19 +10968,46 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
-; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB45_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16:
@@ -11050,19 +11174,46 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
-; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB46_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB46_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -11236,19 +11387,52 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048
-; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB47_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_movk_i32 s0, 0xf800
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX940-NEXT: flat_load_dword v5, v[4:5]
+; GFX940-NEXT: s_mov_b32 s1, -1
+; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:63488
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB47_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg:
@@ -11434,19 +11618,49 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB48_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB48_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -11626,19 +11840,46 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
-; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB49_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB49_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -11818,19 +12059,88 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB50_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB50_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
@@ -12093,19 +12403,88 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB51_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB51_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
@@ -12371,19 +12750,95 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB52_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v4, v0
+; GFX940-NEXT: v_mov_b32_e32 v5, v1
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
+; GFX940-NEXT: s_movk_i32 s0, 0xf800
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
+; GFX940-NEXT: flat_load_dword v0, v[0:1]
+; GFX940-NEXT: s_mov_b32 s1, -1
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX940-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:63488 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB52_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
@@ -12660,19 +13115,85 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
-; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB53_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB53_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
@@ -12927,19 +13448,85 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044
-; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB54_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
@@ -13201,19 +13788,91 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048
-; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_movk_i32 s0, 0xf800
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX940-NEXT: flat_load_dword v3, v[4:5]
+; GFX940-NEXT: s_mov_b32 s1, -1
+; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:63488
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB55_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
@@ -13487,19 +14146,88 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB56_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos:
@@ -13767,19 +14495,85 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044
-; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB57_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 77c8e034e68a0..5f60c8ac2b3b7 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -11207,19 +11207,49 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB42_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16:
@@ -11275,9 +11305,23 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB42_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16:
@@ -11437,19 +11481,49 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB43_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -11505,9 +11579,23 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspa
; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB43_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -11669,19 +11757,49 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr addrspa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB44_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
@@ -11737,9 +11855,23 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr addrspa
; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 glc
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
@@ -11905,19 +12037,46 @@ define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB45_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16:
@@ -11970,9 +12129,22 @@ define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB45_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16:
@@ -12126,19 +12298,46 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB46_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB46_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -12191,9 +12390,22 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB46_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -12350,19 +12562,46 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB47_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB47_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
@@ -12415,9 +12654,22 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
@@ -12582,19 +12834,49 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB48_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB48_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -12650,11 +12932,25 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp
; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -12816,19 +13112,46 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_pk_add_f16 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB49_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 sc1
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB49_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -12881,11 +13204,24 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -13046,19 +13382,88 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB50_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB50_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16:
@@ -13374,19 +13779,88 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB51_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB51_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
@@ -13704,19 +14178,88 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB52_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB52_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
@@ -14038,19 +14581,85 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB53_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB53_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16:
@@ -14356,19 +14965,85 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB54_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
@@ -14677,19 +15352,85 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB55_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
@@ -15006,19 +15747,88 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB56_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
@@ -15338,19 +16148,85 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT: s_mov_b32 s5, 0x7060302
+; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB57_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
More information about the llvm-commits
mailing list