[llvm-branch-commits] [llvm] AMDGPU: Custom expand flat cmpxchg which may access private (PR #109410)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Sep 20 05:00:29 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
64-bit flat cmpxchg instructions do not work correctly for scratch
addresses, so a flat cmpxchg that may access private memory needs a
non-atomic expansion for the private case.
Allow custom expansion of cmpxchg in AtomicExpand, as is
already the case for atomicrmw.
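
For readers skimming the diff below, here is a minimal hand-written sketch of the IR shape this expansion is aiming for on a 64-bit flat cmpxchg that may alias private memory. It is not copied from the patch's tests; the function name, ordering, and alignment are illustrative, and the block names only mirror the `atomicrmw.*` labels visible in the updated test output. The private path uses the plain load/compare/select/store sequence from `buildAtomicCmpXchgValue`, while the global path keeps the flat atomic; a phi joins the `{ i64, i1 }` results.

```llvm
; Hypothetical sketch of the expanded form (names and orderings are illustrative).
declare i1 @llvm.amdgcn.is.private(ptr)

define { i64, i1 } @cmpxchg_flat_may_alias_private(ptr %ptr, i64 %cmp, i64 %new) {
entry:
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:                    ; non-atomic lowering through a scratch pointer
  %scratch.ptr = addrspacecast ptr %ptr to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %scratch.ptr, align 8
  %is.equal = icmp eq i64 %loaded.private, %cmp
  %sel = select i1 %is.equal, i64 %new, i64 %loaded.private
  store i64 %sel, ptr addrspace(5) %scratch.ptr, align 8
  %priv.0 = insertvalue { i64, i1 } poison, i64 %loaded.private, 0
  %priv.res = insertvalue { i64, i1 } %priv.0, i1 %is.equal, 1
  br label %atomicrmw.phi

atomicrmw.global:                     ; known not private, keep the flat atomic
  %flat.res = cmpxchg ptr %ptr, i64 %cmp, i64 %new seq_cst seq_cst
  br label %atomicrmw.phi

atomicrmw.phi:
  %res = phi { i64, i1 } [ %priv.res, %atomicrmw.private ], [ %flat.res, %atomicrmw.global ]
  ret { i64, i1 } %res
}
```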
---
Patch is 90.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109410.diff
10 Files Affected:
- (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+5)
- (modified) llvm/include/llvm/Transforms/Utils/LowerAtomic.h (+7)
- (modified) llvm/lib/CodeGen/AtomicExpandPass.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+95-51)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+3)
- (modified) llvm/lib/Transforms/Utils/LowerAtomic.cpp (+15-6)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+932-95)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll (+3-3)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll (+3-3)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-cmpxchg-flat-maybe-private.ll (+94-10)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 3842af56e6b3d7..678b169568afcf 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2204,6 +2204,11 @@ class TargetLoweringBase {
"Generic atomicrmw expansion unimplemented on this target");
}
+ /// Perform a cmpxchg expansion using a target-specific method.
+ virtual void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+ llvm_unreachable("Generic cmpxchg expansion unimplemented on this target");
+ }
+
/// Perform a bit test atomicrmw using a target-specific intrinsic. This
/// represents the combined bit test intrinsic which will be lowered at a late
/// stage by the backend.
diff --git a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h
index b25b281667f9cb..295c2bd2b4b47e 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h
@@ -23,6 +23,13 @@ class IRBuilderBase;
/// Convert the given Cmpxchg into primitive load and compare.
bool lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI);
+/// Emit IR to implement the given cmpxchg operation on values in registers,
+/// returning the new value.
+std::pair<Value *, Value *> buildAtomicCmpXchgValue(IRBuilderBase &Builder,
+ Value *Ptr, Value *Cmp,
+ Value *Val,
+ Align Alignment);
+
/// Convert the given RMWI into primitive load and stores,
/// assuming that doing so is legal. Return true if the lowering
/// succeeds.
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 5a3e529e5ebd02..37242d16001ca3 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -1671,6 +1671,10 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
return true;
case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
return lowerAtomicCmpXchgInst(CI);
+ case TargetLoweringBase::AtomicExpansionKind::Expand: {
+ TLI->emitExpandAtomicCmpXchg(CI);
+ return true;
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index febd741f947ee1..889c174eec07c6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16497,9 +16497,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
- return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
- : AtomicExpansionKind::None;
+ unsigned AddrSpace = CmpX->getPointerAddressSpace();
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
+ return AtomicExpansionKind::NotAtomic;
+
+ if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
+ return AtomicExpansionKind::None;
+
+ const DataLayout &DL = CmpX->getDataLayout();
+
+ Type *ValTy = CmpX->getNewValOperand()->getType();
+
+ // If a 64-bit flat atomic may alias private, we need to avoid using the
+ // atomic in the private case.
+ return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+ : AtomicExpansionKind::None;
}
const TargetRegisterClass *
@@ -16663,40 +16675,8 @@ bool SITargetLowering::checkForPhysRegDependency(
return false;
}
-void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
- AtomicRMWInst::BinOp Op = AI->getOperation();
-
- if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
- Op == AtomicRMWInst::Xor) {
- if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
- ConstVal && ConstVal->isNullValue()) {
- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
- AI->setOperation(AtomicRMWInst::Add);
-
- // TODO: Turn the below private handling into a no-op for idempotent
- // cases.
- }
- }
-
- // The non-flat expansions should only perform the de-canonicalization of
- // identity values.
- if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
- return;
-
- // FullFlatEmulation is true if we need to issue the private, shared, and
- // global cases.
- //
- // If this is false, we are only dealing with the flat-targeting-private case,
- // where we only insert a check for private and still use the flat instruction
- // for global and shared.
-
- // TODO: Avoid the private check for the fadd case depending on
- // noalias.addrspace.
-
- bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
- Subtarget->hasAtomicFaddInsts() &&
- AI->getType()->isFloatTy();
-
+void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
+ Instruction *AI) const {
// Given: atomicrmw fadd ptr %addr, float %val ordering
//
// With this expansion we produce the following code:
@@ -16743,6 +16723,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
IRBuilder<> Builder(AI);
LLVMContext &Ctx = Builder.getContext();
+ auto *RMW = dyn_cast<AtomicRMWInst>(AI);
+ const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
+ : AtomicCmpXchgInst::getPointerOperandIndex();
+ Value *Addr = AI->getOperand(PtrOpIdx);
+
+ /// TODO: Only need to check private, then emit flat-known-not private (no
+ /// need for shared block, or cast to global).
+ AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
+
+ Align Alignment;
+ if (RMW)
+ Alignment = RMW->getAlign();
+ else if (CX)
+ Alignment = CX->getAlign();
+ else
+ llvm_unreachable("unhandled atomic operation");
+
+ // FullFlatEmulation is true if we need to issue the private, shared, and
+ // global cases.
+ //
+ // If this is false, we are only dealing with the flat-targeting-private case,
+ // where we only insert a check for private and still use the flat instruction
+ // for global and shared.
+
+ bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
+ Subtarget->hasAtomicFaddInsts() &&
+ RMW->getType()->isFloatTy();
+
// If the return value isn't used, do not introduce a false use in the phi.
bool ReturnValueIsUsed = !AI->use_empty();
@@ -16764,11 +16772,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
- Value *Val = AI->getValOperand();
- Type *ValTy = Val->getType();
- Value *Addr = AI->getPointerOperand();
- Align Alignment = AI->getAlign();
-
std::prev(BB->end())->eraseFromParent();
Builder.SetInsertPoint(BB);
@@ -16783,8 +16786,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
Instruction *Clone = AI->clone();
Clone->insertInto(SharedBB, SharedBB->end());
- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
- .set(CastToLocal);
+ Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
LoadedShared = Clone;
Builder.CreateBr(PhiBB);
@@ -16796,14 +16798,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
Builder.SetInsertPoint(PrivateBB);
+
Value *CastToPrivate = Builder.CreateAddrSpaceCast(
Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
- Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
- Alignment, "loaded.private");
- Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
+ Value *LoadedPrivate;
+ if (RMW) {
+ LoadedPrivate = Builder.CreateAlignedLoad(
+ RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
+
+ Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
+ LoadedPrivate, RMW->getValOperand());
+
+ Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
+ } else {
+ auto [ResultLoad, Equal] =
+ buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
+ CX->getNewValOperand(), CX->getAlign());
+
+ Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
+ ResultLoad, 0);
+ LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
+ }
- Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
Builder.CreateBr(PhiBB);
Builder.SetInsertPoint(GlobalBB);
@@ -16813,8 +16830,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
if (FullFlatEmulation) {
Value *CastToGlobal = Builder.CreateAddrSpaceCast(
Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
- .set(CastToGlobal);
+ AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
}
AI->removeFromParent();
@@ -16838,7 +16854,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
Builder.SetInsertPoint(PhiBB);
if (ReturnValueIsUsed) {
- PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
+ PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
AI->replaceAllUsesWith(Loaded);
if (FullFlatEmulation)
Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16850,6 +16866,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
Builder.CreateBr(ExitBB);
}
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+ AtomicRMWInst::BinOp Op = AI->getOperation();
+
+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+ Op == AtomicRMWInst::Xor) {
+ if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+ ConstVal && ConstVal->isNullValue()) {
+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+ AI->setOperation(AtomicRMWInst::Add);
+
+ // We may still need the private-alias-flat handling below.
+
+ // TODO: Skip this for cases where we cannot access remote memory.
+ }
+ }
+
+ // The non-flat expansions should only perform the de-canonicalization of
+ // identity values.
+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+ return;
+
+ emitExpandAtomicAddrSpacePredicate(AI);
+}
+
+void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+ emitExpandAtomicAddrSpacePredicate(CI);
+}
+
LoadInst *
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
IRBuilder<> Builder(AI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 6c3edf37945e24..32e110fdfa84d4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -544,7 +544,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
AtomicExpansionKind
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
+ void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
+ void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/lib/Transforms/Utils/LowerAtomic.cpp b/llvm/lib/Transforms/Utils/LowerAtomic.cpp
index 8b3a0ce338e577..89c49d4a0732f6 100644
--- a/llvm/lib/Transforms/Utils/LowerAtomic.cpp
+++ b/llvm/lib/Transforms/Utils/LowerAtomic.cpp
@@ -25,13 +25,11 @@ bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
Value *Cmp = CXI->getCompareOperand();
Value *Val = CXI->getNewValOperand();
- LoadInst *Orig =
- Builder.CreateAlignedLoad(Val->getType(), Ptr, CXI->getAlign());
- Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
- Value *Res = Builder.CreateSelect(Equal, Val, Orig);
- Builder.CreateAlignedStore(Res, Ptr, CXI->getAlign());
+ auto [Orig, Equal] =
+ buildAtomicCmpXchgValue(Builder, Ptr, Cmp, Val, CXI->getAlign());
- Res = Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0);
+ Value *Res =
+ Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0);
Res = Builder.CreateInsertValue(Res, Equal, 1);
CXI->replaceAllUsesWith(Res);
@@ -39,6 +37,17 @@ bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
return true;
}
+std::pair<Value *, Value *>
+llvm::buildAtomicCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp,
+ Value *Val, Align Alignment) {
+ LoadInst *Orig = Builder.CreateAlignedLoad(Val->getType(), Ptr, Alignment);
+ Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
+ Value *Res = Builder.CreateSelect(Equal, Val, Orig);
+ Builder.CreateAlignedStore(Res, Ptr, Alignment);
+
+ return {Orig, Equal};
+}
+
Value *llvm::buildAtomicRMWValue(AtomicRMWInst::BinOp Op,
IRBuilderBase &Builder, Value *Loaded,
Value *Val) {
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 7b5797d4139a19..53d63c297d8098 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -5220,11 +5220,29 @@ entry:
define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) {
; GCN1-LABEL: atomic_cmpxchg_i64_offset:
; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN1-NEXT: s_mov_b32 s14, -1
+; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; GCN1-NEXT: s_add_u32 s12, s12, s9
+; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s2, s4, 32
; GCN1-NEXT: s_addc_u32 s3, s5, 0
+; GCN1-NEXT: s_cmp_eq_u32 s3, s8
+; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GCN1-NEXT: s_mov_b64 s[4:5], -1
+; GCN1-NEXT: s_cbranch_vccnz .LBB90_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_vccz .LBB90_4
+; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_endpgm
+; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v5, s3
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
@@ -5234,15 +5252,51 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execnz .LBB90_2
+; GCN1-NEXT: .LBB90_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
+; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT: s_cselect_b32 s2, s2, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
+; GCN1-NEXT: s_add_i32 s2, s2, 4
+; GCN1-NEXT: v_mov_b32_e32 v3, s2
+; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i64_offset:
; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GCN2-NEXT: s_mov_b32 s90, -1
+; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GCN2-NEXT: s_add_u32 s88, s88, s9
+; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s2, s4, 32
; GCN2-NEXT: s_addc_u32 s3, s5, 0
+; GCN2-NEXT: s_cmp_eq_u32 s3, s8
+; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GCN2-NEXT: s_mov_b64 s[4:5], -1
+; GCN2-NEXT: s_cbranch_vccnz .LBB90_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_vccz .LBB90_4
+; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_endpgm
+; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v5, s3
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
@@ -5252,6 +5306,23 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execnz .LBB90_2
+; GCN2-NEXT: .LBB90_4: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GCN2-NEXT: s_cselect_b32 s2, s2, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: s_add_i32 s2, s2, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s2
+; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_offset:
@@ -5259,14 +5330,39 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_cmp_eq_u32 s3, s9
+; GFX12-NEXT: s_cselect_b32 s4, -1, 0
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_mov_b32 s4, -1
+; GFX12-NEXT: s_cbranch_vccnz .LBB90_3
+; GFX12-NEXT: ; %bb.1: ; %Flow
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_vccz .LBB90_4
+; GFX12-NEXT: .LBB90_2: ; %atomicrmw.phi
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB90_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_cbranch_execnz .LBB90_2
+; GFX12-NEXT: .LBB90_4: ; %atomicrmw.private
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX12-NEXT: s_cselect_b32 s2, s2, -1
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s7, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s6, vcc_lo
+; GFX12-NEXT: scratch_store_b64 off, v[0:1], s2
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, p...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/109410
More information about the llvm-branch-commits mailing list