[llvm] e7ec123 - [AMDGPU] Implement idempotent atomic lowering
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 8 14:10:08 PST 2023
Author: Stanislav Mekhanoshin
Date: 2023-03-08T14:09:59-08:00
New Revision: e7ec123c6af9ed4856e301512444487528a59dee
URL: https://github.com/llvm/llvm-project/commit/e7ec123c6af9ed4856e301512444487528a59dee
DIFF: https://github.com/llvm/llvm-project/commit/e7ec123c6af9ed4856e301512444487528a59dee.diff
LOG: [AMDGPU] Implement idempotent atomic lowering
This turns an idempotent atomic operation into an atomic load.
Fixes: SWDEV-385135
Differential Revision: https://reviews.llvm.org/D144759
Added:
llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 74aaebaad4f1b..5f285e80796b7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13435,3 +13435,25 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
}
+
+LoadInst *
+SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
+ IRBuilder<> Builder(AI);
+ auto Order = AI->getOrdering();
+
+ // The optimization removes the store aspect of the atomicrmw. Therefore, the
+ // cache must be flushed if the atomic ordering has release semantics. This
+ // is not necessarily a fence; a release fence just happens to do that flush.
+ // Avoid replacing an atomicrmw that has release semantics.
+ if (isReleaseOrStronger(Order))
+ return nullptr;
+
+ LoadInst *LI = Builder.CreateAlignedLoad(
+ AI->getType(), AI->getPointerOperand(), AI->getAlign());
+ LI->setAtomic(Order, AI->getSyncScopeID());
+ LI->copyMetadata(*AI);
+ LI->takeName(AI);
+ AI->replaceAllUsesWith(LI);
+ AI->eraseFromParent();
+ return LI;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 0aba20585daf7..295f01e9843b5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -497,6 +497,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
+ LoadInst *
+ lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+
const TargetRegisterClass *getRegClassFor(MVT VT,
bool isDivergent) const override;
bool requiresUniformRegister(MachineFunction &MF,
diff --git a/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll
new file mode 100644
index 0000000000000..fbb36f1d7ec8f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -atomic-expand < %s | FileCheck --check-prefix=OPT %s
+
+define i32 @global_agent_monotonic_idempotent_or(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_agent_monotonic_idempotent_or:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v0, v[0:1], off sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_monotonic_idempotent_or(
+; OPT-NEXT: entry:
+; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("agent-one-as") monotonic, align 4
+; OPT-NEXT: ret i32 [[VAL]]
+;
+entry:
+ %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") monotonic, align 4
+ ret i32 %val
+}
+
+define i32 @global_agent_acquire_idempotent_or(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_agent_acquire_idempotent_or:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v0, v[0:1], off sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_acquire_idempotent_or(
+; OPT-NEXT: entry:
+; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("agent-one-as") acquire, align 4
+; OPT-NEXT: ret i32 [[VAL]]
+;
+entry:
+ %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") acquire, align 4
+ ret i32 %val
+}
+
+define i32 @global_agent_release_idempotent_or(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_agent_release_idempotent_or:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: global_atomic_or v0, v[0:1], v2, off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_release_idempotent_or(
+; OPT-NEXT: entry:
+; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") release, align 4
+; OPT-NEXT: ret i32 [[VAL]]
+;
+entry:
+ %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") release, align 4
+ ret i32 %val
+}
+
+define i32 @global_agent_acquire_release_idempotent_or(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_agent_acquire_release_idempotent_or:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: global_atomic_or v0, v[0:1], v2, off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_acquire_release_idempotent_or(
+; OPT-NEXT: entry:
+; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") acq_rel, align 4
+; OPT-NEXT: ret i32 [[VAL]]
+;
+entry:
+ %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") acq_rel, align 4
+ ret i32 %val
+}
+
+define i32 @global_agent_seq_cst_idempotent_or(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_agent_seq_cst_idempotent_or:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: global_atomic_or v0, v[0:1], v2, off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_seq_cst_idempotent_or(
+; OPT-NEXT: entry:
+; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") seq_cst, align 4
+; OPT-NEXT: ret i32 [[VAL]]
+;
+entry:
+ %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") seq_cst, align 4
+ ret i32 %val
+}
+
+define i32 @global_agent_monotonic_idempotent_add(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_agent_monotonic_idempotent_add:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v0, v[0:1], off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_monotonic_idempotent_add(
+; OPT-NEXT: entry:
+; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("workgroup") monotonic, align 4
+; OPT-NEXT: ret i32 [[VAL]]
+;
+entry:
+ %val = atomicrmw add ptr addrspace(1) %in, i32 0 syncscope("workgroup") monotonic, align 4
+ ret i32 %val
+}
+
+define i32 @global_agent_monotonic_idempotent_sub(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_agent_monotonic_idempotent_sub:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v0, v[0:1], off
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_monotonic_idempotent_sub(
+; OPT-NEXT: entry:
+; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("wavefront") monotonic, align 4
+; OPT-NEXT: ret i32 [[VAL]]
+;
+entry:
+ %val = atomicrmw sub ptr addrspace(1) %in, i32 0 syncscope("wavefront") monotonic, align 4
+ ret i32 %val
+}
+
+define i32 @global_system_monotonic_idempotent_xor(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_system_monotonic_idempotent_xor:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v0, v[0:1], off sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_system_monotonic_idempotent_xor(
+; OPT-NEXT: entry:
+; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] monotonic, align 4
+; OPT-NEXT: ret i32 [[VAL]]
+;
+entry:
+ %val = atomicrmw xor ptr addrspace(1) %in, i32 0 monotonic, align 4
+ ret i32 %val
+}
+
+define i32 @global_agent_monotonic_idempotent_and(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_agent_monotonic_idempotent_and:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v0, v[0:1], off
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_monotonic_idempotent_and(
+; OPT-NEXT: entry:
+; OPT-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("singlethread") monotonic, align 4
+; OPT-NEXT: ret i32 [[VAL]]
+;
+entry:
+ %val = atomicrmw and ptr addrspace(1) %in, i32 -1 syncscope("singlethread") monotonic, align 4
+ ret i32 %val
+}
More information about the llvm-commits
mailing list