[llvm-branch-commits] [llvm] AMDGPU: Start considering new atomicrmw metadata on integer operations (PR #122138)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jan 8 08:33:49 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
Changes:
Start considering the !amdgpu.no.remote.memory and
!amdgpu.no.fine.grained.memory metadata when deciding whether to expand
integer atomic operations. This does not yet attempt to handle
fadd/fmin/fmax accurately; those are trickier and require migrating the
old "amdgpu-unsafe-fp-atomics" attribute.
---
Patch is 1.43 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/122138.diff
28 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+53-12)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll (+31-30)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll (+2521-614)
- (modified) llvm/test/CodeGen/AMDGPU/acc-ldst.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+14-12)
- (modified) llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics.ll (+87-85)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (+100-926)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+82-80)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll (+3758-1362)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+258-1374)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll (+100-1212)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll (+82-80)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics.ll (+319-79)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll (+98-984)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll (+82-80)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll (+109-1221)
- (modified) llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll (+42-28)
- (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll (+1-1)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll (+534-159)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll (+990-49)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll (+30-330)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll (+990-49)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll (+30-330)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll (+209-6)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll (+130-8)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll (+43-21)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 513251e398ad4d..5fa8e1532096f7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16621,19 +16621,60 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::UDecWrap: {
if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
- // Always expand system scope atomics.
- if (HasSystemScope) {
- if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
- Op == AtomicRMWInst::Xor) {
- // Atomic sub/or/xor do not work over PCI express, but atomic add
- // does. InstCombine transforms these with 0 to or, so undo that.
- if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
- ConstVal && ConstVal->isNullValue())
- return AtomicExpansionKind::Expand;
- }
-
- return AtomicExpansionKind::CmpXChg;
+ // On most subtargets, for atomicrmw operations other than add/xchg,
+ // whether or not the instructions will behave correctly depends on where
+ // the address physically resides and what interconnect is used in the
+    // system configuration. On some targets the instruction will nop,
+ // and in others synchronization will only occur at degraded device scope.
+ //
+ // If the allocation is known local to the device, the instructions should
+ // work correctly.
+ if (RMW->hasMetadata("amdgpu.no.remote.memory"))
+ return atomicSupportedIfLegalIntType(RMW);
+
+ // If fine-grained remote memory works at device scope, we don't need to
+ // do anything.
+ if (!HasSystemScope &&
+ Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
+ return atomicSupportedIfLegalIntType(RMW);
+
+    // If we are targeting a remotely allocated address, behavior depends on
+    // the kind of allocation the address belongs to.
+ //
+    // If the allocation is fine-grained (in host memory, or in PCIe peer
+    // device memory), the operation may fail depending on the target.
+ //
+    // Note that fine-grained host memory access does work on APUs and when
+    // XGMI is used, but we cannot tell whether we are targeting an APU, or
+    // what the system configuration is, from the ISA version/target-cpu.
+ if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
+ return atomicSupportedIfLegalIntType(RMW);
+
+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+ Op == AtomicRMWInst::Xor) {
+ // Atomic sub/or/xor do not work over PCI express, but atomic add
+ // does. InstCombine transforms these with 0 to or, so undo that.
+ if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
+ ConstVal && ConstVal->isNullValue())
+ return AtomicExpansionKind::Expand;
}
+
+    // If the allocation could be in remote, fine-grained memory, the rmw
+    // instructions may fail. cmpxchg should work, so emit that instead. On
+    // some system configurations PCIe atomics are not supported at all, in
+    // which case even the cmpxchg will fail.
+
+ // In summary:
+ //
+ // Cases that may fail:
+ // - fine-grained pinned host memory
+ // - fine-grained migratable host memory
+ // - fine-grained PCIe peer device
+ //
+    // Cases that should work, but may be treated overly conservatively:
+ // - fine-grained host memory on an APU
+ // - fine-grained XGMI peer device
+ return AtomicExpansionKind::CmpXChg;
}
return atomicSupportedIfLegalIntType(RMW);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index 35aa3cfbc841c8..11d01189375ce5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -85,7 +85,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -350,7 +350,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -427,7 +427,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -656,7 +656,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -723,7 +723,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -962,7 +962,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id
%gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out.gep, align 4
ret void
}
@@ -1040,7 +1040,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
%gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -1119,7 +1119,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr %out, align 4
ret void
}
@@ -1206,7 +1206,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr %out, align 4
ret void
}
@@ -1442,7 +1442,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -1516,7 +1516,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -1780,7 +1780,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
%gep.tid = getelementptr i32, ptr %ptr, i32 %id
%out.gep = getelementptr i32, ptr %out, i32 %id
%gep = getelementptr i32, ptr %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr %out.gep, align 4
ret void
}
@@ -1875,7 +1875,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr %ptr, i32 %id
%gep = getelementptr i32, ptr %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -1969,7 +1969,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
store i64 %result, ptr %out, align 4
ret void
}
@@ -2071,7 +2071,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
store i64 %result, ptr %out, align 4
ret void
}
@@ -2144,7 +2144,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
ret void
}
@@ -2223,7 +2223,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
ret void
}
@@ -2536,7 +2536,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
%out.gep = getelementptr i64, ptr %out, i32 %id
%gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
store i64 %result, ptr %out.gep, align 4
ret void
}
@@ -2635,7 +2635,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
%gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
ret void
}
@@ -2724,7 +2724,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
- %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i32 9 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i32 9 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %idx.0, ptr addrspace(1) %add_use, align 4
store i32 %result, ptr addrspace(1) %out, align 4
ret void
@@ -2807,7 +2807,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -2953,7 +2953,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -3016,7 +3016,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -3092,7 +3092,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -3174,7 +3174,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -3440,7 +3440,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -3512,7 +3512,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -3788,7 +3788,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
%gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id
%gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
store i64 %result, ptr addrspace(1) %out.gep, align 4
ret void
}
@@ -3871,7 +3871,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
%gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -3966,7 +3966,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
- %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
store i32 %idx.0, ptr addrspace(1) %add_use, align 4
store i64 %result, ptr addrspace(1) %out, align 4
ret void
@@ -3977,6 +3977,7 @@ attributes #1 = { nounwind }
attributes #2 = { nounwind memory(none) }
!0 = !{i32 5, i32 6}
+!1 = !{}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index 00d932b584aaf9..1eefad8b6cc6e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -332,13 +332,28 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_inc_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_mov_b64 s[4:5], 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_load_dword s6, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v2, s6
+; CI-NEXT: .LBB4_1: ; %atomicrmw.start
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; C...
[truncated]
``````````
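
As a concrete illustration of the sub/or/xor special case in the patch (a sketch only; the function names are made up): InstCombine canonicalizes an idempotent atomic such as `atomicrmw sub %p, 0` to `or` with 0, which does not work over PCIe, so the expansion rewrites it back to an `add` with 0, which does. This is the pattern exercised by expand-atomicrmw-integer-ops-0-to-add-0.ll:

```llvm
; Input: InstCombine has turned an idempotent atomic into 'or 0'.
define i32 @idempotent_rmw(ptr addrspace(1) %p) {
  %res = atomicrmw or ptr addrspace(1) %p, i32 0 seq_cst, align 4
  ret i32 %res
}

; After AtomicExpand (expected shape): 'add 0' is equivalent and is
; supported by PCIe atomics.
define i32 @idempotent_rmw.expanded(ptr addrspace(1) %p) {
  %res = atomicrmw add ptr addrspace(1) %p, i32 0 seq_cst, align 4
  ret i32 %res
}
```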
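
When neither metadata node is present and the address may be in fine-grained remote memory, the AtomicExpansionKind::CmpXChg result causes AtomicExpand to rewrite the rmw as a compare-exchange loop. Roughly the shape produced for a sub (block and value names are illustrative, not the pass's exact output):

```llvm
define i32 @sub_via_cmpxchg(ptr addrspace(1) %p, i32 %v) {
entry:
  %init = load i32, ptr addrspace(1) %p, align 4
  br label %atomicrmw.start

atomicrmw.start:
  %loaded = phi i32 [ %init, %entry ], [ %new.loaded, %atomicrmw.start ]
  ; Compute the new value, then try to publish it atomically; retry on
  ; failure with the value observed by the cmpxchg.
  %new = sub i32 %loaded, %v
  %pair = cmpxchg ptr addrspace(1) %p, i32 %loaded, i32 %new syncscope("agent") seq_cst seq_cst, align 4
  %new.loaded = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start

atomicrmw.end:
  ret i32 %new.loaded
}
```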
https://github.com/llvm/llvm-project/pull/122138