[llvm-branch-commits] [llvm] AMDGPU: Expand flat atomics that may access private memory (PR #109407)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Sep 20 04:56:15 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
Changes
If the runtime flat address resolves to a scratch address,
64-bit atomics do not work correctly. Insert a runtime address
space check (which is quite likely to be uniform) and select between
the non-atomic and real atomic cases.
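For illustration only, here is a minimal hand-written LLVM IR sketch of the private-only expansion this patch performs for a 64-bit flat atomic (the function name, operation, and ordering are assumptions; the real blocks are built by emitExpandAtomicRMW in the diff below):

```llvm
; Sketch: a 64-bit flat atomicrmw predicated on a runtime is.private check.
declare i1 @llvm.amdgcn.is.private(ptr)

define i64 @flat_atomic_add_i64_sketch(ptr %ptr, i64 %val) {
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:                                ; non-atomic scratch path
  %cast.private = addrspacecast ptr %ptr to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %cast.private, align 8
  %new = add i64 %loaded.private, %val
  store i64 %new, ptr addrspace(5) %cast.private, align 8
  br label %atomicrmw.phi

atomicrmw.global:                                 ; real atomic, still uses the flat pointer
  %loaded.global = atomicrmw add ptr %ptr, i64 %val seq_cst, align 8, !noalias.addrspace !0
  br label %atomicrmw.phi

atomicrmw.phi:
  %loaded = phi i64 [ %loaded.private, %atomicrmw.private ], [ %loaded.global, %atomicrmw.global ]
  ret i64 %loaded
}

; [5, 6) excludes the private address space, so this atomic is not expanded again.
!0 = !{i32 5, i32 6}
```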
Consider noalias.addrspace metadata and avoid this expansion when
possible (we also need to consider it to avoid infinitely expanding
after adding the predication code).
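As a hedged example of that metadata (mirroring the annotations added to the tests below): if the pointer is already known not to be scratch, tagging the atomic with a !noalias.addrspace range covering the private address space (5) lets the backend keep a single flat instruction, and the expansion places the same annotation on the retained atomic so it is not expanded a second time.

```llvm
; Sketch: an i64 flat atomic annotated as never aliasing private (addrspace 5),
; so no runtime is.private check needs to be inserted.
define i64 @flat_atomic_add_i64_no_private(ptr %ptr, i64 %val) {
  %result = atomicrmw add ptr %ptr, i64 %val seq_cst, align 8, !noalias.addrspace !0
  ret i64 %result
}

!0 = !{i32 5, i32 6}  ; half-open range [5, 6): excludes only the private address space
```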
---
Patch is 2.29 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109407.diff
22 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+106-33)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll (+11-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll (+11-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+8-7)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+1129-738)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+1115-724)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+1115-724)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+3243-1098)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+297-113)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+8745-1168)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+8-7)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll (+260-36)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll (+39-1)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll (+39-1)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll (+39-2)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll (+556-179)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll (+16-2)
- (modified) llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll (+5-4)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a9754ba357893f..febd741f947ee1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -39,6 +39,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
@@ -16236,12 +16237,39 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
: TargetLowering::AtomicExpansionKind::CmpXChg;
}
+/// Return if a flat address space atomicrmw can access private memory.
+static bool flatInstrMayAccessPrivate(const Instruction *I) {
+ const MDNode *NoaliasAddrSpaceMD =
+ I->getMetadata(LLVMContext::MD_noalias_addrspace);
+ if (!NoaliasAddrSpaceMD)
+ return true;
+
+ // FIXME: Can this actually fail? Why is this optional?
+ if (std::optional<ConstantRange> CR =
+ getConstantRangeFromMetadata(*NoaliasAddrSpaceMD)) {
+ return !CR->contains(APInt(32, AMDGPUAS::PRIVATE_ADDRESS));
+ }
+
+ llvm_unreachable("Why is getConstantRangeFromMetadata optional");
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
return AtomicExpansionKind::NotAtomic;
+ // 64-bit flat atomics that dynamically reside in private memory will silently
+ // be dropped.
+ //
+ // Note that we will emit a new copy of the original atomic in the expansion,
+ // which will be incrementally relegalized.
+ const DataLayout &DL = RMW->getFunction()->getDataLayout();
+ if (AS == AMDGPUAS::FLAT_ADDRESS &&
+ DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+ flatInstrMayAccessPrivate(RMW))
+ return AtomicExpansionKind::Expand;
+
auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
OptimizationRemarkEmitter ORE(RMW->getFunction());
ORE.emit([=]() {
@@ -16640,20 +16668,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
Op == AtomicRMWInst::Xor) {
- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
- assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
- "this cannot be replaced with add");
- AI->setOperation(AtomicRMWInst::Add);
- return;
+ if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+ ConstVal && ConstVal->isNullValue()) {
+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+ AI->setOperation(AtomicRMWInst::Add);
+
+ // TODO: Turn the below private handling into a no-op for idempotent
+ // cases.
+ }
}
- assert(Subtarget->hasAtomicFaddInsts() &&
- "target should have atomic fadd instructions");
- assert(AI->getType()->isFloatTy() &&
- AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
- "generic atomicrmw expansion only supports FP32 operand in flat "
- "address space");
- assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+ // The non-flat expansions should only perform the de-canonicalization of
+ // identity values.
+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+ return;
+
+ // FullFlatEmulation is true if we need to issue the private, shared, and
+ // global cases.
+ //
+ // If this is false, we are only dealing with the flat-targeting-private case,
+ // where we only insert a check for private and still use the flat instruction
+ // for global and shared.
+
+ // TODO: Avoid the private check for the fadd case depending on
+ // noalias.addrspace.
+
+ bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+ Subtarget->hasAtomicFaddInsts() &&
+ AI->getType()->isFloatTy();
// Given: atomicrmw fadd ptr %addr, float %val ordering
//
@@ -16693,6 +16735,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
//
// atomicrmw.end:
// [...]
+ //
+ //
+ // For 64-bit atomics which may reside in private memory, we perform a simpler
+ // version that only inserts the private check, and uses the flat operation.
IRBuilder<> Builder(AI);
LLVMContext &Ctx = Builder.getContext();
@@ -16704,9 +16750,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
Function *F = BB->getParent();
BasicBlock *ExitBB =
BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
- BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
- BasicBlock *CheckPrivateBB =
- BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+ BasicBlock *SharedBB = nullptr;
+
+ BasicBlock *CheckPrivateBB = BB;
+ if (FullFlatEmulation) {
+ SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+ CheckPrivateBB =
+ BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+ }
+
BasicBlock *PrivateBB =
BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16719,23 +16771,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
std::prev(BB->end())->eraseFromParent();
Builder.SetInsertPoint(BB);
- CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
- {Addr}, nullptr, "is.shared");
- Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
- Builder.SetInsertPoint(SharedBB);
- Value *CastToLocal = Builder.CreateAddrSpaceCast(
- Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+ Value *LoadedShared = nullptr;
+ if (FullFlatEmulation) {
+ CallInst *IsShared = Builder.CreateIntrinsic(
+ Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+ Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+ Builder.SetInsertPoint(SharedBB);
+ Value *CastToLocal = Builder.CreateAddrSpaceCast(
+ Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
- Instruction *Clone = AI->clone();
- Clone->insertInto(SharedBB, SharedBB->end());
- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
- .set(CastToLocal);
- Instruction *LoadedShared = Clone;
+ Instruction *Clone = AI->clone();
+ Clone->insertInto(SharedBB, SharedBB->end());
+ Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+ .set(CastToLocal);
+ LoadedShared = Clone;
- Builder.CreateBr(PhiBB);
+ Builder.CreateBr(PhiBB);
+ Builder.SetInsertPoint(CheckPrivateBB);
+ }
- Builder.SetInsertPoint(CheckPrivateBB);
CallInst *IsPrivate = Builder.CreateIntrinsic(
Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16752,15 +16807,32 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
Builder.CreateBr(PhiBB);
Builder.SetInsertPoint(GlobalBB);
- Value *CastToGlobal = Builder.CreateAddrSpaceCast(
- Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
- Value *LoadedGlobal = AI;
- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+ // Continue using a flat instruction if we only emitted the check for private.
+ Instruction *LoadedGlobal = AI;
+ if (FullFlatEmulation) {
+ Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+ Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+ AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+ .set(CastToGlobal);
+ }
AI->removeFromParent();
AI->insertInto(GlobalBB, GlobalBB->end());
+ // The new atomicrmw may go through another round of legalization later.
+ if (!FullFlatEmulation) {
+ // We inserted the runtime check already, make sure we do not try to
+ // re-expand this.
+ // TODO: Should union with any existing metadata.
+ MDBuilder MDB(F->getContext());
+ MDNode *RangeNotPrivate =
+ MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
+ APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
+ LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
+ RangeNotPrivate);
+ }
+
Builder.CreateBr(PhiBB);
Builder.SetInsertPoint(PhiBB);
@@ -16768,7 +16840,8 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
if (ReturnValueIsUsed) {
PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
AI->replaceAllUsesWith(Loaded);
- Loaded->addIncoming(LoadedShared, SharedBB);
+ if (FullFlatEmulation)
+ Loaded->addIncoming(LoadedShared, SharedBB);
Loaded->addIncoming(LoadedPrivate, PrivateBB);
Loaded->addIncoming(LoadedGlobal, GlobalBB);
Loaded->takeName(AI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index df81b926bceb39..eea4fd5c20cec0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -1343,7 +1343,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret double %result
}
@@ -1494,7 +1494,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2230,3 +2230,4 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
}
!0 = !{}
+!1 = !{i32 5, i32 6}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 53d9bf0751a1d4..f47ea7bd458fb4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -1343,7 +1343,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret double %result
}
@@ -1494,7 +1494,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2230,3 +2230,4 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
}
!0 = !{}
+!1 = !{i32 5, i32 6}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index 705bcbddf227a6..f5555f8251b47e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -1657,7 +1657,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
store i64 %result, ptr %out, align 4
ret void
}
@@ -1759,7 +1759,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
store i64 %result, ptr %out, align 4
ret void
}
@@ -1832,7 +1832,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
ret void
}
@@ -1911,7 +1911,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
ret void
}
@@ -1990,7 +1990,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0
ret void
}
@@ -2118,7 +2118,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
%out.gep = getelementptr i64, ptr %out, i32 %id
%gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
store i64 %result, ptr %out.gep, align 4
ret void
}
@@ -2217,7 +2217,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
%gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
ret void
}
@@ -3340,7 +3340,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
- %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
store i32 %idx.0, ptr addrspace(1) %add_use, align 4
store i64 %result, ptr addrspace(1) %out, align 4
ret void
@@ -3349,5 +3349,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
attributes #0 = { nounwind speculatable willreturn memory(none) }
attributes #1 = { nounwind }
attributes #2 = { nounwind memory(none) }
+
+!0 = !{i32 5, i32 6}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index b3a7e65f771c43..3090cc4dddaf87 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -2782,7 +2782,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
store i64 %result, ptr %out, align 4
ret void
}
@@ -2884,7 +2884,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
store i64 %result, ptr %out, align 4
ret void
}
@@ -2986,7 +2986,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0
store i64 %result, ptr %out, align 4
ret void
}
@@ -3059,7 +3059,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
ret void
}
@@ -3138,7 +3138,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
ret void
}
@@ -3217,7 +3217,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0
ret void
}
@@ -3345,7 +3345,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
%out.gep = getelementptr i64, ptr %out, i32 %id
%gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
store i64 %result, ptr %out.gep, align 4
ret void
}
@@ -3444,7 +3444,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
%gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
ret void
}
@@ -3554,5 +3554,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
attributes #0 = { nounwind speculatable willreturn memory(none) }
attributes #1 = { nounwind }
attributes #2 = { nounwind memory(none) }
+
+!0 = !{i32 5, i32 6}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/109407
More information about the llvm-branch-commits mailing list