[llvm-branch-commits] [llvm] [AMDGPU] Expand scratch atomics to flat atomics if GAS is enabled (PR #154710)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Aug 21 02:34:14 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Pierre van Houtryve (Pierre-vh)
<details>
<summary>Changes</summary>
---
Patch is 1.02 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154710.diff
9 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+46-4)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+2)
- (modified) llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll (-12)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll (+3235-504)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll (+2892-540)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll (+3131-475)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll (+2892-540)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll (+2938-540)
- (added) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-private-gas.ll (+172)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 561019bb65549..60faf211df0d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17808,11 +17808,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
!AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
}
+static TargetLowering::AtomicExpansionKind
+getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
+ // For GAS, lower to flat atomic.
+ return STI.hasGloballyAddressableScratch()
+ ? TargetLowering::AtomicExpansionKind::Expand
+ : TargetLowering::AtomicExpansionKind::NotAtomic;
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
- return AtomicExpansionKind::NotAtomic;
+ return getPrivateAtomicExpansionKind(*getSubtarget());
// 64-bit flat atomics that dynamically reside in private memory will silently
// be dropped.
@@ -18038,14 +18046,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
+ ? getPrivateAtomicExpansionKind(*getSubtarget())
: AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
+ ? getPrivateAtomicExpansionKind(*getSubtarget())
: AtomicExpansionKind::None;
}
@@ -18053,7 +18061,7 @@ TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
unsigned AddrSpace = CmpX->getPointerAddressSpace();
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
- return AtomicExpansionKind::NotAtomic;
+ return getPrivateAtomicExpansionKind(*getSubtarget());
if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
return AtomicExpansionKind::None;
@@ -18423,9 +18431,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
Builder.CreateBr(ExitBB);
}
+static void convertScratchAtomicToFlatAtomic(Instruction *I,
+ unsigned PtrOpIdx) {
+ Value *PtrOp = I->getOperand(PtrOpIdx);
+ assert(PtrOp->getType()->getPointerAddressSpace() ==
+ AMDGPUAS::PRIVATE_ADDRESS);
+
+ Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
+ Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
+ I->getIterator());
+ I->setOperand(PtrOpIdx, ASCast);
+}
+
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
AtomicRMWInst::BinOp Op = AI->getOperation();
+ if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
+
if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
Op == AtomicRMWInst::Xor) {
if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
@@ -18448,9 +18471,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
}
void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+ if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex());
+
emitExpandAtomicAddrSpacePredicate(CI);
}
+void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
+ if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
+
+ llvm_unreachable(
+ "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
+}
+
+void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
+ if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
+
+ llvm_unreachable(
+ "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
+}
+
LoadInst *
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
IRBuilder<> Builder(AI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index dedd9ae170774..e96b702367299 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -562,6 +562,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
+ void emitExpandAtomicLoad(LoadInst *LI) const override;
+ void emitExpandAtomicStore(StoreInst *SI) const override;
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
index d13d76fcfabf4..fcdba69c30213 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
@@ -86,15 +86,3 @@ entry:
store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
ret void
}
-
-; GCN: scratch_atomic_store:
-; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; GCN: .amdhsa_kernel scratch_atomic_store
-; CU: .amdhsa_uses_cu_stores 1
-; NOCU: .amdhsa_uses_cu_stores 0
-define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
-entry:
- store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
index af5b529fc387e..fe345f9244066 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
@@ -179,11 +179,35 @@ define amdgpu_kernel void @private_agent_unordered_load(
;
; GFX1250-LABEL: private_agent_unordered_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +382,35 @@ define amdgpu_kernel void @private_agent_monotonic_load(
;
; GFX1250-LABEL: private_agent_monotonic_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +585,36 @@ define amdgpu_kernel void @private_agent_acquire_load(
;
; GFX1250-LABEL: private_agent_acquire_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +789,42 @@ define amdgpu_kernel void @private_agent_seq_cst_load(
;
; GFX1250-LABEL: private_agent_seq_cst_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +979,35 @@ define amdgpu_kernel void @private_agent_unordered_store(
;
; GFX1250-LABEL: private_agent_unordered_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1032,11 +1160,35 @@ define amdgpu_kernel void @private_agent_monotonic_store(
;
; GFX1250-LABEL: private_agent_monotonic_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1189,11 +1341,39 @@ define amdgpu_kernel void @private_agent_release_store(
;
; GFX1250-LABEL: private_agent_release_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1346,11 +1526,39 @@ define amdgpu_kernel void @private_agent_seq_cst_store(
;
; GFX1250-LABEL: private_agent_seq_cst_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/154710
More information about the llvm-branch-commits mailing list