[llvm-branch-commits] [llvm] [AMDGPU] Expand scratch atomics to flat atomics if GAS is enabled (PR #154710)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Aug 21 02:34:15 PDT 2025


llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

<details>
<summary>Changes</summary>



---

Patch is 1.02 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154710.diff


9 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+46-4) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll (-12) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll (+3235-504) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll (+2892-540) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll (+3131-475) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll (+2892-540) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll (+2938-540) 
- (added) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-private-gas.ll (+172) 

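At a high level, the patch changes `shouldExpandAtomicRMWInIR`, `shouldExpandAtomicLoadInIR`, `shouldExpandAtomicStoreInIR`, and `shouldExpandAtomicCmpXchgInIR` to return `Expand` instead of `NotAtomic` for private (scratch) pointers when the subtarget has globally addressable scratch (GAS), and the expansion hooks rewrite the scratch pointer into a flat pointer. A minimal LLVM IR sketch of the intended before/after for an `atomicrmw` (function and value names here are illustrative, not taken from the patch or its tests):

```llvm
; Before AtomicExpand: an atomic RMW on a scratch (addrspace(5)) pointer.
define i32 @scratch_rmw(ptr addrspace(5) %p, i32 %v) {
  %old = atomicrmw add ptr addrspace(5) %p, i32 %v monotonic
  ret i32 %old
}

; After AtomicExpand on a GAS-capable subtarget: the pointer is addrspace-cast
; to flat (addrspace(0)) and the atomic itself is kept, now as a flat atomic.
define i32 @scratch_rmw_expanded(ptr addrspace(5) %p, i32 %v) {
  %scratch.ascast = addrspacecast ptr addrspace(5) %p to ptr
  %old = atomicrmw add ptr %scratch.ascast, i32 %v monotonic
  ret i32 %old
}
```

The diff below shows the corresponding backend changes and test updates.
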

``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 561019bb65549..60faf211df0d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17808,11 +17808,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
          !AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
 }
 
+static TargetLowering::AtomicExpansionKind
+getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
+  // For GAS, lower to flat atomic.
+  return STI.hasGloballyAddressableScratch()
+             ? TargetLowering::AtomicExpansionKind::Expand
+             : TargetLowering::AtomicExpansionKind::NotAtomic;
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
-    return AtomicExpansionKind::NotAtomic;
+    return getPrivateAtomicExpansionKind(*getSubtarget());
 
   // 64-bit flat atomics that dynamically reside in private memory will silently
   // be dropped.
@@ -18038,14 +18046,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
+             ? getPrivateAtomicExpansionKind(*getSubtarget())
              : AtomicExpansionKind::None;
 }
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
+             ? getPrivateAtomicExpansionKind(*getSubtarget())
              : AtomicExpansionKind::None;
 }
 
@@ -18053,7 +18061,7 @@ TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
   unsigned AddrSpace = CmpX->getPointerAddressSpace();
   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
-    return AtomicExpansionKind::NotAtomic;
+    return getPrivateAtomicExpansionKind(*getSubtarget());
 
   if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
     return AtomicExpansionKind::None;
@@ -18423,9 +18431,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
   Builder.CreateBr(ExitBB);
 }
 
+static void convertScratchAtomicToFlatAtomic(Instruction *I,
+                                             unsigned PtrOpIdx) {
+  Value *PtrOp = I->getOperand(PtrOpIdx);
+  assert(PtrOp->getType()->getPointerAddressSpace() ==
+         AMDGPUAS::PRIVATE_ADDRESS);
+
+  Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
+  Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
+                                              I->getIterator());
+  I->setOperand(PtrOpIdx, ASCast);
+}
+
 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   AtomicRMWInst::BinOp Op = AI->getOperation();
 
+  if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
+
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
     if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
@@ -18448,9 +18471,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 }
 
 void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex());
+
   emitExpandAtomicAddrSpacePredicate(CI);
 }
 
+void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
+  if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
+
+  llvm_unreachable(
+      "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
+}
+
+void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
+  if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
+
+  llvm_unreachable(
+      "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index dedd9ae170774..e96b702367299 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -562,6 +562,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
   void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
   void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
+  void emitExpandAtomicLoad(LoadInst *LI) const override;
+  void emitExpandAtomicStore(StoreInst *SI) const override;
 
   LoadInst *
   lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
index d13d76fcfabf4..fcdba69c30213 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
@@ -86,15 +86,3 @@ entry:
   store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
   ret void
 }
-
-; GCN:     scratch_atomic_store:
-; CU:        scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; NOCU:      scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; GCN:     .amdhsa_kernel scratch_atomic_store
-; CU:        .amdhsa_uses_cu_stores 1
-; NOCU:      .amdhsa_uses_cu_stores 0
-define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
-entry:
-  store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
index af5b529fc387e..fe345f9244066 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
@@ -179,11 +179,35 @@ define amdgpu_kernel void @private_agent_unordered_load(
 ;
 ; GFX1250-LABEL: private_agent_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +382,35 @@ define amdgpu_kernel void @private_agent_monotonic_load(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +585,36 @@ define amdgpu_kernel void @private_agent_acquire_load(
 ;
 ; GFX1250-LABEL: private_agent_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +789,42 @@ define amdgpu_kernel void @private_agent_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +979,35 @@ define amdgpu_kernel void @private_agent_unordered_store(
 ;
 ; GFX1250-LABEL: private_agent_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1032,11 +1160,35 @@ define amdgpu_kernel void @private_agent_monotonic_store(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1189,11 +1341,39 @@ define amdgpu_kernel void @private_agent_release_store(
 ;
 ; GFX1250-LABEL: private_agent_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1346,11 +1526,39 @@ define amdgpu_kernel void @private_agent_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:...
[truncated]

``````````
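
The new `emitExpandAtomicLoad` / `emitExpandAtomicStore` overrides only handle this scratch-to-flat rewrite (any other case hits `llvm_unreachable`), which is what the updated memory-legalizer tests reflect: atomic scratch loads and stores are selected as `flat_load_*` / `flat_store_*` through the cast pointer instead of `scratch_load_*` / `scratch_store_*`. A small sketch in the same spirit for the store case (names are hypothetical, not from the new test file):

```llvm
; An unordered atomic store to a scratch (addrspace(5)) pointer ...
define void @private_atomic_store(ptr addrspace(5) %dst, i32 %val) {
  store atomic i32 %val, ptr addrspace(5) %dst unordered, align 4
  ret void
}

; ... would be rewritten by AtomicExpand into a flat store through an
; addrspacecast pointer on subtargets with globally addressable scratch.
define void @private_atomic_store_expanded(ptr addrspace(5) %dst, i32 %val) {
  %scratch.ascast = addrspacecast ptr addrspace(5) %dst to ptr
  store atomic i32 %val, ptr %scratch.ascast unordered, align 4
  ret void
}
```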

</details>


https://github.com/llvm/llvm-project/pull/154710


More information about the llvm-branch-commits mailing list