[llvm] [AMDGPU][gfx1250] Use SCOPE_SE for stores that may hit scratch (PR #150586)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 28 02:04:17 PDT 2025
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/150586
From 0f8a62d457e813e18eb3b3e849a5bd8bdf71a822 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 24 Jul 2025 12:06:33 +0200
Subject: [PATCH 1/2] [AMDGPU][gfx1250] Use SCOPE_SE for stores that may hit
scratch
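On gfx1250, stores that may hit the scratch address space need to be
issued at scope SE or wider. Move the mayAccessScratchThroughFlat helper
from SIInsertWaitcnts into SIInstrInfo so SIMemoryLegalizer can reuse it,
and generalize expandSystemScopeStore into finalizeStore, which now also
runs on atomic stores.
As a minimal illustration (taken verbatim from the new test below), a
store through a scratch pointer
  define void @test_scratch_store(ptr addrspace(5) %ptr, i32 %val) {
    store i32 %val, ptr addrspace(5) %ptr
    ret void
  }
now selects to
  scratch_store_b32 v0, v1, off scope:SCOPE_SE
where it previously used the default CU scope.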
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 29 +-----
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 26 +++++
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 6 ++
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 29 ++++--
.../AMDGPU/gfx1250-scratch-scope-se.ll | 95 +++++++++++++++++++
llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll | 88 ++++++++---------
6 files changed, 194 insertions(+), 79 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll
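The effect on existing codegen shows up in the updated
global-load-xcnt.ll checks: callee-saved spill stores such as
  scratch_store_b32 off, v40, s32 offset:52
now carry an explicit scope modifier:
  scratch_store_b32 off, v40, s32 offset:52 scope:SCOPE_SE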
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index dd3f2fe25a239..520c321cb7ea8 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -552,7 +552,7 @@ class SIInsertWaitcnts {
(!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
// FLAT and SCRATCH instructions may access scratch. Other VMEM
// instructions do not.
- if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
+ if (TII->mayAccessScratchThroughFlat(Inst))
return SCRATCH_WRITE_ACCESS;
return VMEM_WRITE_ACCESS;
}
@@ -565,7 +565,6 @@ class SIInsertWaitcnts {
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
- bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
@@ -2160,32 +2159,6 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
return false;
}
-// This is a flat memory operation. Check to see if it has memory tokens for
-// either scratch or FLAT.
-bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
- const MachineInstr &MI) const {
- assert(TII->isFLAT(MI));
-
- // SCRATCH instructions always access scratch.
- if (TII->isFLATScratch(MI))
- return true;
-
- // GLOBAL instructions never access scratch.
- if (TII->isFLATGlobal(MI))
- return false;
-
- // If there are no memory operands then conservatively assume the flat
- // operation may access scratch.
- if (MI.memoperands_empty())
- return true;
-
- // See if any memory operand specifies an address space that involves scratch.
- return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
- unsigned AS = Memop->getAddrSpace();
- return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
- });
-}
-
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
(TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8d6c1d0466024..2aa6b4e82f9d5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4249,6 +4249,32 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}
+bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
+ if (!isFLAT(MI) || isFLATGlobal(MI))
+ return false;
+
+ // If scratch is not initialized, we can never access it.
+ if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
+ return false;
+
+ // SCRATCH instructions always access scratch.
+ if (isFLATScratch(MI))
+ return true;
+
+ // If there are no memory operands then conservatively assume the flat
+ // operation may access scratch.
+ if (MI.memoperands_empty())
+ return true;
+
+ // TODO (?): Does this need to be taught how to read !noalias.addrspace metadata?
+
+ // See if any memory operand specifies an address space that involves scratch.
+ return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
+ unsigned AS = Memop->getAddrSpace();
+ return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+ });
+}
+
bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
// Skip the full operand and register alias search modifiesRegister
// does. There's only a handful of instructions that touch this, it's only an
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 2ffb7835c3125..e042b59eb0f04 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -678,6 +678,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
+ /// \returns true for SCRATCH_ instructions, or for FLAT_ instructions that
+ /// may access scratch memory.
+ /// Conservatively correct: returns true whenever \p MI cannot be proven
+ /// not to access scratch.
+ bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+
static bool isBlockLoadStore(uint16_t Opcode) {
switch (Opcode) {
case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0e8a420fbb70a..d05bcdbc3e26f 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -321,7 +321,7 @@ class SICacheControl {
bool IsNonTemporal,
bool IsLastUse = false) const = 0;
- virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
+ virtual bool finalizeStore(MachineBasicBlock::iterator &MI, bool Atomic) const {
return false;
};
@@ -602,7 +602,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsVolatile, bool IsNonTemporal,
bool IsLastUse) const override;
- bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
+ bool finalizeStore(MachineBasicBlock::iterator &MI, bool Atomic) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
@@ -2551,11 +2551,25 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
-bool SIGfx12CacheControl::expandSystemScopeStore(
- MachineBasicBlock::iterator &MI) const {
+bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
+ bool Atomic) const {
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
- if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
- return insertWaitsBeforeSystemScopeStore(MI);
+ if (!CPol)
+ return false;
+
+ const unsigned Scope = CPol->getImm() & CPol::SCOPE;
+
+ // GFX12.0 only: Extra waits needed before system scope stores.
+ if (!ST.hasGFX1250Insts()) {
+ if (!Atomic && Scope == CPol::SCOPE_SYS)
+ return insertWaitsBeforeSystemScopeStore(MI);
+ return false;
+ }
+
+ // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
+ // space.
+ if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU)
+ return setScope(MI, CPol::SCOPE_SE);
return false;
}
@@ -2674,6 +2688,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
+ Changed |= CC->finalizeStore(MI, /*Atomic=*/true);
return Changed;
}
@@ -2686,7 +2701,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
// instruction field, do not confuse it with atomic scope.
- Changed |= CC->expandSystemScopeStore(MI);
+ Changed |= CC->finalizeStore(MI, /*Atomic=*/false);
return Changed;
}
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll
new file mode 100644
index 0000000000000..d1e82a06077f5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
+
+; Test that stores that may hit scratch are correctly promoted to SCOPE_SE.
+
+define void @test_scratch_store(ptr addrspace(5) %ptr, i32 %val) {
+; GCN-LABEL: test_scratch_store:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SE
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr addrspace(5) %ptr
+ ret void
+}
+
+define void @test_unknown_flat_store(ptr %ptr, i32 %val) {
+; GCN-LABEL: test_unknown_flat_store:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr %ptr
+ ret void
+}
+
+define void @test_flat_store_no_scratch_alloc(ptr %ptr, i32 %val) #0 {
+; GCN-LABEL: test_flat_store_no_scratch_alloc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: flat_store_b32 v[0:1], v2
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr %ptr
+ ret void
+}
+
+; TODO: handle the !noalias.addrspace metadata; it excludes the private
+; address space, so this store should not need SCOPE_SE.
+define void @test_flat_store_noalias_addrspace(ptr %ptr, i32 %val) {
+; GCN-LABEL: test_flat_store_noalias_addrspace:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr %ptr, !noalias.addrspace !{i32 5, i32 6}
+ ret void
+}
+
+; TODO: would be nice to handle too; neither select operand can point to scratch.
+define void @test_flat_store_select(ptr addrspace(1) %a, ptr addrspace(3) %b, i1 %cond, i32 %val) {
+; GCN-SDAG-LABEL: test_flat_store_select:
+; GCN-SDAG: ; %bb.0:
+; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
+; GCN-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2
+; GCN-SDAG-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
+; GCN-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo
+; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GCN-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v2, v0
+; GCN-SDAG-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE
+; GCN-SDAG-NEXT: s_wait_dscnt 0x0
+; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GCN-GISEL-LABEL: test_flat_store_select:
+; GCN-GISEL: ; %bb.0:
+; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2
+; GCN-GISEL-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GCN-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
+; GCN-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo
+; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GCN-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v5, v1
+; GCN-GISEL-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE
+; GCN-GISEL-NEXT: s_wait_dscnt 0x0
+; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
+ %a.ascast = addrspacecast ptr addrspace(1) %a to ptr
+ %b.ascast = addrspacecast ptr addrspace(3) %b to ptr
+ %ptr = select i1 %cond, ptr %a.ascast, ptr %b.ascast
+ store i32 %val, ptr %ptr
+ ret void
+}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index fd644a35f61e3..3a898a9214461 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -124,27 +124,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_clause 0xd
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40
-; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36
-; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32
-; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28
-; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24
-; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20
-; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16
-; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12
-; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8
-; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4
-; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32
+; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:224
; GCN-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:240
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-SDAG-NEXT: s_clause 0xd
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:192
; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:208
@@ -206,27 +206,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_clause 0xf
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40
-; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36
-; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32
-; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28
-; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24
-; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20
-; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16
-; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12
-; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8
-; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4
-; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32
+; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_wait_xcnt 0x8
; GCN-GISEL-NEXT: v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4
; GCN-GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off offset:32
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 ; 16-byte Folded Spill
+; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-GISEL-NEXT: s_clause 0xe
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:48
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off offset:64
@@ -244,7 +244,7 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-GISEL-NEXT: global_load_b128 v[60:63], v[0:1], off offset:16
; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:240
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 ; 16-byte Folded Spill
+; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
; GCN-GISEL-NEXT: s_clause 0xe
@@ -299,10 +299,10 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_clause 0x3
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32
+; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 scope:SCOPE_SE
; GCN-SDAG-NEXT: s_clause 0x7
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112
; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
@@ -385,12 +385,12 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_clause 0x5
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32
+; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_clause 0x7
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off
From 5c225b52301756b210b96401fb9b82216b810e11 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 25 Jul 2025 10:22:40 +0200
Subject: [PATCH 2/2] clang-format
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index d05bcdbc3e26f..607825ea50e7c 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -321,7 +321,8 @@ class SICacheControl {
bool IsNonTemporal,
bool IsLastUse = false) const = 0;
- virtual bool finalizeStore(MachineBasicBlock::iterator &MI, bool Atomic) const {
+ virtual bool finalizeStore(MachineBasicBlock::iterator &MI,
+ bool Atomic) const {
return false;
};
@@ -602,7 +603,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsVolatile, bool IsNonTemporal,
bool IsLastUse) const override;
- bool finalizeStore(MachineBasicBlock::iterator &MI, bool Atomic) const override;
+ bool finalizeStore(MachineBasicBlock::iterator &MI,
+ bool Atomic) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,