[llvm] 20795e0 - [AMDGPU][SIMemoryLegalizer] Combine GFX10-11 CacheControl Classes (#168058)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 18 01:13:01 PST 2025
Author: Pierre van Houtryve
Date: 2025-11-18T10:12:56+01:00
New Revision: 20795e06ed8b91109d5237ffb02f37245d1f781d
URL: https://github.com/llvm/llvm-project/commit/20795e06ed8b91109d5237ffb02f37245d1f781d
DIFF: https://github.com/llvm/llvm-project/commit/20795e06ed8b91109d5237ffb02f37245d1f781d.diff
LOG: [AMDGPU][SIMemoryLegalizer] Combine GFX10-11 CacheControl Classes (#168058)
Also breaks the long inheritance chains by making both
`SIGfx10CacheControl` and
`SIGfx12CacheControl` inherit from `SICacheControl` directly.
With this patch, we now just have 3 `SICacheControl` implementations
that each
do their own thing, and there is no more code hidden 3 superclasses
above (which made this code harder to read and maintain than it needed
to be).
Added:
Modified:
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 49aba39872138..bf04c7fa132c0 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -404,7 +404,7 @@ class SICacheControl {
/// Generates code sequences for the memory model of all GFX targets below
/// GFX10.
-class SIGfx6CacheControl : public SICacheControl {
+class SIGfx6CacheControl final : public SICacheControl {
public:
SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
@@ -443,14 +443,27 @@ class SIGfx6CacheControl : public SICacheControl {
Position Pos) const override;
};
-class SIGfx10CacheControl : public SIGfx6CacheControl {
+/// Generates code sequences for the memory model of GFX10/11.
+class SIGfx10CacheControl final : public SICacheControl {
public:
- SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
+ SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
+ bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override {
+ return false;
+ }
+
+ bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override {
+ return false;
+ }
+
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile, bool IsNonTemporal,
@@ -463,23 +476,17 @@ class SIGfx10CacheControl : public SIGfx6CacheControl {
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
-};
-
-class SIGfx11CacheControl : public SIGfx10CacheControl {
-public:
- SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
- bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal,
- bool IsLastUse) const override;
+ bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override {
+ return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
+ }
};
-class SIGfx12CacheControl : public SIGfx11CacheControl {
+class SIGfx12CacheControl final : public SICacheControl {
protected:
// Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
// \returns Returns true if \p MI is modified, false otherwise.
@@ -504,7 +511,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
public:
- SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
+ SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
// GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
// the behavior is the same if assuming GFX12.0 in CU mode.
assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
@@ -915,10 +922,8 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
if (Generation < AMDGPUSubtarget::GFX10)
return std::make_unique<SIGfx6CacheControl>(ST);
- if (Generation < AMDGPUSubtarget::GFX11)
- return std::make_unique<SIGfx10CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX12)
- return std::make_unique<SIGfx11CacheControl>(ST);
+ return std::make_unique<SIGfx10CacheControl>(ST);
return std::make_unique<SIGfx12CacheControl>(ST);
}
@@ -1438,8 +1443,7 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
}
bool SIGfx10CacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;
@@ -1450,7 +1454,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
case SIAtomicScope::AGENT:
// Set the L0 and L1 cache policies to MISS_EVICT.
// Note: there is no L2 cache coherent bypass control at the ISA level.
- Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
+ // For GFX10, set GLC+DLC, for GFX11, only set GLC.
+ Changed |=
+ enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0));
break;
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
@@ -1504,6 +1510,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
}
+ // GFX11: Set MALL NOALLOC for both load and store instructions.
+ if (AMDGPU::isGFX11(ST))
+ Changed |= enableCPolBits(MI, CPol::DLC);
+
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
@@ -1524,6 +1534,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
Changed |= enableCPolBits(MI, CPol::GLC);
Changed |= enableCPolBits(MI, CPol::SLC);
+ // GFX11: Set MALL NOALLOC for both load and store instructions.
+ if (AMDGPU::isGFX11(ST))
+ Changed |= enableCPolBits(MI, CPol::DLC);
+
return Changed;
}
@@ -1722,102 +1736,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}
-bool SIGfx11CacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- // Set the L0 and L1 cache policies to MISS_EVICT.
- // Note: there is no L2 cache coherent bypass control at the ISA level.
- Changed |= enableCPolBits(MI, CPol::GLC);
- break;
- case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
- // CU mode all waves of a work-group are on the same CU, and so the L0
- // does not need to be bypassed.
- if (!ST.isCuModeEnabled())
- Changed |= enableCPolBits(MI, CPol::GLC);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
- MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
-
- // Only handle load and store, not atomic read-modify-write insructions. The
- // latter use glc to indicate if the atomic returns a result and so must not
- // be used for cache control.
- assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
-
- // Only update load and store, not LLVM IR atomic read-modify-write
- // instructions. The latter are always marked as volatile so cannot sensibly
- // handle it as do not want to pessimize all atomics. Also they do not support
- // the nontemporal attribute.
- assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
- bool Changed = false;
-
- if (IsVolatile) {
- // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
- // and MISS_LRU for store instructions.
- // Note: there is no L2 cache coherent bypass control at the ISA level.
- if (Op == SIMemOp::LOAD)
- Changed |= enableCPolBits(MI, CPol::GLC);
-
- // Set MALL NOALLOC for load and store instructions.
- Changed |= enableCPolBits(MI, CPol::DLC);
-
- // Ensure operation has completed at system scope to cause all volatile
- // operations to be visible outside the program in a global order. Do not
- // request cross address space as only the global address space can be
- // observable outside the program, so no need to cause a waitcnt for LDS
- // address space operations.
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered,
- /*AtomicsOnly=*/false);
- return Changed;
- }
-
- if (IsNonTemporal) {
- // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
- // and L2 cache policy to STREAM.
- // For stores setting both GLC and SLC configures L0 and L1 cache policy
- // to MISS_EVICT and the L2 cache policy to STREAM.
- if (Op == SIMemOp::STORE)
- Changed |= enableCPolBits(MI, CPol::GLC);
- Changed |= enableCPolBits(MI, CPol::SLC);
-
- // Set MALL NOALLOC for load and store instructions.
- Changed |= enableCPolBits(MI, CPol::DLC);
- return Changed;
- }
-
- return Changed;
-}
-
bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
AMDGPU::CPol::CPol Value) const {
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
More information about the llvm-commits
mailing list