[llvm] [AMDGPU][SIMemoryLegalizer] Combine all GFX6-9 CacheControl Classes (PR #168052)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 17 01:07:06 PST 2025
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/168052
From 3d3df4c637f3fac616dff63e23c7f0180b1828f5 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 14 Nov 2025 14:11:10 +0100
Subject: [PATCH 1/2] [AMDGPU][SIMemoryLegalizer] Combine all GFX6-9
CacheControl Classes
Merge the following classes into `SIGfx6CacheControl`:
- SIGfx7CacheControl
- SIGfx90ACacheControl
- SIGfx940CacheControl
They were all very similar, with a lot of duplicated boilerplate just to implement one or two codegen differences. GFX90A/GFX940 differ a bit more, but they are still manageable under one class because the general behavior is the same.
This removes roughly 500 lines of code and puts everything in a single place, which I think makes it much easier to maintain, at the cost of a slight increase in complexity for some functions.
There is still a lot of room for improvement, but I think this patch is already big enough as is, and I don't want to bundle too much into one review.
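For illustration, here is a minimal sketch of the pattern the patch applies (hypothetical names and simplified signatures, not the actual LLVM code): the per-generation subclass overrides turn into runtime subtarget checks inside the one remaining class, which is what the `ST.hasGFX90AInsts()`/`ST.hasGFX940Insts()` branches in the diff below do.

  // Hypothetical stand-ins for the real CPol helpers.
  static bool setGLC() { return true; }  // pre-GFX940 targets use the GLC bit
  static bool setSC1() { return true; }  // GFX940 uses the SC1 bit instead

  // Before: one subclass per generation, each overriding the same hook.
  struct Gfx6CC {
    virtual ~Gfx6CC() = default;
    virtual bool enableLoadCacheBypass() const { return setGLC(); }
  };
  struct Gfx940CC : Gfx6CC {
    bool enableLoadCacheBypass() const override { return setSC1(); }
  };

  // After: a single class that branches on the subtarget feature at runtime.
  struct MergedCC {
    bool HasGFX940Insts = false; // stands in for ST.hasGFX940Insts()
    bool enableLoadCacheBypass() const {
      return HasGFX940Insts ? setSC1() : setGLC();
    }
  };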
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 1084 +++++-------------
1 file changed, 294 insertions(+), 790 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0dac6d2d7ee4b..965631b3c09e4 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -302,16 +302,17 @@ class SICacheControl {
SICacheControl(const GCNSubtarget &ST);
- /// Sets named bit \p BitName to "true" if present in instruction \p MI.
+ /// Sets the CPol \p Bits on instruction \p MI if the CPol operand is present.
/// \returns Returns true if \p MI is modified, false otherwise.
- bool enableNamedBit(const MachineBasicBlock::iterator MI,
- AMDGPU::CPol::CPol Bit) const;
+ bool enableCPolBits(const MachineBasicBlock::iterator MI,
+ unsigned Bits) const;
/// Check if any atomic operation on AS can affect memory accessible via the
/// global address space.
bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
public:
+ using CPol = AMDGPU::CPol::CPol;
/// Create a cache control for the subtarget \p ST.
static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
@@ -401,21 +402,9 @@ class SICacheControl {
virtual ~SICacheControl() = default;
};
+/// Generates code sequences for the memory model of all GFX targets below
+/// GFX10.
class SIGfx6CacheControl : public SICacheControl {
-protected:
-
- /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::GLC);
- }
-
- /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::SLC);
- }
-
public:
SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
@@ -454,114 +443,9 @@ class SIGfx6CacheControl : public SICacheControl {
Position Pos) const override;
};
-class SIGfx7CacheControl : public SIGfx6CacheControl {
-public:
-
- SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
-
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
-
-};
-
-class SIGfx90ACacheControl : public SIGfx7CacheControl {
-public:
-
- SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
-
- bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal,
- bool IsLastUse) const override;
-
- bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order, bool AtomicsOnly) const override;
-
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
-
- bool insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const override;
-};
-
-class SIGfx940CacheControl : public SIGfx90ACacheControl {
-protected:
-
- /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::SC0);
- }
-
- /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::SC1);
- }
-
- /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::NT);
- }
-
-public:
- SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
-
- bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal,
- bool IsLastUse) const override;
-
- bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, Position Pos) const override;
-
- bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
- Position Pos) const override;
-};
-
-class SIGfx10CacheControl : public SIGfx7CacheControl {
-protected:
-
- /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::DLC);
- }
-
+class SIGfx10CacheControl : public SIGfx6CacheControl {
public:
-
- SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
+ SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -601,6 +485,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
// \returns Returns true if \p MI is modified, false otherwise.
bool setTH(const MachineBasicBlock::iterator MI,
AMDGPU::CPol::CPol Value) const;
+
// Sets Scope policy to \p Value if CPol operand is present in instruction \p
// MI. \returns Returns true if \p MI is modified, false otherwise.
bool setScope(const MachineBasicBlock::iterator MI,
@@ -1006,13 +891,13 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}
-bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
- AMDGPU::CPol::CPol Bit) const {
+bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI,
+ unsigned Bits) const {
MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
if (!CPol)
return false;
- CPol->setImm(CPol->getImm() | Bit);
+ CPol->setImm(CPol->getImm() | Bits);
return true;
}
@@ -1028,14 +913,8 @@ bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
- if (ST.hasGFX940Insts())
- return std::make_unique<SIGfx940CacheControl>(ST);
- if (ST.hasGFX90AInsts())
- return std::make_unique<SIGfx90ACacheControl>(ST);
- if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
- return std::make_unique<SIGfx7CacheControl>(ST);
+ return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX11)
return std::make_unique<SIGfx10CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX12)
@@ -1048,33 +927,61 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
+ if (!canAffectGlobalAddrSpace(AddrSpace)) {
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+ return false;
+ }
+
+ bool Changed = false;
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ if (ST.hasGFX940Insts()) {
+ // Set SC bits to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+ break;
+ }
+ [[fallthrough]];
+ case SIAtomicScope::AGENT:
+ if (ST.hasGFX940Insts()) {
+ // Set SC bits to indicate agent scope.
+ Changed |= enableCPolBits(MI, CPol::SC1);
+ } else {
// Set L1 cache policy to MISS_EVICT.
// Note: there is no L2 cache bypass policy at the ISA level.
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ }
+ break;
+ case SIAtomicScope::WORKGROUP:
+ if (ST.hasGFX940Insts()) {
+ // In threadgroup split mode the waves of a work-group can be executing
+ // on different CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed. Setting
+ // SC bits to indicate work-group scope will do this automatically.
+ Changed |= enableCPolBits(MI, CPol::SC0);
+ } else if (ST.hasGFX90AInsts()) {
+ // In threadgroup split mode the waves of a work-group can be executing
+ // on different CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed.
+ if (ST.isTgSplitEnabled())
+ Changed |= enableCPolBits(MI, CPol::GLC);
}
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
}
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
return Changed;
}
@@ -1085,8 +992,39 @@ bool SIGfx6CacheControl::enableStoreCacheBypass(
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
- /// The L1 cache is write through so does not need to be bypassed. There is no
- /// bypass control for the L2 cache at the isa level.
+ /// For targets other than GFX940, the L1 cache is write-through, so it does
+ /// not need to be bypassed. There is no bypass control for the L2 cache at
+ /// the ISA level.
+
+ if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC bits to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+ break;
+ case SIAtomicScope::AGENT:
+ // Set SC bits to indicate agent scope.
+ Changed |= enableCPolBits(MI, CPol::SC1);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // Set SC bits to indicate workgroup scope.
+ Changed |= enableCPolBits(MI, CPol::SC0);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Leave SC bits unset to indicate wavefront scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+ }
return Changed;
}
@@ -1098,10 +1036,31 @@ bool SIGfx6CacheControl::enableRMWCacheBypass(
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
- /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
- /// bypassed, and the GLC bit is instead used to indicate if they are
- /// return or no-return.
- /// Note: there is no L2 cache coherent bypass control at the ISA level.
+ /// For targets other than GFX940, do not set GLC for RMW atomic operations as
+ /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to
+ /// indicate if they are return or no-return. Note: there is no L2 cache
+ /// coherent bypass control at the ISA level.
+ /// For GFX90A+, RMW atomics implicitly bypass the L1 cache.
+
+ if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC1 bit to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC1);
+ break;
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // RMW atomic operations implicitly bypass the L1 cache and only use SC1
+ // to indicate system or agent scope. The SC0 bit is used to indicate if
+ // they are return or no-return. Leave SC1 bit unset to indicate agent
+ // scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
return Changed;
}
@@ -1123,11 +1082,15 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
bool Changed = false;
if (IsVolatile) {
- // Set L1 cache policy to be MISS_EVICT for load instructions
- // and MISS_LRU for store instructions.
- // Note: there is no L2 cache bypass policy at the ISA level.
- if (Op == SIMemOp::LOAD)
- Changed |= enableGLCBit(MI);
+ if (ST.hasGFX940Insts()) {
+ // Set SC bits to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+ } else if (Op == SIMemOp::LOAD) {
+ // Set L1 cache policy to be MISS_EVICT for load instructions
+ // and MISS_LRU for store instructions.
+ // Note: there is no L2 cache bypass policy at the ISA level.
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ }
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
@@ -1142,10 +1105,13 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
}
if (IsNonTemporal) {
- // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
- // for both loads and stores, and the L2 cache policy to STREAM.
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ if (ST.hasGFX940Insts()) {
+ Changed |= enableCPolBits(MI, CPol::NT);
+ } else {
+ // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
+ // for both loads and stores, and the L2 cache policy to STREAM.
+ Changed |= enableCPolBits(MI, CPol::SLC | CPol::GLC);
+ }
return Changed;
}
@@ -1166,6 +1132,26 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
+ // GFX90A+
+ if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) {
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to wait for global or GDS memory operations
+ // to complete to ensure they are visible to waves in the other CUs.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are on
+ // the same CU, so no need to wait for global memory as all waves in the
+ // work-group access the same L1, nor wait for GDS as accesses are ordered
+ // on a CU.
+ if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
+ SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
+ (Scope == SIAtomicScope::WORKGROUP)) {
+ // Same as pre-GFX90A at AGENT scope.
+ Scope = SIAtomicScope::AGENT;
+ }
+ // In threadgroup split mode LDS cannot be allocated so no need to wait for
+ // LDS memory operations.
+ AddrSpace &= ~SIAtomicAddrSpace::LDS;
+ }
+
bool VMCnt = false;
bool LGKMCnt = false;
@@ -1260,6 +1246,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
return Changed;
}
+static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return false;
+ return !(ST.isAmdPalOS() || ST.isMesa3DOS());
+}
+
bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
@@ -1275,17 +1267,92 @@ bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
+ const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST)
+ ? AMDGPU::BUFFER_WBINVL1_VOL
+ : AMDGPU::BUFFER_WBINVL1;
+
if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
+ if (ST.hasGFX940Insts()) {
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
+ // and CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
+ // remove any cache lines of earlier writes by the same wave and ensures
+ // later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ }
+
+ if (ST.hasGFX90AInsts()) {
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
+ // and CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed
+ // to remove any cache lines of earlier writes by the same wave and
+ // ensures later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ }
+ [[fallthrough]];
case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
+ if (ST.hasGFX940Insts()) {
+ // Ensures that following loads will not see stale remote data or local
+ // MTYPE NC global data. Local MTYPE RW and CC memory will never be
+ // stale due to the memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding buffer
+ // invalidate. The invalidate is guaranteed to remove any cache lines of
+ // earlier writes and ensures later reads will refetch the cache lines.
+ } else
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
Changed = true;
break;
case SIAtomicScope::WORKGROUP:
+ if (ST.isTgSplitEnabled()) {
+ if (ST.hasGFX940Insts()) {
+ // In threadgroup split mode the waves of a work-group can be
+ // executing on different CUs. Therefore need to invalidate the L1
+ // which is per CU. Otherwise in non-threadgroup split mode all waves
+ // of a work-group are on the same CU, and so the L1 does not need to
+ // be invalidated.
+
+ // Ensures L1 is invalidated if in threadgroup split mode. In
+ // non-threadgroup split mode it is a NOP, but there is no point
+ // generating it in that case if we know we are not in that mode.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate work-group scope.
+ .addImm(AMDGPU::CPol::SC0);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding
+ // buffer invalidate. The invalidate is guaranteed to remove any cache
+ // lines of earlier writes and ensures later reads will refetch the
+ // cache lines.
+ Changed = true;
+ } else if (ST.hasGFX90AInsts()) {
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
+ Changed = true;
+ }
+ }
+ break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
- // No cache to invalidate.
+ // For GFX940, we could generate "BUFFER_INV", but it would do nothing
+ // as there are no caches to invalidate at these scopes. All other
+ // targets also have no cache to invalidate.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
@@ -1310,618 +1377,57 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
- return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
- /*AtomicsOnly=*/false);
-}
-
-bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
- return false;
-
bool Changed = false;
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ if (ST.hasGFX90AInsts()) {
+ MachineBasicBlock &MBB = *MI->getParent();
+ const DebugLoc &DL = MI->getDebugLoc();
- const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
- ? AMDGPU::BUFFER_WBINVL1
- : AMDGPU::BUFFER_WBINVL1_VOL;
-
- if (Pos == Position::AFTER)
- ++MI;
+ if (Pos == Position::AFTER)
+ ++MI;
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to invalidate.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
+ // to initiate writeback of any dirty cache lines of earlier writes by
+ // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
+ // writeback has completed.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ if (ST.hasGFX940Insts()) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+
+ // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+ // SIAtomicScope::AGENT, the following insertWait will generate the
+ // required "S_WAITCNT vmcnt(0)".
+ Changed = true;
+ }
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
+ // would write back, and it would require an otherwise unnecessary
+ // "S_WAITCNT vmcnt(0)".
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
}
- }
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- if (Pos == Position::AFTER)
- --MI;
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- // Set the L1 cache policy to MISS_LRU.
- // Note: there is no L2 cache bypass policy at the ISA level.
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to bypass the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be bypassed.
- if (ST.isTgSplitEnabled())
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::enableRMWCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- /// Do not set glc for RMW atomic operations as they implicitly bypass
- /// the L1 cache, and the glc bit is instead used to indicate if they are
- /// return or no-return.
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
- MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
- // Only handle load and store, not atomic read-modify-write insructions. The
- // latter use glc to indicate if the atomic returns a result and so must not
- // be used for cache control.
- assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
-
- // Only update load and store, not LLVM IR atomic read-modify-write
- // instructions. The latter are always marked as volatile so cannot sensibly
- // handle it as do not want to pessimize all atomics. Also they do not support
- // the nontemporal attribute.
- assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
- bool Changed = false;
-
- if (IsVolatile) {
- // Set L1 cache policy to be MISS_EVICT for load instructions
- // and MISS_LRU for store instructions.
- // Note: there is no L2 cache bypass policy at the ISA level.
- if (Op == SIMemOp::LOAD)
- Changed |= enableGLCBit(MI);
-
- // Ensure operation has completed at system scope to cause all volatile
- // operations to be visible outside the program in a global order. Do not
- // request cross address space as only the global address space can be
- // observable outside the program, so no need to cause a waitcnt for LDS
- // address space operations.
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered,
- /*AtomicsOnly=*/false);
-
- return Changed;
- }
-
- if (IsNonTemporal) {
- // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
- // for both loads and stores, and the L2 cache policy to STREAM.
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
- return Changed;
- }
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsCrossAddrSpaceOrdering,
- Position Pos, AtomicOrdering Order,
- bool AtomicsOnly) const {
- if (ST.isTgSplitEnabled()) {
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to wait for global or GDS memory operations
- // to complete to ensure they are visible to waves in the other CUs.
- // Otherwise in non-threadgroup split mode all waves of a work-group are on
- // the same CU, so no need to wait for global memory as all waves in the
- // work-group access the same the L1, nor wait for GDS as access are ordered
- // on a CU.
- if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
- SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
- (Scope == SIAtomicScope::WORKGROUP)) {
- // Same as GFX7 using agent scope.
- Scope = SIAtomicScope::AGENT;
- }
- // In threadgroup split mode LDS cannot be allocated so no need to wait for
- // LDS memory operations.
- AddrSpace &= ~SIAtomicAddrSpace::LDS;
- }
- return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
- IsCrossAddrSpaceOrdering, Pos, Order,
- AtomicsOnly);
-}
-
-bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
- return false;
-
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Ensures that following loads will not see stale remote VMEM data or
- // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
- // CC will never be stale due to the local memory probes.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
- // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
- // remove any cache lines of earlier writes by the same wave and ensures
- // later reads by the same wave will refetch the cache lines.
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- // Same as GFX7.
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to invalidate the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be invalidated.
- if (ST.isTgSplitEnabled()) {
- // Same as GFX7 using agent scope.
- Scope = SIAtomicScope::AGENT;
- }
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Same as GFX7.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- if (Pos == Position::AFTER)
- --MI;
-
- Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- const DebugLoc &DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
- // to initiate writeback of any dirty cache lines of earlier writes by the
- // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
- // writeback has completed.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate system scope.
- .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
- // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
- // vmcnt(0)" needed by the "BUFFER_WBL2".
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Same as GFX7.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- if (Pos == Position::AFTER)
- --MI;
-
- Changed |=
- SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
- IsCrossAddrSpaceOrdering, Pos);
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Set SC bits to indicate system scope.
- Changed |= enableSC0Bit(MI);
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::AGENT:
- // Set SC bits to indicate agent scope.
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to bypass the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be bypassed. Setting SC
- // bits to indicate work-group scope will do this automatically.
- Changed |= enableSC0Bit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Leave SC bits unset to indicate wavefront scope.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableStoreCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
- assert(!MI->mayLoad() && MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Set SC bits to indicate system scope.
- Changed |= enableSC0Bit(MI);
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::AGENT:
- // Set SC bits to indicate agent scope.
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // Set SC bits to indicate workgroup scope.
- Changed |= enableSC0Bit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Leave SC bits unset to indicate wavefront scope.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableRMWCacheBypass(
- const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Set SC1 bit to indicate system scope.
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::AGENT:
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // RMW atomic operations implicitly bypass the L1 cache and only use SC1
- // to indicate system or agent scope. The SC0 bit is used to indicate if
- // they are return or no-return. Leave SC1 bit unset to indicate agent
- // scope.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
- MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
- // Only handle load and store, not atomic read-modify-write insructions. The
- // latter use glc to indicate if the atomic returns a result and so must not
- // be used for cache control.
- assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
-
- // Only update load and store, not LLVM IR atomic read-modify-write
- // instructions. The latter are always marked as volatile so cannot sensibly
- // handle it as do not want to pessimize all atomics. Also they do not support
- // the nontemporal attribute.
- assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
- bool Changed = false;
-
- if (IsVolatile) {
- // Set SC bits to indicate system scope.
- Changed |= enableSC0Bit(MI);
- Changed |= enableSC1Bit(MI);
-
- // Ensure operation has completed at system scope to cause all volatile
- // operations to be visible outside the program in a global order. Do not
- // request cross address space as only the global address space can be
- // observable outside the program, so no need to cause a waitcnt for LDS
- // address space operations.
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered,
- /*AtomicsOnly=*/false);
-
- return Changed;
- }
-
- if (IsNonTemporal) {
- Changed |= enableNTBit(MI);
- return Changed;
- }
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
- return false;
-
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Ensures that following loads will not see stale remote VMEM data or
- // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
- // CC will never be stale due to the local memory probes.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
- // Set SC bits to indicate system scope.
- .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
- // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
- // remove any cache lines of earlier writes by the same wave and ensures
- // later reads by the same wave will refetch the cache lines.
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- // Ensures that following loads will not see stale remote date or local
- // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
- // due to the memory probes.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
- // Set SC bits to indicate agent scope.
- .addImm(AMDGPU::CPol::SC1);
- // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
- // does not reorder memory operations with respect to preceeding buffer
- // invalidate. The invalidate is guaranteed to remove any cache lines of
- // earlier writes and ensures later writes will refetch the cache lines.
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to invalidate the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be invalidated.
- if (ST.isTgSplitEnabled()) {
- // Ensures L1 is invalidated if in threadgroup split mode. In
- // non-threadgroup split mode it is a NOP, but no point generating it in
- // that case if know not in that mode.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
- // Set SC bits to indicate work-group scope.
- .addImm(AMDGPU::CPol::SC0);
- // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
- // does not reorder memory operations with respect to preceeding buffer
- // invalidate. The invalidate is guaranteed to remove any cache lines of
- // earlier writes and ensures later writes will refetch the cache lines.
- Changed = true;
- }
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Could generate "BUFFER_INV" but it would do nothing as there are no
- // caches to invalidate.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- if (Pos == Position::AFTER)
- --MI;
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
- // to initiate writeback of any dirty cache lines of earlier writes by the
- // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
- // writeback has completed.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate system scope.
- .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
- // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
- // SIAtomicScope::SYSTEM, the following insertWait will generate the
- // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate agent scope.
- .addImm(AMDGPU::CPol::SC1);
-
- // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
- // SIAtomicScope::AGENT, the following insertWait will generate the
- // required "S_WAITCNT vmcnt(0)".
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Do not generate "BUFFER_WBL2" as there are no caches it would
- // writeback, and would require an otherwise unnecessary
- // "S_WAITCNT vmcnt(0)".
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
+ if (Pos == Position::AFTER)
+ --MI;
}
- if (Pos == Position::AFTER)
- --MI;
-
// Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
// S_WAITCNT needed.
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
@@ -1944,8 +1450,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
case SIAtomicScope::AGENT:
// Set the L0 and L1 cache policies to MISS_EVICT.
// Note: there is no L2 cache coherent bypass control at the ISA level.
- Changed |= enableGLCBit(MI);
- Changed |= enableDLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
break;
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
@@ -1953,7 +1458,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
// CU mode all waves of a work-group are on the same CU, and so the L0
// does not need to be bypassed.
if (!ST.isCuModeEnabled())
- Changed |= enableGLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
@@ -1996,8 +1501,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// and MISS_LRU for store instructions.
// Note: there is no L2 cache coherent bypass control at the ISA level.
if (Op == SIMemOp::LOAD) {
- Changed |= enableGLCBit(MI);
- Changed |= enableDLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
}
// Ensure operation has completed at system scope to cause all volatile
@@ -2017,8 +1521,8 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// For stores setting both GLC and SLC configures L0 and L1 cache policy
// to MISS_EVICT and the L2 cache policy to STREAM.
if (Op == SIMemOp::STORE)
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ Changed |= enableCPolBits(MI, CPol::SLC);
return Changed;
}
@@ -2230,7 +1734,7 @@ bool SIGfx11CacheControl::enableLoadCacheBypass(
case SIAtomicScope::AGENT:
// Set the L0 and L1 cache policies to MISS_EVICT.
// Note: there is no L2 cache coherent bypass control at the ISA level.
- Changed |= enableGLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
break;
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
@@ -2238,7 +1742,7 @@ bool SIGfx11CacheControl::enableLoadCacheBypass(
// CU mode all waves of a work-group are on the same CU, and so the L0
// does not need to be bypassed.
if (!ST.isCuModeEnabled())
- Changed |= enableGLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
@@ -2281,10 +1785,10 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
// and MISS_LRU for store instructions.
// Note: there is no L2 cache coherent bypass control at the ISA level.
if (Op == SIMemOp::LOAD)
- Changed |= enableGLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
// Set MALL NOALLOC for load and store instructions.
- Changed |= enableDLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::DLC);
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
@@ -2303,11 +1807,11 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
// For stores setting both GLC and SLC configures L0 and L1 cache policy
// to MISS_EVICT and the L2 cache policy to STREAM.
if (Op == SIMemOp::STORE)
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ Changed |= enableCPolBits(MI, CPol::SLC);
// Set MALL NOALLOC for load and store instructions.
- Changed |= enableDLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::DLC);
return Changed;
}
From b092e9c091f7181831ac3847c21b065ed80a0919 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 17 Nov 2025 10:05:14 +0100
Subject: [PATCH 2/2] nit
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 965631b3c09e4..49aba39872138 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1249,7 +1249,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) {
if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return false;
- return !(ST.isAmdPalOS() || ST.isMesa3DOS());
+ return !ST.isAmdPalOS() && !ST.isMesa3DOS();
}
bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
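For reference, the follow-up nit is a pure De Morgan rewrite, so the helper's behavior is unchanged; a standalone sketch of the equivalence (hypothetical wrapper, not the actual function):

  // !(A || B) == !A && !B for all inputs, so the rewritten return is
  // equivalent; it just reads as "neither PAL nor Mesa".
  static bool neitherPalNorMesa(bool IsPal, bool IsMesa) {
    return !IsPal && !IsMesa; // same as !(IsPal || IsMesa)
  }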