[llvm] [AMDGPU][SIMemoryLegalizer] Combine all GFX6-9 CacheControl Classes (PR #168052)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 17 01:07:06 PST 2025
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/168052
From 3d3df4c637f3fac616dff63e23c7f0180b1828f5 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 14 Nov 2025 14:11:10 +0100
Subject: [PATCH 1/2] [AMDGPU][SIMemoryLegalizer] Combine all GFX6-9
CacheControl Classes
Merge the following classes into `SIGfx6CacheControl`:
- SIGfx7CacheControl
- SIGfx90ACacheControl
- SIGfx940CacheControl
They were all very similar, with a lot of duplicated boilerplate just to implement one or two codegen differences. GFX90A/GFX940 differ a bit more, but they are still manageable under one class because the general behavior is the same.
This removes roughly 500 lines of code and puts everything in a single place, which I think makes it much easier to maintain, at the cost of a slight increase in complexity for some functions.
There is still a lot of room for improvement, but I think this patch is already big enough as is, and I don't want to bundle too much into one review.
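For illustration, here is a minimal sketch of the pattern the patch applies (hypothetical names and simplified signatures, not the actual LLVM code): the per-generation subclass overrides turn into runtime subtarget checks inside the one remaining class, which is what the `ST.hasGFX90AInsts()`/`ST.hasGFX940Insts()` branches in the diff below do.

  // Hypothetical stand-ins for the real CPol helpers.
  static bool setGLC() { return true; }  // pre-GFX940 targets use the GLC bit
  static bool setSC1() { return true; }  // GFX940 uses the SC1 bit instead

  // Before: one subclass per generation, each overriding the same hook.
  struct Gfx6CC {
    virtual ~Gfx6CC() = default;
    virtual bool enableLoadCacheBypass() const { return setGLC(); }
  };
  struct Gfx940CC : Gfx6CC {
    bool enableLoadCacheBypass() const override { return setSC1(); }
  };

  // After: a single class that branches on the subtarget feature at runtime.
  struct MergedCC {
    bool HasGFX940Insts = false; // stands in for ST.hasGFX940Insts()
    bool enableLoadCacheBypass() const {
      return HasGFX940Insts ? setSC1() : setGLC();
    }
  };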
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 1084 +++++-------------
1 file changed, 294 insertions(+), 790 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0dac6d2d7ee4b..965631b3c09e4 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -302,16 +302,17 @@ class SICacheControl {
SICacheControl(const GCNSubtarget &ST);
- /// Sets named bit \p BitName to "true" if present in instruction \p MI.
+ /// Sets the CPol \p Bits on instruction \p MI if the CPol operand is present.
/// \returns Returns true if \p MI is modified, false otherwise.
- bool enableNamedBit(const MachineBasicBlock::iterator MI,
- AMDGPU::CPol::CPol Bit) const;
+ bool enableCPolBits(const MachineBasicBlock::iterator MI,
+ unsigned Bits) const;
/// Check if any atomic operation on AS can affect memory accessible via the
/// global address space.
bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
public:
+ using CPol = AMDGPU::CPol::CPol;
/// Create a cache control for the subtarget \p ST.
static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
@@ -401,21 +402,9 @@ class SICacheControl {
virtual ~SICacheControl() = default;
};
+/// Generates code sequences for the memory model of all GFX targets below
+/// GFX10.
class SIGfx6CacheControl : public SICacheControl {
-protected:
-
- /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::GLC);
- }
-
- /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::SLC);
- }
-
public:
SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
@@ -454,114 +443,9 @@ class SIGfx6CacheControl : public SICacheControl {
Position Pos) const override;
};
-class SIGfx7CacheControl : public SIGfx6CacheControl {
-public:
-
- SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
-
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
-
-};
-
-class SIGfx90ACacheControl : public SIGfx7CacheControl {
-public:
-
- SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
-
- bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal,
- bool IsLastUse) const override;
-
- bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order, bool AtomicsOnly) const override;
-
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
-
- bool insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const override;
-};
-
-class SIGfx940CacheControl : public SIGfx90ACacheControl {
-protected:
-
- /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::SC0);
- }
-
- /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::SC1);
- }
-
- /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::NT);
- }
-
-public:
- SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
-
- bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal,
- bool IsLastUse) const override;
-
- bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, Position Pos) const override;
-
- bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
- Position Pos) const override;
-};
-
-class SIGfx10CacheControl : public SIGfx7CacheControl {
-protected:
-
- /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
- /// is modified, false otherwise.
- bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit(MI, AMDGPU::CPol::DLC);
- }
-
+class SIGfx10CacheControl : public SIGfx6CacheControl {
public:
-
- SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
+ SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -601,6 +485,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
// \returns Returns true if \p MI is modified, false otherwise.
bool setTH(const MachineBasicBlock::iterator MI,
AMDGPU::CPol::CPol Value) const;
+
// Sets Scope policy to \p Value if CPol operand is present in instruction \p
// MI. \returns Returns true if \p MI is modified, false otherwise.
bool setScope(const MachineBasicBlock::iterator MI,
@@ -1006,13 +891,13 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}
-bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
- AMDGPU::CPol::CPol Bit) const {
+bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI,
+ unsigned Bits) const {
MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
if (!CPol)
return false;
- CPol->setImm(CPol->getImm() | Bit);
+ CPol->setImm(CPol->getImm() | Bits);
return true;
}
@@ -1028,14 +913,8 @@ bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
- if (ST.hasGFX940Insts())
- return std::make_unique<SIGfx940CacheControl>(ST);
- if (ST.hasGFX90AInsts())
- return std::make_unique<SIGfx90ACacheControl>(ST);
- if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
- return std::make_unique<SIGfx7CacheControl>(ST);
+ return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX11)
return std::make_unique<SIGfx10CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX12)
@@ -1048,33 +927,61 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
+ if (!canAffectGlobalAddrSpace(AddrSpace)) {
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+ return false;
+ }
+
+ bool Changed = false;
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ if (ST.hasGFX940Insts()) {
+ // Set SC bits to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+ break;
+ }
+ [[fallthrough]];
+ case SIAtomicScope::AGENT:
+ if (ST.hasGFX940Insts()) {
+ // Set SC bits to indicate agent scope.
+ Changed |= enableCPolBits(MI, CPol::SC1);
+ } else {
// Set L1 cache policy to MISS_EVICT.
// Note: there is no L2 cache bypass policy at the ISA level.
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ }
+ break;
+ case SIAtomicScope::WORKGROUP:
+ if (ST.hasGFX940Insts()) {
+ // In threadgroup split mode the waves of a work-group can be executing
+ // on different CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed. Setting
+ // SC bits to indicate work-group scope will do this automatically.
+ Changed |= enableCPolBits(MI, CPol::SC0);
+ } else if (ST.hasGFX90AInsts()) {
+ // In threadgroup split mode the waves of a work-group can be executing
+ // on different CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed.
+ if (ST.isTgSplitEnabled())
+ Changed |= enableCPolBits(MI, CPol::GLC);
}
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
}
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
return Changed;
}
@@ -1085,8 +992,39 @@ bool SIGfx6CacheControl::enableStoreCacheBypass(
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
- /// The L1 cache is write through so does not need to be bypassed. There is no
- /// bypass control for the L2 cache at the isa level.
+ /// For targets other than GFX940, the L1 cache is write-through, so it does
+ /// not need to be bypassed. There is no bypass control for the L2 cache at
+ /// the ISA level.
+
+ if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC bits to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+ break;
+ case SIAtomicScope::AGENT:
+ // Set SC bits to indicate agent scope.
+ Changed |= enableCPolBits(MI, CPol::SC1);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // Set SC bits to indicate workgroup scope.
+ Changed |= enableCPolBits(MI, CPol::SC0);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Leave SC bits unset to indicate wavefront scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+ }
return Changed;
}
@@ -1098,10 +1036,31 @@ bool SIGfx6CacheControl::enableRMWCacheBypass(
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
- /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
- /// bypassed, and the GLC bit is instead used to indicate if they are
- /// return or no-return.
- /// Note: there is no L2 cache coherent bypass control at the ISA level.
+ /// For targets other than GFX940, do not set GLC for RMW atomic operations as
+ /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to
+ /// indicate if they are return or no-return. Note: there is no L2 cache
+ /// coherent bypass control at the ISA level.
+ /// For GFX90A+, RMW atomics implicitly bypass the L1 cache.
+
+ if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC1 bit to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC1);
+ break;
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // RMW atomic operations implicitly bypass the L1 cache and only use SC1
+ // to indicate system or agent scope. The SC0 bit is used to indicate if
+ // they are return or no-return. Leave SC1 bit unset to indicate agent
+ // scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
return Changed;
}
@@ -1123,11 +1082,15 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
bool Changed = false;
if (IsVolatile) {
- // Set L1 cache policy to be MISS_EVICT for load instructions
- // and MISS_LRU for store instructions.
- // Note: there is no L2 cache bypass policy at the ISA level.
- if (Op == SIMemOp::LOAD)
- Changed |= enableGLCBit(MI);
+ if (ST.hasGFX940Insts()) {
+ // Set SC bits to indicate system scope.
+ Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+ } else if (Op == SIMemOp::LOAD) {
+ // Set L1 cache policy to be MISS_EVICT for load instructions
+ // and MISS_LRU for store instructions.
+ // Note: there is no L2 cache bypass policy at the ISA level.
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ }
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
@@ -1142,10 +1105,13 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
}
if (IsNonTemporal) {
- // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
- // for both loads and stores, and the L2 cache policy to STREAM.
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ if (ST.hasGFX940Insts()) {
+ Changed |= enableCPolBits(MI, CPol::NT);
+ } else {
+ // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
+ // for both loads and stores, and the L2 cache policy to STREAM.
+ Changed |= enableCPolBits(MI, CPol::SLC | CPol::GLC);
+ }
return Changed;
}
@@ -1166,6 +1132,26 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
+ // GFX90A+
+ if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) {
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to wait for global or GDS memory operations
+ // to complete to ensure they are visible to waves in the other CUs.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are on
+ // the same CU, so no need to wait for global memory as all waves in the
+ // work-group access the same L1, nor wait for GDS as accesses are ordered
+ // on a CU.
+ if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
+ SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
+ (Scope == SIAtomicScope::WORKGROUP)) {
+ // Same as pre-GFX90A at AGENT scope.
+ Scope = SIAtomicScope::AGENT;
+ }
+ // In threadgroup split mode LDS cannot be allocated so no need to wait for
+ // LDS memory operations.
+ AddrSpace &= ~SIAtomicAddrSpace::LDS;
+ }
+
bool VMCnt = false;
bool LGKMCnt = false;
@@ -1260,6 +1246,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
return Changed;
}
+static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return false;
+ return !(ST.isAmdPalOS() || ST.isMesa3DOS());
+}
+
bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
@@ -1275,17 +1267,92 @@ bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
+ const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST)
+ ? AMDGPU::BUFFER_WBINVL1_VOL
+ : AMDGPU::BUFFER_WBINVL1;
+
if (canAffectGlobalAddrSpace(AddrSpace)) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
+ if (ST.hasGFX940Insts()) {
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
+ // and CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
+ // remove any cache lines of earlier writes by the same wave and ensures
+ // later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ }
+
+ if (ST.hasGFX90AInsts()) {
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
+ // and CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed
+ // to remove any cache lines of earlier writes by the same wave and
+ // ensures later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ }
+ [[fallthrough]];
case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
+ if (ST.hasGFX940Insts()) {
+ // Ensures that following loads will not see stale remote data or local
+ // MTYPE NC global data. Local MTYPE RW and CC memory will never be
+ // stale due to the memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding buffer
+ // invalidate. The invalidate is guaranteed to remove any cache lines of
+ // earlier writes and ensures later reads will refetch the cache lines.
+ } else
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
Changed = true;
break;
case SIAtomicScope::WORKGROUP:
+ if (ST.isTgSplitEnabled()) {
+ if (ST.hasGFX940Insts()) {
+ // In threadgroup split mode the waves of a work-group can be
+ // executing on different CUs. Therefore need to invalidate the L1
+ // which is per CU. Otherwise in non-threadgroup split mode all waves
+ // of a work-group are on the same CU, and so the L1 does not need to
+ // be invalidated.
+
+ // Ensures L1 is invalidated if in threadgroup split mode. In
+ // non-threadgroup split mode it is a NOP, but there is no point
+ // generating it in that case if we know we are not in that mode.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate work-group scope.
+ .addImm(AMDGPU::CPol::SC0);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to a preceding
+ // buffer invalidate. The invalidate is guaranteed to remove any cache
+ // lines of earlier writes and ensures later reads will refetch the
+ // cache lines.
+ Changed = true;
+ } else if (ST.hasGFX90AInsts()) {
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
+ Changed = true;
+ }
+ }
+ break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
- // No cache to invalidate.
+ // For GFX940, we could generate "BUFFER_INV", but it would do nothing
+ // as there are no caches to invalidate at these scopes. All other
+ // targets also have no cache to invalidate.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
@@ -1310,618 +1377,57 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
- return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
- /*AtomicsOnly=*/false);
-}
-
-bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
- return false;
-
bool Changed = false;
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ if (ST.hasGFX90AInsts()) {
+ MachineBasicBlock &MBB = *MI->getParent();
+ const DebugLoc &DL = MI->getDebugLoc();
- const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
- ? AMDGPU::BUFFER_WBINVL1
- : AMDGPU::BUFFER_WBINVL1_VOL;
-
- if (Pos == Position::AFTER)
- ++MI;
+ if (Pos == Position::AFTER)
+ ++MI;
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to invalidate.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
+ if (canAffectGlobalAddrSpace(AddrSpace)) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
+ // to initiate writeback of any dirty cache lines of earlier writes by
+ // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
+ // writeback has completed.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ if (ST.hasGFX940Insts()) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+
+ // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+ // SIAtomicScope::AGENT, the following insertWait will generate the
+ // required "S_WAITCNT vmcnt(0)".
+ Changed = true;
+ }
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
+ // would write back, and it would require an otherwise unnecessary
+ // "S_WAITCNT vmcnt(0)".
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
}
- }
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- if (Pos == Position::AFTER)
- --MI;
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- // Set the L1 cache policy to MISS_LRU.
- // Note: there is no L2 cache bypass policy at the ISA level.
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to bypass the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be bypassed.
- if (ST.isTgSplitEnabled())
- Changed |= enableGLCBit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::enableRMWCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- /// Do not set glc for RMW atomic operations as they implicitly bypass
- /// the L1 cache, and the glc bit is instead used to indicate if they are
- /// return or no-return.
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
- MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
- // Only handle load and store, not atomic read-modify-write insructions. The
- // latter use glc to indicate if the atomic returns a result and so must not
- // be used for cache control.
- assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
-
- // Only update load and store, not LLVM IR atomic read-modify-write
- // instructions. The latter are always marked as volatile so cannot sensibly
- // handle it as do not want to pessimize all atomics. Also they do not support
- // the nontemporal attribute.
- assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
- bool Changed = false;
-
- if (IsVolatile) {
- // Set L1 cache policy to be MISS_EVICT for load instructions
- // and MISS_LRU for store instructions.
- // Note: there is no L2 cache bypass policy at the ISA level.
- if (Op == SIMemOp::LOAD)
- Changed |= enableGLCBit(MI);
-
- // Ensure operation has completed at system scope to cause all volatile
- // operations to be visible outside the program in a global order. Do not
- // request cross address space as only the global address space can be
- // observable outside the program, so no need to cause a waitcnt for LDS
- // address space operations.
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered,
- /*AtomicsOnly=*/false);
-
- return Changed;
- }
-
- if (IsNonTemporal) {
- // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
- // for both loads and stores, and the L2 cache policy to STREAM.
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
- return Changed;
- }
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsCrossAddrSpaceOrdering,
- Position Pos, AtomicOrdering Order,
- bool AtomicsOnly) const {
- if (ST.isTgSplitEnabled()) {
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to wait for global or GDS memory operations
- // to complete to ensure they are visible to waves in the other CUs.
- // Otherwise in non-threadgroup split mode all waves of a work-group are on
- // the same CU, so no need to wait for global memory as all waves in the
- // work-group access the same the L1, nor wait for GDS as access are ordered
- // on a CU.
- if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
- SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
- (Scope == SIAtomicScope::WORKGROUP)) {
- // Same as GFX7 using agent scope.
- Scope = SIAtomicScope::AGENT;
- }
- // In threadgroup split mode LDS cannot be allocated so no need to wait for
- // LDS memory operations.
- AddrSpace &= ~SIAtomicAddrSpace::LDS;
- }
- return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
- IsCrossAddrSpaceOrdering, Pos, Order,
- AtomicsOnly);
-}
-
-bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
- return false;
-
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Ensures that following loads will not see stale remote VMEM data or
- // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
- // CC will never be stale due to the local memory probes.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
- // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
- // remove any cache lines of earlier writes by the same wave and ensures
- // later reads by the same wave will refetch the cache lines.
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- // Same as GFX7.
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to invalidate the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be invalidated.
- if (ST.isTgSplitEnabled()) {
- // Same as GFX7 using agent scope.
- Scope = SIAtomicScope::AGENT;
- }
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Same as GFX7.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- if (Pos == Position::AFTER)
- --MI;
-
- Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
-
- return Changed;
-}
-
-bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- const DebugLoc &DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
- // to initiate writeback of any dirty cache lines of earlier writes by the
- // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
- // writeback has completed.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate system scope.
- .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
- // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
- // vmcnt(0)" needed by the "BUFFER_WBL2".
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Same as GFX7.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- if (Pos == Position::AFTER)
- --MI;
-
- Changed |=
- SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
- IsCrossAddrSpaceOrdering, Pos);
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Set SC bits to indicate system scope.
- Changed |= enableSC0Bit(MI);
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::AGENT:
- // Set SC bits to indicate agent scope.
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to bypass the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be bypassed. Setting SC
- // bits to indicate work-group scope will do this automatically.
- Changed |= enableSC0Bit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Leave SC bits unset to indicate wavefront scope.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableStoreCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
- assert(!MI->mayLoad() && MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Set SC bits to indicate system scope.
- Changed |= enableSC0Bit(MI);
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::AGENT:
- // Set SC bits to indicate agent scope.
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::WORKGROUP:
- // Set SC bits to indicate workgroup scope.
- Changed |= enableSC0Bit(MI);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Leave SC bits unset to indicate wavefront scope.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableRMWCacheBypass(
- const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Set SC1 bit to indicate system scope.
- Changed |= enableSC1Bit(MI);
- break;
- case SIAtomicScope::AGENT:
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // RMW atomic operations implicitly bypass the L1 cache and only use SC1
- // to indicate system or agent scope. The SC0 bit is used to indicate if
- // they are return or no-return. Leave SC1 bit unset to indicate agent
- // scope.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
- MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
- // Only handle load and store, not atomic read-modify-write insructions. The
- // latter use glc to indicate if the atomic returns a result and so must not
- // be used for cache control.
- assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
-
- // Only update load and store, not LLVM IR atomic read-modify-write
- // instructions. The latter are always marked as volatile so cannot sensibly
- // handle it as do not want to pessimize all atomics. Also they do not support
- // the nontemporal attribute.
- assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
- bool Changed = false;
-
- if (IsVolatile) {
- // Set SC bits to indicate system scope.
- Changed |= enableSC0Bit(MI);
- Changed |= enableSC1Bit(MI);
-
- // Ensure operation has completed at system scope to cause all volatile
- // operations to be visible outside the program in a global order. Do not
- // request cross address space as only the global address space can be
- // observable outside the program, so no need to cause a waitcnt for LDS
- // address space operations.
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered,
- /*AtomicsOnly=*/false);
-
- return Changed;
- }
-
- if (IsNonTemporal) {
- Changed |= enableNTBit(MI);
- return Changed;
- }
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
- return false;
-
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Ensures that following loads will not see stale remote VMEM data or
- // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
- // CC will never be stale due to the local memory probes.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
- // Set SC bits to indicate system scope.
- .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
- // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
- // remove any cache lines of earlier writes by the same wave and ensures
- // later reads by the same wave will refetch the cache lines.
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- // Ensures that following loads will not see stale remote date or local
- // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
- // due to the memory probes.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
- // Set SC bits to indicate agent scope.
- .addImm(AMDGPU::CPol::SC1);
- // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
- // does not reorder memory operations with respect to preceeding buffer
- // invalidate. The invalidate is guaranteed to remove any cache lines of
- // earlier writes and ensures later writes will refetch the cache lines.
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- // In threadgroup split mode the waves of a work-group can be executing on
- // different CUs. Therefore need to invalidate the L1 which is per CU.
- // Otherwise in non-threadgroup split mode all waves of a work-group are
- // on the same CU, and so the L1 does not need to be invalidated.
- if (ST.isTgSplitEnabled()) {
- // Ensures L1 is invalidated if in threadgroup split mode. In
- // non-threadgroup split mode it is a NOP, but no point generating it in
- // that case if know not in that mode.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
- // Set SC bits to indicate work-group scope.
- .addImm(AMDGPU::CPol::SC0);
- // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
- // does not reorder memory operations with respect to preceeding buffer
- // invalidate. The invalidate is guaranteed to remove any cache lines of
- // earlier writes and ensures later writes will refetch the cache lines.
- Changed = true;
- }
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Could generate "BUFFER_INV" but it would do nothing as there are no
- // caches to invalidate.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- if (Pos == Position::AFTER)
- --MI;
-
- return Changed;
-}
-
-bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
- bool Changed = false;
-
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
- // hardware does not reorder memory operations by the same wave with
- // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
- // to initiate writeback of any dirty cache lines of earlier writes by the
- // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
- // writeback has completed.
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate system scope.
- .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
- // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
- // SIAtomicScope::SYSTEM, the following insertWait will generate the
- // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
- Changed = true;
- break;
- case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
- // Set SC bits to indicate agent scope.
- .addImm(AMDGPU::CPol::SC1);
-
- // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
- // SIAtomicScope::AGENT, the following insertWait will generate the
- // required "S_WAITCNT vmcnt(0)".
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // Do not generate "BUFFER_WBL2" as there are no caches it would
- // writeback, and would require an otherwise unnecessary
- // "S_WAITCNT vmcnt(0)".
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
+ if (Pos == Position::AFTER)
+ --MI;
}
- if (Pos == Position::AFTER)
- --MI;
-
// Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
// S_WAITCNT needed.
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
@@ -1944,8 +1450,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
case SIAtomicScope::AGENT:
// Set the L0 and L1 cache policies to MISS_EVICT.
// Note: there is no L2 cache coherent bypass control at the ISA level.
- Changed |= enableGLCBit(MI);
- Changed |= enableDLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
break;
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
@@ -1953,7 +1458,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
// CU mode all waves of a work-group are on the same CU, and so the L0
// does not need to be bypassed.
if (!ST.isCuModeEnabled())
- Changed |= enableGLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
@@ -1996,8 +1501,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// and MISS_LRU for store instructions.
// Note: there is no L2 cache coherent bypass control at the ISA level.
if (Op == SIMemOp::LOAD) {
- Changed |= enableGLCBit(MI);
- Changed |= enableDLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
}
// Ensure operation has completed at system scope to cause all volatile
@@ -2017,8 +1521,8 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// For stores setting both GLC and SLC configures L0 and L1 cache policy
// to MISS_EVICT and the L2 cache policy to STREAM.
if (Op == SIMemOp::STORE)
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ Changed |= enableCPolBits(MI, CPol::SLC);
return Changed;
}
@@ -2230,7 +1734,7 @@ bool SIGfx11CacheControl::enableLoadCacheBypass(
case SIAtomicScope::AGENT:
// Set the L0 and L1 cache policies to MISS_EVICT.
// Note: there is no L2 cache coherent bypass control at the ISA level.
- Changed |= enableGLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
break;
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
@@ -2238,7 +1742,7 @@ bool SIGfx11CacheControl::enableLoadCacheBypass(
// CU mode all waves of a work-group are on the same CU, and so the L0
// does not need to be bypassed.
if (!ST.isCuModeEnabled())
- Changed |= enableGLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
@@ -2281,10 +1785,10 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
// and MISS_LRU for store instructions.
// Note: there is no L2 cache coherent bypass control at the ISA level.
if (Op == SIMemOp::LOAD)
- Changed |= enableGLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
// Set MALL NOALLOC for load and store instructions.
- Changed |= enableDLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::DLC);
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
@@ -2303,11 +1807,11 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
// For stores setting both GLC and SLC configures L0 and L1 cache policy
// to MISS_EVICT and the L2 cache policy to STREAM.
if (Op == SIMemOp::STORE)
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::GLC);
+ Changed |= enableCPolBits(MI, CPol::SLC);
// Set MALL NOALLOC for load and store instructions.
- Changed |= enableDLCBit(MI);
+ Changed |= enableCPolBits(MI, CPol::DLC);
return Changed;
}
From b092e9c091f7181831ac3847c21b065ed80a0919 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 17 Nov 2025 10:05:14 +0100
Subject: [PATCH 2/2] nit
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 965631b3c09e4..49aba39872138 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1249,7 +1249,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) {
if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return false;
- return !(ST.isAmdPalOS() || ST.isMesa3DOS());
+ return !ST.isAmdPalOS() && !ST.isMesa3DOS();
}
bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
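For reference, the follow-up nit is a pure De Morgan rewrite, so the helper's behavior is unchanged; a standalone sketch of the equivalence (hypothetical wrapper, not the actual function):

  // !(A || B) == !A && !B for all inputs, so the rewritten return is
  // equivalent; it just reads as "neither PAL nor Mesa".
  static bool neitherPalNorMesa(bool IsPal, bool IsMesa) {
    return !IsPal && !IsMesa; // same as !(IsPal || IsMesa)
  }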