[llvm] 7ca4473 - [AMDGPU] Add new cache flushing instructions for GFX12 (#76944)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 8 06:07:02 PST 2024


Author: Mirko BrkuĊĦanin
Date: 2024-01-08T14:06:58Z
New Revision: 7ca4473dd97328ebaa95dd3411e3c817935389de

URL: https://github.com/llvm/llvm-project/commit/7ca4473dd97328ebaa95dd3411e3c817935389de
DIFF: https://github.com/llvm/llvm-project/commit/7ca4473dd97328ebaa95dd3411e3c817935389de.diff

LOG: [AMDGPU] Add new cache flushing instructions for GFX12 (#76944)

Co-authored-by: Diana Picus <Diana-Magda.Picus at amd.com>

Added: 
    llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir

Modified: 
    llvm/lib/Target/AMDGPU/BUFInstructions.td
    llvm/lib/Target/AMDGPU/FLATInstructions.td
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
    llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
    llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
    llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
    llvm/test/MC/AMDGPU/gfx12_asm_vflat.s
    llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 43d35fa5291ca0..15a54856cb2e3a 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1222,8 +1222,10 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
 
 } // End HasD16LoadStore
 
-def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
-                                       int_amdgcn_buffer_wbinvl1>;
+let SubtargetPredicate = isNotGFX12Plus in
+def BUFFER_WBINVL1 : MUBUF_Invalidate <
+  "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1
+>;
 
 let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
 defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN<

diff  --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 615f8cd54d8f9c..345564c06af144 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -60,6 +60,7 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
   bits<1> has_sve  = 0; // Scratch VGPR Enable
   bits<1> lds = 0;
   bits<1> sve = 0;
+  bits<1> has_offset = 1;
 
   let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,
     !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace));
@@ -182,7 +183,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   let Inst{51-50} = cpol{4-3}; // scope
   let Inst{62-55} = !if(ps.has_data, vdata{7-0}, ?);
   let Inst{71-64} = !if(ps.has_vaddr, vaddr, ?);
-  let Inst{95-72} = offset;
+  let Inst{95-72} = !if(ps.has_offset, offset, ?);
 }
 
 class GlobalSaddrTable <bit is_saddr, string Name = ""> {
@@ -340,6 +341,34 @@ multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterClass regClass
     GlobalSaddrTable<1, opName>;
 }
 
+class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = null_frag> :
+  FLAT_Pseudo<opName, (outs), (ins CPol:$cpol), "$cpol", [(node)]> {
+
+  let AsmMatchConverter = "";
+
+  let hasSideEffects = 1;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let is_flat_global = 1;
+
+  let has_offset = 0;
+  let has_saddr = 0;
+  let enabled_saddr = 0;
+  let saddr_value = 0;
+  let has_vdst = 0;
+  let has_data = 0;
+  let has_vaddr = 0;
+  let has_glc = 0;
+  let has_dlc = 0;
+  let glcValue = 0;
+  let dlcValue = 0;
+  let has_sccb = 0;
+  let sccbValue = 0;
+  let has_sve = 0;
+  let lds = 0;
+  let sve = 0;
+}
+
 class FlatScratchInst <string sv_op, string mode> {
   string SVOp = sv_op;
   string Mode = mode;
@@ -928,6 +957,10 @@ defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwor
 
 let SubtargetPredicate = isGFX12Plus in {
   defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
+
+  def GLOBAL_INV    : FLAT_Global_Invalidate_Writeback<"global_inv">;
+  def GLOBAL_WB     : FLAT_Global_Invalidate_Writeback<"global_wb">;
+  def GLOBAL_WBINV  : FLAT_Global_Invalidate_Writeback<"global_wbinv">;
 } // End SubtargetPredicate = isGFX12Plus
 
 } // End is_flat_global = 1
@@ -2662,6 +2695,10 @@ defm GLOBAL_ATOMIC_MAX_NUM_F32     : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_A
 defm GLOBAL_ATOMIC_ADD_F32         : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
 defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">;
 
+defm GLOBAL_INV                    : VFLAT_Real_Base_gfx12<0x02b, "GLOBAL_INV", "global_inv">;
+defm GLOBAL_WB                     : VFLAT_Real_Base_gfx12<0x02c, "GLOBAL_WB", "global_wb">;
+defm GLOBAL_WBINV                  : VFLAT_Real_Base_gfx12<0x04f, "GLOBAL_WBINV", "global_wbinv">;
+
 // ENC_VSCRATCH.
 defm SCRATCH_LOAD_U8               : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;
 defm SCRATCH_LOAD_I8               : VSCRATCH_Real_AllAddr_gfx12<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>;

diff  --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 55ddb540c51e53..1cb1d32707f2d7 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1424,6 +1424,12 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
   });
 }
 
+static bool isCacheInvOrWBInst(MachineInstr &Inst) {
+  auto Opc = Inst.getOpcode();
+  return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
+         Opc == AMDGPU::GLOBAL_WBINV;
+}
+
 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                                WaitcntBrackets *ScoreBrackets) {
   // Now look at the instruction opcode. If it is a memory access
@@ -1439,6 +1445,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
     }
   } else if (TII->isFLAT(Inst)) {
+    // TODO: Track this properly.
+    if (isCacheInvOrWBInst(Inst))
+      return;
+
     assert(Inst.mayLoadOrStore());
 
     int FlatASCount = 0;

diff  --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 10ec54d3317fdf..6d749ad1ad24f6 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -578,6 +578,14 @@ class SIGfx11CacheControl : public SIGfx10CacheControl {
                                       bool IsNonTemporal) const override;
 };
 
+class SIGfx12CacheControl : public SIGfx11CacheControl {
+public:
+  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
+
+  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+};
+
 class SIMemoryLegalizer final : public MachineFunctionPass {
 private:
 
@@ -857,7 +865,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
     return std::make_unique<SIGfx7CacheControl>(ST);
   if (Generation < AMDGPUSubtarget::GFX11)
     return std::make_unique<SIGfx10CacheControl>(ST);
-  return std::make_unique<SIGfx11CacheControl>(ST);
+  if (Generation < AMDGPUSubtarget::GFX12)
+    return std::make_unique<SIGfx11CacheControl>(ST);
+  return std::make_unique<SIGfx12CacheControl>(ST);
 }
 
 bool SIGfx6CacheControl::enableLoadCacheBypass(
@@ -1423,7 +1433,7 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
   bool Changed = false;
 
   MachineBasicBlock &MBB = *MI->getParent();
-  DebugLoc DL = MI->getDebugLoc();
+  const DebugLoc &DL = MI->getDebugLoc();
 
   if (Pos == Position::AFTER)
     ++MI;
@@ -2132,6 +2142,62 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
   return Changed;
 }
 
+bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+                                        SIAtomicScope Scope,
+                                        SIAtomicAddrSpace AddrSpace,
+                                        Position Pos) const {
+  if (!InsertCacheInv)
+    return false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  /// The scratch address space does not need the global memory cache
+  /// to be flushed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
+    return false;
+
+  AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
+  switch (Scope) {
+  case SIAtomicScope::SYSTEM:
+    ScopeImm = AMDGPU::CPol::SCOPE_SYS;
+    break;
+  case SIAtomicScope::AGENT:
+    ScopeImm = AMDGPU::CPol::SCOPE_DEV;
+    break;
+  case SIAtomicScope::WORKGROUP:
+    // In WGP mode the waves of a work-group can be executing on either CU of
+    // the WGP. Therefore we need to invalidate the L0 which is per CU.
+    // Otherwise in CU mode all waves of a work-group are on the same CU, and so
+    // the L0 does not need to be invalidated.
+    if (ST.isCuModeEnabled())
+      return false;
+
+    ScopeImm = AMDGPU::CPol::SCOPE_SE;
+    break;
+  case SIAtomicScope::WAVEFRONT:
+  case SIAtomicScope::SINGLETHREAD:
+    // No cache to invalidate.
+    return false;
+  default:
+    llvm_unreachable("Unsupported synchronization scope");
+  }
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  return true;
+}
+
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
   if (AtomicPseudoMIs.empty())
     return false;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 8ca09973a8ed7e..904120e7d11894 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -1295,8 +1295,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 2 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1347,8 +1346,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1389,8 +1387,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(ptr addrspace(1) %pt
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1438,8 +1435,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1491,8 +1487,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1536,8 +1531,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v0, v[1:2], s[2:3] offset:16380 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -1590,8 +1584,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1)
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -1633,8 +1626,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(ptr addrspace(1) %ptr,
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off offset:16380 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -1682,8 +1674,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v6, vcc_lo
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -1736,8 +1727,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 9f97f1f4bace5d..26d981ad7b4bfe 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -240,8 +240,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB0_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
@@ -276,8 +275,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB0_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
@@ -571,8 +569,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_mov_b32 s13, s7
 ; GFX1264-NEXT:    buffer_atomic_add_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB1_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
@@ -610,8 +607,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    s_mov_b32 s9, s7
 ; GFX1232-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB1_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
@@ -967,8 +963,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB2_4:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1016,8 +1011,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB2_4:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1284,8 +1278,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB3_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1321,8 +1314,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB3_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1673,8 +1665,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_mov_b32 s9, s7
 ; GFX1264-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB4_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
@@ -1718,8 +1709,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    s_mov_b32 s9, s7
 ; GFX1232-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB4_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
@@ -1836,8 +1826,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_mov_b32 s4, s0
 ; GFX12-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_mov_b32 s5, s1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
 ; GFX12-NEXT:    s_nop 0
@@ -2117,8 +2106,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB6_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2154,8 +2142,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB6_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2454,8 +2441,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_mov_b32 s13, s7
 ; GFX1264-NEXT:    buffer_atomic_sub_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB7_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2493,8 +2479,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    s_mov_b32 s9, s7
 ; GFX1232-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB7_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2850,8 +2835,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB8_4:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2899,8 +2883,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB8_4:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3218,8 +3201,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB9_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3258,8 +3240,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB9_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3626,8 +3607,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_mov_b32 s9, s7
 ; GFX1264-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB10_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3674,8 +3654,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    s_mov_b32 s9, s7
 ; GFX1232-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB10_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3795,8 +3774,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_mov_b32 s4, s0
 ; GFX12-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_mov_b32 s5, s1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
 ; GFX12-NEXT:    s_nop 0

diff  --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 1df9a250a3159d..e18bdc89e7d421 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -101,8 +101,7 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
 ; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1200-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
 ; GFX1200-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1200-NEXT:    buffer_gl0_inv
-; GFX1200-NEXT:    buffer_gl1_inv
+; GFX1200-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1200-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX1200-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -209,7 +208,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
 ; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1200-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
 ; GFX1200-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1200-NEXT:    buffer_gl0_inv
+; GFX1200-NEXT:    global_inv scope:SCOPE_SE
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31]
   %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
   ret float %res
@@ -340,7 +339,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
 ; GFX1200-NEXT:    flat_atomic_add_f32 v[0:1], v2
 ; GFX1200-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1200-NEXT:    buffer_gl0_inv
+; GFX1200-NEXT:    global_inv scope:SCOPE_SE
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31]
   %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
   ret void
@@ -435,7 +434,7 @@ define float @no_unsafe(ptr %addr, float %val) {
 ; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1200-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
 ; GFX1200-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1200-NEXT:    buffer_gl0_inv
+; GFX1200-NEXT:    global_inv scope:SCOPE_SE
 ; GFX1200-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX1200-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)

diff  --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index f1879f28766786..d7f780e414caed 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -43,8 +43,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -101,8 +100,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -166,8 +164,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    flat_atomic_add_u64 v[2:3], v[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -230,8 +227,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -279,8 +275,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst
@@ -332,8 +327,7 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -392,8 +386,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    flat_atomic_add_u64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -451,8 +444,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -503,8 +495,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -561,8 +552,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -626,8 +616,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    flat_atomic_and_b64 v[2:3], v[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -690,8 +679,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -739,8 +727,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst
@@ -792,8 +779,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -852,8 +838,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    flat_atomic_and_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -911,8 +896,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -963,8 +947,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -1021,8 +1004,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -1086,8 +1068,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[2:3], v[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -1150,8 +1131,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -1199,8 +1179,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst
@@ -1252,8 +1231,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -1312,8 +1290,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -1371,8 +1348,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -1421,7 +1397,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -1478,7 +1454,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -1540,7 +1516,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    flat_atomic_max_i64 v[2:3], v[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -1603,7 +1579,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -1649,7 +1625,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst
@@ -1701,7 +1677,7 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -1758,7 +1734,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    flat_atomic_max_i64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -1816,7 +1792,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -1865,7 +1841,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -1922,7 +1898,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -1984,7 +1960,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-NEXT:    flat_atomic_max_u64 v[2:3], v[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -2047,7 +2023,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -2093,7 +2069,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst
@@ -2145,7 +2121,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -2202,7 +2178,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    flat_atomic_max_u64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -2260,7 +2236,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -2309,7 +2285,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -2366,7 +2342,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -2428,7 +2404,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    flat_atomic_min_i64 v[2:3], v[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -2491,7 +2467,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -2537,7 +2513,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst
@@ -2589,7 +2565,7 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -2646,7 +2622,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    flat_atomic_min_i64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -2704,7 +2680,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -2753,7 +2729,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -2810,7 +2786,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -2872,7 +2848,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-NEXT:    flat_atomic_min_u64 v[2:3], v[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -2935,7 +2911,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -2981,7 +2957,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst
@@ -3033,7 +3009,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -3090,7 +3066,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    flat_atomic_min_u64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -3148,7 +3124,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -3199,8 +3175,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -3257,8 +3232,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -3322,8 +3296,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
 ; GFX12-NEXT:    flat_atomic_or_b64 v[2:3], v[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -3386,8 +3359,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -3435,8 +3407,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst
@@ -3488,8 +3459,7 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -3548,8 +3518,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GFX12-NEXT:    flat_atomic_or_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -3607,8 +3576,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -3659,8 +3627,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -3708,8 +3675,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr double, ptr %out, i64 4
@@ -3757,8 +3723,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr ptr, ptr %out, i32 4
@@ -3815,8 +3780,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -3880,8 +3844,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[2:3], v[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -3944,8 +3907,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -3993,8 +3955,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst
@@ -4046,8 +4007,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4106,8 +4066,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -4165,8 +4124,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4217,8 +4175,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -4275,8 +4232,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4340,8 +4296,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[2:3], v[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -4404,8 +4359,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4453,8 +4407,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst
@@ -4506,8 +4459,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4566,8 +4518,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -4625,8 +4576,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4678,8 +4628,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4726,8 +4675,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4790,8 +4738,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4852,8 +4799,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5094,8 +5040,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -5152,8 +5097,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 9000
@@ -5211,8 +5155,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -5277,8 +5220,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -5349,8 +5291,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
 ; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -5408,8 +5349,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
@@ -5462,8 +5402,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in,
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -5523,8 +5462,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -5590,8 +5528,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
 ; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -5644,8 +5581,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5692,8 +5628,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5756,8 +5691,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5818,8 +5752,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -6051,8 +5984,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -6109,8 +6041,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -6174,8 +6105,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -6238,8 +6168,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -6287,8 +6216,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst
@@ -6340,8 +6268,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -6400,8 +6327,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -6459,8 +6385,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -6511,8 +6436,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -6569,8 +6493,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -6634,8 +6557,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -6698,8 +6620,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -6747,8 +6668,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst
@@ -6800,8 +6720,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -6860,8 +6779,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -6919,8 +6837,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index de4f748413e6c1..b2b3f3e1bfbd96 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -3583,8 +3583,7 @@ define amdgpu_ps float @atomic_global_load_saddr_i32(ptr addrspace(1) inreg %sba
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3621,8 +3620,7 @@ define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(ptr addrspace(1)
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128 th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3660,8 +3658,7 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(ptr addrspace(1) inre
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3698,8 +3695,7 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128 th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 3d11c8bc499e23..325dae172d5236 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -51,8 +51,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -123,8 +122,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -202,8 +200,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -284,8 +281,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -349,8 +345,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_add_u64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
@@ -420,8 +415,7 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -496,8 +490,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_add_u64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -575,8 +568,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -635,8 +627,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -707,8 +698,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -786,8 +776,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -868,8 +857,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -933,8 +921,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_and_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
@@ -1004,8 +991,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1080,8 +1066,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_and_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -1159,8 +1144,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1219,8 +1203,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -1291,8 +1274,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1370,8 +1352,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -1452,8 +1433,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1517,8 +1497,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_sub_u64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
@@ -1588,8 +1567,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1664,8 +1642,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_sub_u64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -1743,8 +1720,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1797,7 +1773,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -1865,7 +1841,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1937,7 +1913,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -2015,7 +1991,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2073,7 +2049,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_max_i64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
@@ -2140,7 +2116,7 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2209,7 +2185,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_max_i64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -2284,7 +2260,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2337,7 +2313,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -2405,7 +2381,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2477,7 +2453,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -2555,7 +2531,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2613,7 +2589,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_max_u64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
@@ -2680,7 +2656,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2749,7 +2725,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_max_u64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -2824,7 +2800,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2877,7 +2853,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -2945,7 +2921,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3017,7 +2993,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -3095,7 +3071,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3153,7 +3129,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_min_i64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
@@ -3220,7 +3196,7 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3289,7 +3265,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_min_i64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -3364,7 +3340,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3417,7 +3393,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -3485,7 +3461,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3557,7 +3533,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -3635,7 +3611,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3693,7 +3669,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_min_u64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
@@ -3760,7 +3736,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3829,7 +3805,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_min_u64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -3904,7 +3880,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3963,8 +3939,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -4035,8 +4010,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4114,8 +4088,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -4196,8 +4169,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4261,8 +4233,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_or_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
@@ -4332,8 +4303,7 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4408,8 +4378,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_or_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -4487,8 +4456,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4547,8 +4515,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -4603,8 +4570,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr double, ptr addrspace(1) %out, i64 4
@@ -4659,8 +4625,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr ptr, ptr addrspace(1) %out, i64 4
@@ -4731,8 +4696,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4810,8 +4774,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -4892,8 +4855,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4957,8 +4919,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
@@ -5028,8 +4989,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5104,8 +5064,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -5183,8 +5142,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5243,8 +5201,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -5315,8 +5272,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5394,8 +5350,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -5476,8 +5431,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5541,8 +5495,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_xor_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
@@ -5612,8 +5565,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5688,8 +5640,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_xor_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -5767,8 +5718,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5843,8 +5793,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -5917,8 +5866,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 9000
@@ -5991,8 +5939,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out,
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6074,8 +6021,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6168,8 +6114,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[6:7]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6246,8 +6191,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[4:5]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
@@ -6319,8 +6263,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6399,8 +6342,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6490,8 +6432,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[6:7]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6557,8 +6498,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6625,8 +6565,7 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] offset:-32 th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6689,8 +6628,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6771,8 +6709,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6853,8 +6790,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6936,8 +6872,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -7307,8 +7242,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -7379,8 +7313,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -7458,8 +7391,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -7515,8 +7447,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -7587,8 +7518,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -7666,8 +7596,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir
new file mode 100644
index 00000000000000..c06e931c65d5e4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir
@@ -0,0 +1,29 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s
+
+# Check that we correctly track that GLOBAL_INV increases LOAD_cnt.
+# We use a straightforward dependency between a GLOBAL_LOAD and an instruction
+# that uses its result - the S_WAIT_LOADCNT introduced before the use should
+# reflect the fact that there is a GLOBAL_INV between them.
+# FIXME: We could get away with a S_WAIT_LOADCNT 1 here.
+---
+name: waitcnt-global-inv
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr2_sgpr3
+
+    ; GFX12-LABEL: name: waitcnt-global-inv
+    ; GFX12: liveins: $vgpr0, $vgpr1, $sgpr2_sgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX12-NEXT: GLOBAL_INV 16, implicit $exec
+    ; GFX12-NEXT: S_WAITCNT 1015
+    ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec
+    renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    GLOBAL_INV 16, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec
+...
+
+# TODO: Test for GLOBAL_WB, GLOBAL_WBINV

diff  --git a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s
index 95d352b421a284..daf25d314c78dd 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s
@@ -1920,6 +1920,33 @@ global_store_d16_hi_b8 v[0:1], v2, off offset:64
 global_store_d16_hi_b8 v[3:4], v1, off
 // GFX12: encoding: [0x7c,0x00,0x09,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00]
 
+global_inv
+// GFX12: encoding: [0x7c,0xc0,0x0a,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+global_inv scope:SCOPE_DEV
+// GFX12: encoding: [0x7c,0xc0,0x0a,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00]
+
+global_inv scope:SCOPE_SYS
+// GFX12: encoding: [0x7c,0xc0,0x0a,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00]
+
+global_wb
+// GFX12: encoding: [0x7c,0x00,0x0b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+global_wb scope:SCOPE_DEV
+// GFX12: encoding: [0x7c,0x00,0x0b,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00]
+
+global_wb scope:SCOPE_SYS
+// GFX12: encoding: [0x7c,0x00,0x0b,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00]
+
+global_wbinv
+// GFX12: encoding: [0x7c,0xc0,0x13,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+global_wbinv scope:SCOPE_DEV
+// GFX12: encoding: [0x7c,0xc0,0x13,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00]
+
+global_wbinv scope:SCOPE_SYS
+// GFX12: encoding: [0x7c,0xc0,0x13,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00]
+
 scratch_load_b128 v[1:4], off, off offset:-64
 // GFX12: encoding: [0x7c,0xc0,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
 

diff  --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
index f4038cf10f50dc..7365adb864fd66 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
@@ -1137,6 +1137,33 @@
 # GFX12: global_store_d16_hi_b8 v[3:4], v1, off  ; encoding: [0x7c,0x00,0x09,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00]
 0x7c,0x00,0x09,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00
 
+# GFX12: global_inv ; encoding: [0x7c,0xc0,0x0a,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+0x7c,0xc0,0x0a,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: global_inv scope:SCOPE_DEV ; encoding: [0x7c,0xc0,0x0a,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00]
+0x7c,0xc0,0x0a,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: global_inv scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x0a,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00]
+0x7c,0xc0,0x0a,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: global_wb ; encoding: [0x7c,0x00,0x0b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+0x7c,0x00,0x0b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: global_wb scope:SCOPE_DEV ; encoding: [0x7c,0x00,0x0b,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00]
+0x7c,0x00,0x0b,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: global_wb scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x0b,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00]
+0x7c,0x00,0x0b,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: global_wbinv ; encoding: [0x7c,0xc0,0x13,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+0x7c,0xc0,0x13,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: global_wbinv scope:SCOPE_DEV ; encoding: [0x7c,0xc0,0x13,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00]
+0x7c,0xc0,0x13,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: global_wbinv scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x13,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00]
+0x7c,0xc0,0x13,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00
+
 # GFX12: scratch_load_b128 v[1:4], off, off offset:-64 ; encoding:  [0x7c,0xc0,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
 0x7c,0xc0,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
 


        


More information about the llvm-commits mailing list