[llvm] [AMDGPU] Insert waitcnt for non-global fence release in GFX12 (PR #159282)

Tue Sep 23 01:54:30 PDT 2025

https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/159282

>From 9a1b656876e2db83752e81659d15fe5a833b9794 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Wed, 17 Sep 2025 03:55:18 -0400
Subject: [PATCH 1/4] [AMDGPU] Insert waitcnt for non-global fence release in
 GFX12

A fence release could be followed by a barrier, so it should wait for
the relevant memory accesses to complete, even if it is mmra-limited to
LDS. So far, that would be skipped for non-global fence releases.

Fixes SWDEV-554932.
---
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  |   3 +-
 .../AMDGPU/memory-legalizer-barriers-mmra.ll  | 122 ++++++++++++++++++
 .../memory-legalizer-fence-mmra-local.ll      |  51 ++++++++
 3 files changed, 174 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers-mmra.ll

diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index c501ebba0c7ed..beeddc58be193 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2522,8 +2522,7 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
   // sequentially consistent, and no other thread can access scratch
   // memory.
 
-  // Other address spaces do not have a cache.
-  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
+  if (AddrSpace == SIAtomicAddrSpace::SCRATCH)
     return false;
 
   if (Pos == Position::AFTER)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers-mmra.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers-mmra.ll
new file mode 100644
index 0000000000000..1e6dc4e06ef4d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers-mmra.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
+
+
+define float @test_barrier_workgroup_local_mmra(ptr addrspace(3) noundef %x, ptr addrspace(3) noundef %y, float %val) {
+; GFX10-WGP-LABEL: test_barrier_workgroup_local_mmra:
+; GFX10-WGP:       ; %bb.0:
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v2
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_barrier
+; GFX10-WGP-NEXT:    ds_read_b32 v0, v1
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-WGP-LABEL: test_barrier_workgroup_local_mmra:
+; GFX11-WGP:       ; %bb.0:
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v2
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_barrier
+; GFX11-WGP-NEXT:    ds_load_b32 v0, v1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: test_barrier_workgroup_local_mmra:
+; GFX12-WGP:       ; %bb.0:
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    s_wait_expcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v2
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    s_barrier_signal -1
+; GFX12-WGP-NEXT:    s_barrier_wait -1
+; GFX12-WGP-NEXT:    ds_load_b32 v0, v1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_barrier_workgroup_local_mmra:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v2
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_barrier_signal -1
+; GFX1250-NEXT:    s_barrier_wait -1
+; GFX1250-NEXT:    ds_load_b32 v0, v1
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  store float %val, ptr addrspace(3) %x
+  fence syncscope("workgroup") release, !mmra !0
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire, !mmra !0
+  %ret = load float, ptr addrspace(3) %y
+  ret float %ret
+}
+
+define float @test_barrier_workgroup_global_mmra(ptr addrspace(1) noundef %x, ptr addrspace(1) noundef %y, float %val) {
+; GFX10-WGP-LABEL: test_barrier_workgroup_global_mmra:
+; GFX10-WGP:       ; %bb.0:
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    global_store_dword v[0:1], v4, off
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_barrier
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_load_dword v0, v[2:3], off
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-WGP-LABEL: test_barrier_workgroup_global_mmra:
+; GFX11-WGP:       ; %bb.0:
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    s_barrier
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_load_b32 v0, v[2:3], off
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: test_barrier_workgroup_global_mmra:
+; GFX12-WGP:       ; %bb.0:
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    s_wait_expcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_barrier_signal -1
+; GFX12-WGP-NEXT:    s_barrier_wait -1
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    global_load_b32 v0, v[2:3], off
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_barrier_workgroup_global_mmra:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_barrier_signal -1
+; GFX1250-NEXT:    s_barrier_wait -1
+; GFX1250-NEXT:    global_load_b32 v0, v[2:3], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  store float %val, ptr addrspace(1) %x
+  fence syncscope("workgroup") release, !mmra !1
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire, !mmra !1
+  %ret = load float, ptr addrspace(1) %y
+  ret float %ret
+}
+
+!0 = !{!"amdgpu-synchronize-as", !"local"}
+!1 = !{!"amdgpu-synchronize-as", !"global"}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
index cc42428e1aa06..682c1f6519959 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
@@ -143,14 +143,17 @@ define amdgpu_kernel void @workgroup_release_fence() {
 ;
 ; GFX12-WGP-LABEL: workgroup_release_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: workgroup_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -213,14 +216,17 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
 ;
 ; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: workgroup_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -283,14 +289,17 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
 ;
 ; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: workgroup_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: workgroup_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -670,14 +679,18 @@ define amdgpu_kernel void @agent_release_fence() {
 ;
 ; GFX12-WGP-LABEL: agent_release_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: agent_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: agent_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -740,14 +753,18 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
 ;
 ; GFX12-WGP-LABEL: agent_acq_rel_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: agent_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: agent_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -810,14 +827,18 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
 ;
 ; GFX12-WGP-LABEL: agent_seq_cst_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: agent_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: agent_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -940,6 +961,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1001,6 +1023,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1062,6 +1085,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1197,14 +1221,20 @@ define amdgpu_kernel void @system_release_fence() {
 ;
 ; GFX12-WGP-LABEL: system_release_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: system_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1267,14 +1297,20 @@ define amdgpu_kernel void @system_acq_rel_fence() {
 ;
 ; GFX12-WGP-LABEL: system_acq_rel_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: system_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1337,14 +1373,20 @@ define amdgpu_kernel void @system_seq_cst_fence() {
 ;
 ; GFX12-WGP-LABEL: system_seq_cst_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: system_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1459,14 +1501,17 @@ define amdgpu_kernel void @system_one_as_release_fence() {
 ;
 ; GFX12-WGP-LABEL: system_one_as_release_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_one_as_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: system_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1520,14 +1565,17 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
 ;
 ; GFX12-WGP-LABEL: system_one_as_acq_rel_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_one_as_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: system_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1581,14 +1629,17 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
 ;
 ; GFX12-WGP-LABEL: system_one_as_seq_cst_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_one_as_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: system_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}

>From cf7d60886de518a722178890bf1dc8c3f79ce1aa Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Fri, 19 Sep 2025 09:03:28 -0400
Subject: [PATCH 2/4] Don't emit WB for local-AS fences...

...and treat release/acquire on GloballyAddressableScratch as global.

This also fixes an issue where an early return between an iterator
increment/decrement pair in SIGfx12CacheControl::insertRelease could have lead
to unexpected insertion locations.
---
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  | 74 ++++++++++---------
 .../memory-legalizer-fence-mmra-local.ll      | 24 ------
 2 files changed, 38 insertions(+), 60 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index beeddc58be193..2fd6bc52ea4da 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2459,10 +2459,12 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
   /// The scratch address space does not need the global memory cache
   /// to be flushed as all memory operations by the same thread are
   /// sequentially consistent, and no other thread can access scratch
-  /// memory.
+  /// memory, unless the subtarget has GloballyAddressableScratch.
 
   /// Other address spaces do not have a cache.
-  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
+  if (((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) &&
+      (((AddrSpace & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) ||
+       !ST.hasGloballyAddressableScratch()))
     return false;
 
   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
@@ -2520,45 +2522,45 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
   // The scratch address space does not need the global memory cache
   // writeback as all memory operations by the same thread are
   // sequentially consistent, and no other thread can access scratch
-  // memory.
+  // memory, unless the subtarget has GloballyAddressableScratch.
+  if (((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) ||
+      (((AddrSpace & SIAtomicAddrSpace::SCRATCH) != SIAtomicAddrSpace::NONE) &&
+       ST.hasGloballyAddressableScratch())) {
 
-  if (AddrSpace == SIAtomicAddrSpace::SCRATCH)
-    return false;
-
-  if (Pos == Position::AFTER)
-    ++MI;
+    if (Pos == Position::AFTER)
+      ++MI;
 
-  // global_wb is only necessary at system scope for GFX12.0,
-  // they're also necessary at device scope for GFX12.5.
-  //
-  // Emitting it for lower scopes is a slow no-op, so we omit it
-  // for performance.
-  switch (Scope) {
-  case SIAtomicScope::SYSTEM:
-    BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
-        .addImm(AMDGPU::CPol::SCOPE_SYS);
-    break;
-  case SIAtomicScope::AGENT:
-    // TODO DOCS
-    if (ST.hasGFX1250Insts()) {
+    // global_wb is only necessary at system scope for GFX12.0,
+    // they're also necessary at device scope for GFX12.5.
+    //
+    // Emitting it for lower scopes is a slow no-op, so we omit it
+    // for performance.
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
       BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
-          .addImm(AMDGPU::CPol::SCOPE_DEV);
+          .addImm(AMDGPU::CPol::SCOPE_SYS);
+      break;
+    case SIAtomicScope::AGENT:
+      // TODO DOCS
+      if (ST.hasGFX1250Insts()) {
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
+            .addImm(AMDGPU::CPol::SCOPE_DEV);
+      }
+      break;
+    case SIAtomicScope::CLUSTER:
+    case SIAtomicScope::WORKGROUP:
+      // No WB necessary, but we still have to wait.
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No WB or wait necessary here, but insertWait takes care of that.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
     }
-    break;
-  case SIAtomicScope::CLUSTER:
-  case SIAtomicScope::WORKGROUP:
-    // No WB necessary, but we still have to wait.
-    break;
-  case SIAtomicScope::WAVEFRONT:
-  case SIAtomicScope::SINGLETHREAD:
-    // No WB or wait necessary here.
-    return false;
-  default:
-    llvm_unreachable("Unsupported synchronization scope");
-  }
 
-  if (Pos == Position::AFTER)
-    --MI;
+    if (Pos == Position::AFTER)
+      --MI;
+  }
 
   // We always have to wait for previous memory operations (load/store) to
   // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
index 682c1f6519959..8b0b099999f06 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
@@ -689,7 +689,6 @@ define amdgpu_kernel void @agent_release_fence() {
 ;
 ; GFX1250-LABEL: agent_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -763,7 +762,6 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: agent_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -837,7 +835,6 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: agent_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -961,7 +958,6 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1023,7 +1019,6 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1085,7 +1080,6 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1221,19 +1215,16 @@ define amdgpu_kernel void @system_release_fence() {
 ;
 ; GFX12-WGP-LABEL: system_release_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: system_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -1297,19 +1288,16 @@ define amdgpu_kernel void @system_acq_rel_fence() {
 ;
 ; GFX12-WGP-LABEL: system_acq_rel_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: system_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -1373,19 +1361,16 @@ define amdgpu_kernel void @system_seq_cst_fence() {
 ;
 ; GFX12-WGP-LABEL: system_seq_cst_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: system_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -1501,17 +1486,14 @@ define amdgpu_kernel void @system_one_as_release_fence() {
 ;
 ; GFX12-WGP-LABEL: system_one_as_release_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_one_as_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: system_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1565,17 +1547,14 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
 ;
 ; GFX12-WGP-LABEL: system_one_as_acq_rel_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_one_as_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: system_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1629,17 +1608,14 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
 ;
 ; GFX12-WGP-LABEL: system_one_as_seq_cst_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_one_as_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: system_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}

>From 4fdf71514db81e6baee3cc758363b715df9605e8 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Mon, 22 Sep 2025 10:46:51 -0400
Subject: [PATCH 3/4] Remove changes related to GloballyAddressableScratch from
 this PR

---
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 2fd6bc52ea4da..dc9c961e7de36 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2459,12 +2459,10 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
   /// The scratch address space does not need the global memory cache
   /// to be flushed as all memory operations by the same thread are
   /// sequentially consistent, and no other thread can access scratch
-  /// memory, unless the subtarget has GloballyAddressableScratch.
+  /// memory.
 
   /// Other address spaces do not have a cache.
-  if (((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) &&
-      (((AddrSpace & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) ||
-       !ST.hasGloballyAddressableScratch()))
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
     return false;
 
   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
@@ -2522,11 +2520,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
   // The scratch address space does not need the global memory cache
   // writeback as all memory operations by the same thread are
   // sequentially consistent, and no other thread can access scratch
-  // memory, unless the subtarget has GloballyAddressableScratch.
-  if (((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) ||
-      (((AddrSpace & SIAtomicAddrSpace::SCRATCH) != SIAtomicAddrSpace::NONE) &&
-       ST.hasGloballyAddressableScratch())) {
-
+  // memory.
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
     if (Pos == Position::AFTER)
       ++MI;
 

>From 3b7305aace0b7c3d957c9a7cbc6e6d7d63a69921 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Tue, 23 Sep 2025 04:46:14 -0400
Subject: [PATCH 4/4] Properly report changes in
 SIGfx12CacheControl::insertRelease

---
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index dc9c961e7de36..c85d2bb9fe9ae 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2514,6 +2514,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
+  bool Changed = false;
+
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -2534,12 +2536,14 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
     case SIAtomicScope::SYSTEM:
       BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
           .addImm(AMDGPU::CPol::SCOPE_SYS);
+      Changed = true;
       break;
     case SIAtomicScope::AGENT:
       // TODO DOCS
       if (ST.hasGFX1250Insts()) {
         BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
             .addImm(AMDGPU::CPol::SCOPE_DEV);
+        Changed = true;
       }
       break;
     case SIAtomicScope::CLUSTER:
@@ -2560,10 +2564,10 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
   // We always have to wait for previous memory operations (load/store) to
   // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
   // we of course need to wait for that as well.
-  insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
-             IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
+  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+                        IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
 
-  return true;
+  return Changed;
 }
 
 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(