[Mlir-commits] [mlir] [mlir][AMDGPU] Use LDS-only MMRA fences for lds_barrier (PR #157919)

Fri Sep 12 11:44:40 PDT 2025

https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/157919

>From b414fff24927da4635cea21a69d8714ae11ce9a2 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Wed, 10 Sep 2025 18:15:42 +0000
Subject: [PATCH 1/3] [mlir][AMDGPU] Use LDS-only MMRA fences for lds_barrier

The previous lowering strategy for amdgpu.lds_barrier (which is an
operation whose semantics are) "s.barrier, and all LDS operations
before this happen-before LDS operations after this, and there must
not be an inherent fence/forcing-to-completion of global memory (for
performance)" was previosuly implemented through using manual calls to
waitcnt() intrinsics and the s_barrire intrinsic(s).

The lack of explicit fencing enabled miscompiles (where LDS accesses
were reordered with the barrier) on gfx12. Since LLVM now allows MMRA
annotations to ensure that only LDS accesses are fenced by a pair of
fences, we can now use these fences in order to explicitly represent
the semantics we want instead of trying to prescribe the method of
their implemntation.

Note that the gfx908 workaround of hiding the s_barrier in inline
assembly in order to prevent spurious vmem barriers remains in place,
but is is removed for gfx11 because the fences have been changed to
give us the effect we want recently.
---
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 53 ++++++++-----------
 .../AMDGPUToROCDL/amdgpu-to-rocdl.mlir        | 13 +++--
 2 files changed, 27 insertions(+), 39 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 203790ed95153..4d2290934eab1 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -536,52 +536,41 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
   LogicalResult
   matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    bool requiresInlineAsm = chipset < kGfx90a || chipset.majorVersion == 11;
+    Location loc = op.getLoc();
+    // This ensures that waits on global memory aren't introduced on
+    // chips that don't have the BackOffBarrier feature enabled in LLVM.
+    bool requiresInlineAsm = chipset < kGfx90a;
+
+    Attribute mmra =
+        rewriter.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
+    StringRef scope = "workgroup-one-as";
 
+    auto relFence = LLVM::FenceOp::create(rewriter, loc,
+                                          LLVM::AtomicOrdering::release, scope);
+    relFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
     if (requiresInlineAsm) {
       auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
                                                       LLVM::AsmDialect::AD_ATT);
-      const char *asmStr =
-          ";;;WARNING: BREAKS DEBUG WATCHES\ns_waitcnt lgkmcnt(0)\ns_barrier";
+      const char *asmStr = ";;;WARNING: BREAKS DEBUG WATCHES\ns_barrier";
       const char *constraints = "";
-      rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
-          op,
+      LLVM::InlineAsmOp::create(
+          rewriter, loc,
           /*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
           /*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
           /*is_align_stack=*/false, LLVM::TailCallKind::None,
           /*asm_dialect=*/asmDialectAttr,
           /*operand_attrs=*/ArrayAttr());
-      return success();
-    }
-    if (chipset.majorVersion < 12) {
-      constexpr int32_t ldsOnlyBitsGfx6789 = ~(0x1f << 8);
-      constexpr int32_t ldsOnlyBitsGfx10 = ~(0x3f << 8);
-      // Left in place in case someone disables the inline ASM path or future
-      // chipsets use the same bit pattern.
-      constexpr int32_t ldsOnlyBitsGfx11 = ~(0x3f << 4);
-
-      int32_t ldsOnlyBits;
-      if (chipset.majorVersion == 11)
-        ldsOnlyBits = ldsOnlyBitsGfx11;
-      else if (chipset.majorVersion == 10)
-        ldsOnlyBits = ldsOnlyBitsGfx10;
-      else if (chipset.majorVersion <= 9)
-        ldsOnlyBits = ldsOnlyBitsGfx6789;
-      else
-        return op.emitOpError(
-                   "don't know how to lower this for chipset major version")
-               << chipset.majorVersion;
-
-      Location loc = op->getLoc();
-      ROCDL::SWaitcntOp::create(rewriter, loc, ldsOnlyBits);
-      rewriter.replaceOpWithNewOp<ROCDL::SBarrierOp>(op);
+    } else if (chipset.majorVersion < 12) {
+      ROCDL::SBarrierOp::create(rewriter, loc);
     } else {
-      Location loc = op->getLoc();
-      ROCDL::WaitDscntOp::create(rewriter, loc, 0);
       ROCDL::BarrierSignalOp::create(rewriter, loc, -1);
-      rewriter.replaceOpWithNewOp<ROCDL::BarrierWaitOp>(op, -1);
+      ROCDL::BarrierWaitOp::create(rewriter, loc, -1);
     }
 
+    auto acqFence = LLVM::FenceOp::create(rewriter, loc,
+                                          LLVM::AtomicOrdering::acquire, scope);
+    acqFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
+    rewriter.replaceOp(op, acqFence);
     return success();
   }
 };
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index cc1162d8b0de8..91daa75ccb58f 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -8,6 +8,8 @@
 // Note: #gpu.address_space<global> is hardcoded to `1` here because the
 // test pass doesn't set up the GPU address space conversions.
 
+// CHECK: #[[$MMRA_TAG:.+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">
+
 #gpu_global_addrspace = 1
 
 // CHECK-LABEL: func @fat_raw_buffer_cast
@@ -414,19 +416,16 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_v2f16(%src : vector<2xf16>, %cmp : v
 
 // CHECK-LABEL: func @lds_barrier
 func.func @lds_barrier() {
+  // CHECK: llvm.fence syncscope("workgroup-one-as") release {llvm.mmra = #[[$MMRA_TAG]]}
   // GFX908: llvm.inline_asm has_side_effects asm_dialect = att
-  // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
-  // GFX90A: rocdl.s.waitcnt -7937
+  // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_barrier"
   // GFX90A-NEXT: rocdl.s.barrier
-  // GFX942: rocdl.s.waitcnt -7937
   // GFX942-NEXT: rocdl.s.barrier
-  // GFX10:  rocdl.s.waitcnt -16129
   // GFX10-NEXT: rocdl.s.barrier
-  // GFX11:  llvm.inline_asm has_side_effects asm_dialect = att
-  // GFX11-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
-  // GFX12:  rocdl.s.wait.dscnt 0
+  // GFX11-NEXT: rocdl.s.barrier
   // GFX12-NEXT: rocdl.s.barrier.signal -1
   // GFX12-NEXT: rocdl.s.barrier.wait -1
+  // CHECK-NEXT: llvm.fence syncscope("workgroup-one-as") acquire {llvm.mmra = #[[$MMRA_TAG]]}
   amdgpu.lds_barrier
   func.return
 }

>From 9706ccc25d35c88189093829eadb6d5afd786187 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Thu, 11 Sep 2025 19:55:34 +0000
Subject: [PATCH 2/3] Move to a workgroup fence from a workgroup-one-as fence
 because things don't work how I thought they would

---
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp    | 10 +++++++++-
 .../test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir |  4 ++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 4d2290934eab1..ea5ef0383b219 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -543,7 +543,15 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
 
     Attribute mmra =
         rewriter.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
-    StringRef scope = "workgroup-one-as";
+    // Note: while there *is* a workgroup-one-as scope, this, when combined with
+    // the MMRA, will lead to the fence having no effect. This is because
+    // the codepaths for an atomic load or store will observe that a
+    // one-address-space atomic to LDS requires no synchronization because
+    // operations on LDS are totally ordered with respect to each other,
+    // and so will not emit the correct waitcnt operations that these fences
+    // are intended to produce. Therefore, we use a broader type of fence
+    // and rely on the MMRA to relax it to the semantics we want.
+    StringRef scope = "workgroup";
 
     auto relFence = LLVM::FenceOp::create(rewriter, loc,
                                           LLVM::AtomicOrdering::release, scope);
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index 91daa75ccb58f..5dd1046cce041 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -416,7 +416,7 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_v2f16(%src : vector<2xf16>, %cmp : v
 
 // CHECK-LABEL: func @lds_barrier
 func.func @lds_barrier() {
-  // CHECK: llvm.fence syncscope("workgroup-one-as") release {llvm.mmra = #[[$MMRA_TAG]]}
+  // CHECK: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
   // GFX908: llvm.inline_asm has_side_effects asm_dialect = att
   // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_barrier"
   // GFX90A-NEXT: rocdl.s.barrier
@@ -425,7 +425,7 @@ func.func @lds_barrier() {
   // GFX11-NEXT: rocdl.s.barrier
   // GFX12-NEXT: rocdl.s.barrier.signal -1
   // GFX12-NEXT: rocdl.s.barrier.wait -1
-  // CHECK-NEXT: llvm.fence syncscope("workgroup-one-as") acquire {llvm.mmra = #[[$MMRA_TAG]]}
+  // CHECK-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
   amdgpu.lds_barrier
   func.return
 }

>From 6639112187c88cc4a08d72d32736866a2634faeb Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 12 Sep 2025 18:44:26 +0000
Subject: [PATCH 3/3] Rewrap comment

---
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index ea5ef0383b219..9fce41259af6b 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -544,13 +544,13 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
     Attribute mmra =
         rewriter.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
     // Note: while there *is* a workgroup-one-as scope, this, when combined with
-    // the MMRA, will lead to the fence having no effect. This is because
-    // the codepaths for an atomic load or store will observe that a
+    // the MMRA, will lead to the fence having no effect. This is because the
+    // codepaths for an atomic load or store will observe that a
     // one-address-space atomic to LDS requires no synchronization because
-    // operations on LDS are totally ordered with respect to each other,
-    // and so will not emit the correct waitcnt operations that these fences
-    // are intended to produce. Therefore, we use a broader type of fence
-    // and rely on the MMRA to relax it to the semantics we want.
+    // operations on LDS are totally ordered with respect to each other, and so
+    // will not emit the correct waitcnt operations that these fences are
+    // intended to produce. Therefore, we use a broader type of fence and rely
+    // on the MMRA to relax it to the semantics we want.
     StringRef scope = "workgroup";
 
     auto relFence = LLVM::FenceOp::create(rewriter, loc,