[Mlir-commits] [mlir] [mlir][AMDGPU] Use LDS-only MMRA fences for lds_barrier (PR #157919)

Krzysztof Drewniak llvmlistbot at llvm.org
Wed Sep 10 11:22:46 PDT 2025


https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/157919

amdgpu.lds_barrier is an operation whose semantics are "s_barrier, plus all LDS operations before this happen-before LDS operations after this, and there must not be an inherent fence/forcing-to-completion of global memory (for performance)". It was previously implemented through manual calls to waitcnt() intrinsics and the s_barrier intrinsic(s).

The lack of explicit fencing enabled miscompiles (where LDS accesses were reordered with the barrier) on gfx12. Since LLVM now allows MMRA annotations to ensure that only LDS accesses are fenced by a pair of fences, we can use these fences to explicitly represent the semantics we want instead of trying to prescribe the method of their implementation.

Note that the gfx908 workaround of hiding the s_barrier in inline assembly to prevent spurious vmem barriers remains in place, but it is removed for gfx11 because the fences were recently changed to give us the effect we want.
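For illustration, this is the shape of the new lowering on gfx12, reconstructed from the updated test expectations in the patch below (on older chipsets the middle instructions are rocdl.s.barrier or the inline-asm s_barrier instead):

```mlir
// amdgpu.lds_barrier now lowers to a pair of LDS-only MMRA-annotated fences
// surrounding the barrier, rather than an explicit wait-count instruction.
llvm.fence syncscope("workgroup-one-as") release {llvm.mmra = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">}
rocdl.s.barrier.signal -1
rocdl.s.barrier.wait -1
llvm.fence syncscope("workgroup-one-as") acquire {llvm.mmra = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">}
```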

>From b414fff24927da4635cea21a69d8714ae11ce9a2 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Wed, 10 Sep 2025 18:15:42 +0000
Subject: [PATCH] [mlir][AMDGPU] Use LDS-only MMRA fences for lds_barrier

amdgpu.lds_barrier is an operation whose semantics are "s_barrier,
plus all LDS operations before this happen-before LDS operations
after this, and there must not be an inherent
fence/forcing-to-completion of global memory (for performance)". It
was previously implemented through manual calls to waitcnt()
intrinsics and the s_barrier intrinsic(s).

The lack of explicit fencing enabled miscompiles (where LDS accesses
were reordered with the barrier) on gfx12. Since LLVM now allows MMRA
annotations to ensure that only LDS accesses are fenced by a pair of
fences, we can use these fences to explicitly represent the semantics
we want instead of trying to prescribe the method of their
implementation.

Note that the gfx908 workaround of hiding the s_barrier in inline
assembly to prevent spurious vmem barriers remains in place, but it
is removed for gfx11 because the fences were recently changed to give
us the effect we want.
---
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 53 ++++++++-----------
 .../AMDGPUToROCDL/amdgpu-to-rocdl.mlir        | 13 +++--
 2 files changed, 27 insertions(+), 39 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 203790ed95153..4d2290934eab1 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -536,52 +536,41 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
   LogicalResult
   matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    bool requiresInlineAsm = chipset < kGfx90a || chipset.majorVersion == 11;
+    Location loc = op.getLoc();
+    // This ensures that waits on global memory aren't introduced on
+    // chips that don't have the BackOffBarrier feature enabled in LLVM.
+    bool requiresInlineAsm = chipset < kGfx90a;
+
+    Attribute mmra =
+        rewriter.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
+    StringRef scope = "workgroup-one-as";
 
+    auto relFence = LLVM::FenceOp::create(rewriter, loc,
+                                          LLVM::AtomicOrdering::release, scope);
+    relFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
     if (requiresInlineAsm) {
       auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
                                                       LLVM::AsmDialect::AD_ATT);
-      const char *asmStr =
-          ";;;WARNING: BREAKS DEBUG WATCHES\ns_waitcnt lgkmcnt(0)\ns_barrier";
+      const char *asmStr = ";;;WARNING: BREAKS DEBUG WATCHES\ns_barrier";
       const char *constraints = "";
-      rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
-          op,
+      LLVM::InlineAsmOp::create(
+          rewriter, loc,
           /*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
           /*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
           /*is_align_stack=*/false, LLVM::TailCallKind::None,
           /*asm_dialect=*/asmDialectAttr,
           /*operand_attrs=*/ArrayAttr());
-      return success();
-    }
-    if (chipset.majorVersion < 12) {
-      constexpr int32_t ldsOnlyBitsGfx6789 = ~(0x1f << 8);
-      constexpr int32_t ldsOnlyBitsGfx10 = ~(0x3f << 8);
-      // Left in place in case someone disables the inline ASM path or future
-      // chipsets use the same bit pattern.
-      constexpr int32_t ldsOnlyBitsGfx11 = ~(0x3f << 4);
-
-      int32_t ldsOnlyBits;
-      if (chipset.majorVersion == 11)
-        ldsOnlyBits = ldsOnlyBitsGfx11;
-      else if (chipset.majorVersion == 10)
-        ldsOnlyBits = ldsOnlyBitsGfx10;
-      else if (chipset.majorVersion <= 9)
-        ldsOnlyBits = ldsOnlyBitsGfx6789;
-      else
-        return op.emitOpError(
-                   "don't know how to lower this for chipset major version")
-               << chipset.majorVersion;
-
-      Location loc = op->getLoc();
-      ROCDL::SWaitcntOp::create(rewriter, loc, ldsOnlyBits);
-      rewriter.replaceOpWithNewOp<ROCDL::SBarrierOp>(op);
+    } else if (chipset.majorVersion < 12) {
+      ROCDL::SBarrierOp::create(rewriter, loc);
     } else {
-      Location loc = op->getLoc();
-      ROCDL::WaitDscntOp::create(rewriter, loc, 0);
       ROCDL::BarrierSignalOp::create(rewriter, loc, -1);
-      rewriter.replaceOpWithNewOp<ROCDL::BarrierWaitOp>(op, -1);
+      ROCDL::BarrierWaitOp::create(rewriter, loc, -1);
     }
 
+    auto acqFence = LLVM::FenceOp::create(rewriter, loc,
+                                          LLVM::AtomicOrdering::acquire, scope);
+    acqFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
+    rewriter.replaceOp(op, acqFence);
     return success();
   }
 };
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index cc1162d8b0de8..91daa75ccb58f 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -8,6 +8,8 @@
 // Note: #gpu.address_space<global> is hardcoded to `1` here because the
 // test pass doesn't set up the GPU address space conversions.
 
+// CHECK: #[[$MMRA_TAG:.+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">
+
 #gpu_global_addrspace = 1
 
 // CHECK-LABEL: func @fat_raw_buffer_cast
@@ -414,19 +416,16 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_v2f16(%src : vector<2xf16>, %cmp : v
 
 // CHECK-LABEL: func @lds_barrier
 func.func @lds_barrier() {
+  // CHECK: llvm.fence syncscope("workgroup-one-as") release {llvm.mmra = #[[$MMRA_TAG]]}
   // GFX908: llvm.inline_asm has_side_effects asm_dialect = att
-  // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
-  // GFX90A: rocdl.s.waitcnt -7937
+  // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_barrier"
   // GFX90A-NEXT: rocdl.s.barrier
-  // GFX942: rocdl.s.waitcnt -7937
   // GFX942-NEXT: rocdl.s.barrier
-  // GFX10:  rocdl.s.waitcnt -16129
   // GFX10-NEXT: rocdl.s.barrier
-  // GFX11:  llvm.inline_asm has_side_effects asm_dialect = att
-  // GFX11-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
-  // GFX12:  rocdl.s.wait.dscnt 0
+  // GFX11-NEXT: rocdl.s.barrier
   // GFX12-NEXT: rocdl.s.barrier.signal -1
   // GFX12-NEXT: rocdl.s.barrier.wait -1
+  // CHECK-NEXT: llvm.fence syncscope("workgroup-one-as") acquire {llvm.mmra = #[[$MMRA_TAG]]}
   amdgpu.lds_barrier
   func.return
 }



More information about the Mlir-commits mailing list