[Mlir-commits] [mlir] [mlir][amdgpu] Add gfx12+ split barrier wrappers (PR #175579)
Ivan Butygin
llvmlistbot at llvm.org
Mon Jan 12 08:25:27 PST 2026
https://github.com/Hardcode84 created https://github.com/llvm/llvm-project/pull/175579
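The main purpose of these ops is to generate the appropriate fences around the split barrier, similar to the existing `lds_barrier`. On gfx<12, where split barriers are not supported, `lds_barrier_signal` is lowered to a normal `lds_barrier` and `lds_barrier_wait` is lowered to a no-op.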
Since we want to fully switch to the `gpu` dialect in the long term, we will probably need to add similar ops to the `gpu` dialect eventually.
>From e53ba368097adc06d22eef3a4680f93a1322c0e7 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Mon, 12 Jan 2026 16:59:47 +0100
Subject: [PATCH 1/2] barrier ops
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 63 +++++++++++++++++++
mlir/test/Dialect/AMDGPU/ops.mlir | 23 +++++++
2 files changed, 86 insertions(+)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index bf0711cc27922..82fb12ec44d56 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -977,6 +977,69 @@ def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
let hasCanonicalizer = 1;
}
+def AMDGPU_LDSBarrierSignalOp : AMDGPU_Op<"lds_barrier_signal"> {
+ let summary = "Signal phase of split LDS barrier.";
+ let description = [{
+ `amdgpu.lds_barrier_signal` is the signal phase of a split barrier that
+ initiates a wait for all operations that affect the Local Data Store (LDS)
+ issued from the workgroup. This operation allows computation to be overlapped
+ with LDS memory operations by separating the barrier signal from the wait.
+
+ All workitems in a workgroup must reach the barrier signal before any of them
+ may proceed past the corresponding `amdgpu.lds_barrier_wait`. Between the
+ signal and wait operations, computation that does not depend on the completion
+ of LDS operations may proceed.
+
+ The split barrier provides better performance than `amdgpu.lds_barrier` when
+ there is independent work that can be done while waiting for LDS operations
+ to complete.
+
+ Example:
+ ```mlir
+ amdgpu.lds_barrier_signal
+ // Independent computation can happen here.
+ %result = arith.addi %a, %b : i32
+ amdgpu.lds_barrier_wait
+ // LDS operations are guaranteed to be complete here.
+ ```
+
+ Note that `lds_barrier_signal` does **not** force reads to or from global
+ memory to complete before execution continues.
+ }];
+ let assemblyFormat = "attr-dict";
+}
+
+def AMDGPU_LDSBarrierWaitOp : AMDGPU_Op<"lds_barrier_wait"> {
+ let summary = "Wait phase of split LDS barrier.";
+ let description = [{
+ `amdgpu.lds_barrier_wait` is the wait phase of a split barrier that completes
+ the wait for all operations that affect the Local Data Store (LDS) issued
+ from the workgroup.
+
+ This operation must be paired with a preceding `amdgpu.lds_barrier_signal`.
+ All workitems in a workgroup must reach both the signal and wait operations,
+ and the workgroup may only continue past the wait once all previously signaled
+ LDS operations have completed.
+
+ The split barrier provides better performance than `amdgpu.lds_barrier` when
+ there is independent work that can be done while waiting for LDS operations
+ to complete.
+
+ Example:
+ ```mlir
+ amdgpu.lds_barrier_signal
+ // Independent computation can happen here.
+ %result = arith.addi %a, %b : i32
+ amdgpu.lds_barrier_wait
+ // LDS operations are guaranteed to be complete here.
+ ```
+
+ Note that `lds_barrier_wait` does **not** force reads to or from global
+ memory to complete before execution continues.
+ }];
+ let assemblyFormat = "attr-dict";
+}
+
def AMDGPU_SchedBarrierOpOpt : I32BitEnumAttr<"sched_barrier_opt_enum",
"The possible options for scheduling barriers",
[
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 2b3234ef8510d..25e3d57c50782 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -549,6 +549,29 @@ func.func @lds_barrier() {
func.return
}
+// CHECK-LABEL: func @lds_barrier_signal
+func.func @lds_barrier_signal() {
+ // CHECK: amdgpu.lds_barrier_signal
+ amdgpu.lds_barrier_signal
+ func.return
+}
+
+// CHECK-LABEL: func @lds_barrier_wait
+func.func @lds_barrier_wait() {
+ // CHECK: amdgpu.lds_barrier_wait
+ amdgpu.lds_barrier_wait
+ func.return
+}
+
+// CHECK-LABEL: func @lds_barrier_split
+func.func @lds_barrier_split() {
+ // CHECK: amdgpu.lds_barrier_signal
+ amdgpu.lds_barrier_signal
+ // CHECK: amdgpu.lds_barrier_wait
+ amdgpu.lds_barrier_wait
+ func.return
+}
+
// CHECK-LABEL: func @sched_barrier
func.func @sched_barrier() {
// CHECK: amdgpu.sched_barrier allow = <none>
>From c0c43e1dab12dbecb2c2de73d0f4a53abff19961 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Mon, 12 Jan 2026 17:11:30 +0100
Subject: [PATCH 2/2] lowering
---
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 99 +++++++++++++++----
.../AMDGPUToROCDL/amdgpu-to-rocdl.mlir | 75 ++++++++++++++
2 files changed, 156 insertions(+), 18 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 6427807e944a1..6dec9c49c6cbb 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -566,6 +566,26 @@ struct MemoryCounterWaitOpLowering
}
};
+// Helper function to create a fence operation with LDS memory synchronization.
+static Operation *createLDSFence(OpBuilder &builder, Location loc,
+ LLVM::AtomicOrdering ordering) {
+ Attribute mmra =
+ builder.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
+ // Note: while there *is* a workgroup-one-as scope, this, when combined with
+ // the MMRA, will lead to the fence having no effect. This is because the
+ // codepaths for an atomic load or store will observe that a
+ // one-address-space atomic to LDS requires no synchronization because
+ // operations on LDS are totally ordered with respect to each other, and so
+ // will not emit the correct waitcnt operations that these fences are
+ // intended to produce. Therefore, we use a broader type of fence and rely
+ // on the MMRA to relax it to the semantics we want.
+ StringRef scope = "workgroup";
+
+ auto fence = LLVM::FenceOp::create(builder, loc, ordering, scope);
+ fence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
+ return fence;
+}
+
struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
LDSBarrierOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
: ConvertOpToLLVMPattern<LDSBarrierOp>(converter), chipset(chipset) {}
@@ -580,21 +600,7 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
// chips that don't have the BackOffBarrier feature enabled in LLVM.
bool requiresInlineAsm = chipset < kGfx90a;
- Attribute mmra =
- rewriter.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
- // Note: while there *is* a workgroup-one-as scope, this, when combined with
- // the MMRA, will lead to the fence having no effect. This is because the
- // codepaths for an atomic load or store will observe that a
- // one-address-space atomic to LDS requires no synchronization because
- // operations on LDS are totally ordered with respect to each other, and so
- // will not emit the correct waitcnt operations that these fences are
- // intended to produce. Therefore, we use a broader type of fence and rely
- // on the MMRA to relax it to the semantics we want.
- StringRef scope = "workgroup";
-
- auto relFence = LLVM::FenceOp::create(rewriter, loc,
- LLVM::AtomicOrdering::release, scope);
- relFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
+ createLDSFence(rewriter, loc, LLVM::AtomicOrdering::release);
if (requiresInlineAsm) {
auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
LLVM::AsmDialect::AD_ATT);
@@ -614,9 +620,65 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
ROCDL::BarrierWaitOp::create(rewriter, loc, -1);
}
- auto acqFence = LLVM::FenceOp::create(rewriter, loc,
- LLVM::AtomicOrdering::acquire, scope);
- acqFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
+ auto acqFence =
+ createLDSFence(rewriter, loc, LLVM::AtomicOrdering::acquire);
+ rewriter.replaceOp(op, acqFence);
+ return success();
+ }
+};
+
+struct LDSBarrierSignalOpLowering
+ : public ConvertOpToLLVMPattern<LDSBarrierSignalOp> {
+ LDSBarrierSignalOpLowering(const LLVMTypeConverter &converter,
+ Chipset chipset)
+ : ConvertOpToLLVMPattern<LDSBarrierSignalOp>(converter),
+ chipset(chipset) {}
+
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(LDSBarrierSignalOp op, LDSBarrierSignalOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+
+ // Split barriers are only supported on gfx12+.
+ // For older architectures, convert signal to a full lds_barrier.
+ if (chipset.majorVersion < 12) {
+ auto ldsBarrier = LDSBarrierOp::create(rewriter, loc);
+ rewriter.replaceOp(op, ldsBarrier);
+ return success();
+ }
+
+ createLDSFence(rewriter, loc, LLVM::AtomicOrdering::release);
+ auto signal = ROCDL::BarrierSignalOp::create(rewriter, loc, -1);
+ rewriter.replaceOp(op, signal);
+ return success();
+ }
+};
+
+struct LDSBarrierWaitOpLowering
+ : public ConvertOpToLLVMPattern<LDSBarrierWaitOp> {
+ LDSBarrierWaitOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+ : ConvertOpToLLVMPattern<LDSBarrierWaitOp>(converter), chipset(chipset) {}
+
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(LDSBarrierWaitOp op, LDSBarrierWaitOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+
+ // Split barriers are only supported on gfx12+.
+ // For older architectures, convert wait to a noop (signal already did the
+ // full barrier).
+ if (chipset.majorVersion < 12) {
+ rewriter.eraseOp(op);
+ return success();
+ }
+
+ ROCDL::BarrierWaitOp::create(rewriter, loc, -1);
+ auto acqFence =
+ createLDSFence(rewriter, loc, LLVM::AtomicOrdering::acquire);
rewriter.replaceOp(op, acqFence);
return success();
}
@@ -3560,6 +3622,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
RawBufferOpLowering<RawBufferAtomicCmpswapOp,
ROCDL::RawPtrBufferAtomicCmpSwap>,
AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
+ LDSBarrierSignalOpLowering, LDSBarrierWaitOpLowering,
SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
SparseMFMAOpLowering, WMMAOpLowering, ScaledWMMAOpLowering,
ExtPackedFp8OpLowering, ScaledExtPackedMatrixOpLowering,
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index e55bca4bad42f..ed83393a644d7 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -454,6 +454,81 @@ func.func @lds_barrier() {
func.return
}
+// CHECK-LABEL: func @lds_barrier_signal
+func.func @lds_barrier_signal() {
+ // For gfx < 12, signal converts to full lds_barrier.
+ // GFX908: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX908-NEXT: llvm.inline_asm has_side_effects asm_dialect = att
+ // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_barrier"
+ // GFX908-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX90A: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX90A-NEXT: rocdl.s.barrier
+ // GFX90A-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX942: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX942-NEXT: rocdl.s.barrier
+ // GFX942-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX10: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX10-NEXT: rocdl.s.barrier
+ // GFX10-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX11: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX11-NEXT: rocdl.s.barrier
+ // GFX11-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
+ // For gfx >= 12, signal converts to split barrier signal.
+ // GFX12: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX12-NEXT: rocdl.s.barrier.signal id = -1
+ // GFX1250: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX1250-NEXT: rocdl.s.barrier.signal id = -1
+ amdgpu.lds_barrier_signal
+ func.return
+}
+
+// CHECK-LABEL: func @lds_barrier_wait
+func.func @lds_barrier_wait() {
+ // For gfx < 12, wait is erased (noop, since signal already did full barrier).
+ // GFX908-NOT: llvm.fence
+ // GFX908-NOT: rocdl
+ // GFX90A-NOT: llvm.fence
+ // GFX90A-NOT: rocdl
+ // GFX942-NOT: llvm.fence
+ // GFX942-NOT: rocdl
+ // GFX10-NOT: llvm.fence
+ // GFX10-NOT: rocdl
+ // GFX11-NOT: llvm.fence
+ // GFX11-NOT: rocdl
+ // For gfx >= 12, wait converts to split barrier wait.
+ // GFX12: rocdl.s.barrier.wait id = -1
+ // GFX12-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX1250: rocdl.s.barrier.wait id = -1
+ // GFX1250-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
+ amdgpu.lds_barrier_wait
+ func.return
+}
+
+// CHECK-LABEL: func @lds_barrier_split
+func.func @lds_barrier_split() {
+ // Test combined signal + wait pattern.
+ // For gfx < 12: signal becomes full barrier, wait is erased.
+ // GFX908: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX908-NEXT: llvm.inline_asm has_side_effects asm_dialect = att
+ // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_barrier"
+ // GFX908-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX90A: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX90A-NEXT: rocdl.s.barrier
+ // GFX90A-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
+ // For gfx >= 12: both operations preserved as split barriers.
+ // GFX12: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX12-NEXT: rocdl.s.barrier.signal id = -1
+ // GFX12-NEXT: rocdl.s.barrier.wait id = -1
+ // GFX12-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX1250: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
+ // GFX1250-NEXT: rocdl.s.barrier.signal id = -1
+ // GFX1250-NEXT: rocdl.s.barrier.wait id = -1
+ // GFX1250-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
+ amdgpu.lds_barrier_signal
+ amdgpu.lds_barrier_wait
+ func.return
+}
+
// CHECK-LABEL: func @sched_barrier
func.func @sched_barrier() {
// CHECK: rocdl.sched.barrier 0
More information about the Mlir-commits
mailing list