[Mlir-commits] [mlir] [mlir][amdgpu] Remove redundant barriers (PR #175436)
Ivan Butygin
llvmlistbot at llvm.org
Sun Jan 11 06:20:47 PST 2026
https://github.com/Hardcode84 updated https://github.com/llvm/llvm-project/pull/175436
From a6dd1c9bf9d587bf3275b3479e7b9e41ff2a38da Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Sun, 11 Jan 2026 15:08:33 +0100
Subject: [PATCH 1/2] [mlir][amdgpu] Remove redundant barriers
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 9 +++++----
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 19 +++++++++++++++++++
mlir/test/Dialect/AMDGPU/canonicalize.mlir | 12 ++++++++++++
3 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 7a8cd89f886a7..bf0711cc27922 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -41,7 +41,7 @@ def AMDGPU_Dialect : Dialect {
have chipset-specific differences that can be abstracted over in a useful way.
To give some concrete examples:
-
+
- `amdgpu.mfma` and `amdgpu.wmma` exist in order to make a large set of
intrinsics more compatible with the MLIR type system (such as by allowing
8-bit float vectors to be passed as `vector<N x f8E4M3FN>` or
@@ -105,7 +105,7 @@ def AMDGPU_Dialect : Dialect {
what it does and have found the keywords they'll need for more detail.
Operation documentation should include usage examples.
-
+
Note that this dialect uses LLVM's gfx numbers to refer to individual
architectures/chipsets and not product names or codenames.
}];
@@ -974,6 +974,7 @@ def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
breakpoints set on variables) when debugging.
}];
let assemblyFormat = "attr-dict";
+ let hasCanonicalizer = 1;
}
def AMDGPU_SchedBarrierOpOpt : I32BitEnumAttr<"sched_barrier_opt_enum",
@@ -1285,7 +1286,7 @@ def AMDGPU_SparseMFMAOp :
`vector<2xi16>` (two 16-bit indices).
The `cbsz` and `abid` parameters are repurposed to select the index set.
- If `cbsz == 0`, then `abid[1:0]` selects which index set to use.
+ If `cbsz == 0`, then `abid[1:0]` selects which index set to use.
If `cbsz != 0`, then the very first is selected.
Example:
@@ -1473,7 +1474,7 @@ def AMDGPU_ScaledWMMAOp
number of scales required for each matrix is determined by:
num_scales_A = (M × K) / block_size
num_scales_B = (N × K) / block_size
-
+
The index attributes (`a_first_scale_lane`, `b_first_scale_lane`) select
which lane to start reading scale values from (0 or 16):
- For block size 32, 32 lanes across a single wave are used for the scale
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 787248f9f339e..feeedf2ddb84e 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -773,6 +773,25 @@ LogicalResult PermlaneSwapOp::verify() {
return success();
}
+namespace {
+
+/// Erase an amdgpu.lds_barrier that is immediately followed by another one.
+LogicalResult eraseRedundantLDSBarrierOps(LDSBarrierOp op,
+ PatternRewriter &rewriter) {
+ if (isa_and_nonnull<LDSBarrierOp>(op->getNextNode())) {
+ rewriter.eraseOp(op);
+ return success();
+ }
+ return failure();
+}
+
+} // namespace
+
+void LDSBarrierOp::getCanonicalizationPatterns(RewritePatternSet &results,
+ MLIRContext *context) {
+ results.add(eraseRedundantLDSBarrierOps);
+}
+
//===----------------------------------------------------------------------===//
// MemoryCounterWaitOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
index cff1d3f2ac1fd..052fabfb300bb 100644
--- a/mlir/test/Dialect/AMDGPU/canonicalize.mlir
+++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
@@ -280,3 +280,15 @@ func.func @fuse_memory_counter_wait_not_adjacent() {
amdgpu.memory_counter_wait load(4) store(3) ds(2) exp(1)
return
}
+
+// -----
+
+// Erase duplicate barriers.
+// CHECK-LABEL: func @erase_barriers
+// CHECK-NEXT: amdgpu.lds_barrier
+// CHECK-NEXT: return
+func.func @erase_barriers() {
+ amdgpu.lds_barrier
+ amdgpu.lds_barrier
+ return
+}
From 723ca46cd2937be11ce412c4bb83e3711d045d12 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Sun, 11 Jan 2026 15:13:32 +0100
Subject: [PATCH 2/2] static
---
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 36db6e82baaea..552485e748123 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1468,11 +1468,9 @@ LogicalResult RotateOp::verify() {
// BarrierOp
//===----------------------------------------------------------------------===//
-namespace {
-
/// Remove gpu.barrier after gpu.barrier, the threads are already synchronized!
-LogicalResult eraseRedundantGpuBarrierOps(BarrierOp op,
- PatternRewriter &rewriter) {
+static LogicalResult eraseRedundantGpuBarrierOps(BarrierOp op,
+ PatternRewriter &rewriter) {
if (isa_and_nonnull<BarrierOp>(op->getNextNode())) {
rewriter.eraseOp(op);
return success();
@@ -1480,8 +1478,6 @@ LogicalResult eraseRedundantGpuBarrierOps(BarrierOp op,
return failure();
}
-} // end anonymous namespace
-
void BarrierOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
results.add(eraseRedundantGpuBarrierOps);
More information about the Mlir-commits
mailing list