[Mlir-commits] [mlir] ac62f12 - [mlir][amdgpu] Remove redundant barriers (#175436)

Mon Jan 12 03:48:02 PST 2026

Author: Ivan Butygin
Date: 2026-01-12T14:47:58+03:00
New Revision: ac62f12192c2f516b9aa3131d5192152b7f02b3e

URL: https://github.com/llvm/llvm-project/commit/ac62f12192c2f516b9aa3131d5192152b7f02b3e
DIFF: https://github.com/llvm/llvm-project/commit/ac62f12192c2f516b9aa3131d5192152b7f02b3e.diff

LOG: [mlir][amdgpu] Remove redundant barriers (#175436)

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
    mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
    mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
    mlir/test/Dialect/AMDGPU/canonicalize.mlir

Removed: 
    


################################################################################
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 7a8cd89f886a7..bf0711cc27922 100644

--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -41,7 +41,7 @@ def AMDGPU_Dialect : Dialect {
     have chipset-specific differences that can be abstracted over in a useful way.
 
     To give some concrete examples:
- 
+
     - `amdgpu.mfma` and `amdgpu.wmma` exist in order to make a large set of
       intrinsics more compatible with the MLIR type system (such as by allowing
       8-bit float vectors to be passed as `vector<N x f8E4M3FN>` or
@@ -105,7 +105,7 @@ def AMDGPU_Dialect : Dialect {
     what it does and have found the keywords they'll need for more detail.
 
     Operation documentation should include usage examples.
-    
+
     Note that this dialect uses LLVM's gfx numbers to refer to individual
     architectures/chipsets and not product names or codenames.
   }];
@@ -974,6 +974,7 @@ def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
     breakpoints set on variables) when debugging.
   }];
   let assemblyFormat = "attr-dict";
+  let hasCanonicalizer = 1;
 }
 
 def AMDGPU_SchedBarrierOpOpt : I32BitEnumAttr<"sched_barrier_opt_enum",
@@ -1285,7 +1286,7 @@ def AMDGPU_SparseMFMAOp :
     `vector<2xi16>` (two 16-bit indices).
 
     The `cbsz` and `abid` parameters are repurposed to select the index set.
-    If `cbsz == 0`, then `abid[1:0]` selects which index set to use. 
+    If `cbsz == 0`, then `abid[1:0]` selects which index set to use.
     If `cbsz != 0`, then the very first is selected.
 
     Example:
@@ -1473,7 +1474,7 @@ def AMDGPU_ScaledWMMAOp
     number of scales required for each matrix is determined by:
       num_scales_A = (M × K) / block_size
       num_scales_B = (N × K) / block_size
-      
+
     The index attributes (`a_first_scale_lane`, `b_first_scale_lane`) select
     which lane to start reading scale values from (0 or 16):
     - For block size 32, 32 lanes across a single wave are used for the scale

diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 787248f9f339e..dd741d56d39d0 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -773,6 +773,21 @@ LogicalResult PermlaneSwapOp::verify() {
   return success();
 }
 
+/// Remove amdgpu.lds_barrier after amdgpu.lds_barrier.
+static LogicalResult eraseRedundantLDSBarrierOps(LDSBarrierOp op,
+                                                 PatternRewriter &rewriter) {
+  if (isa_and_nonnull<LDSBarrierOp>(op->getNextNode())) {
+    rewriter.eraseOp(op);
+    return success();
+  }
+  return failure();
+}
+
+void LDSBarrierOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                               MLIRContext *context) {
+  results.add(eraseRedundantLDSBarrierOps);
+}
+
 //===----------------------------------------------------------------------===//
 // MemoryCounterWaitOp
 //===----------------------------------------------------------------------===//

diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 36db6e82baaea..552485e748123 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1468,11 +1468,9 @@ LogicalResult RotateOp::verify() {
 // BarrierOp
 //===----------------------------------------------------------------------===//
 
-namespace {
-
 /// Remove gpu.barrier after gpu.barrier, the threads are already synchronized!
-LogicalResult eraseRedundantGpuBarrierOps(BarrierOp op,
-                                          PatternRewriter &rewriter) {
+static LogicalResult eraseRedundantGpuBarrierOps(BarrierOp op,
+                                                 PatternRewriter &rewriter) {
   if (isa_and_nonnull<BarrierOp>(op->getNextNode())) {
     rewriter.eraseOp(op);
     return success();
@@ -1480,8 +1478,6 @@ LogicalResult eraseRedundantGpuBarrierOps(BarrierOp op,
   return failure();
 }
 
-} // end anonymous namespace
-
 void BarrierOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                             MLIRContext *context) {
   results.add(eraseRedundantGpuBarrierOps);

diff --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
index cff1d3f2ac1fd..052fabfb300bb 100644
--- a/mlir/test/Dialect/AMDGPU/canonicalize.mlir
+++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
@@ -280,3 +280,15 @@ func.func @fuse_memory_counter_wait_not_adjacent() {
   amdgpu.memory_counter_wait load(4) store(3) ds(2) exp(1)
   return
 }
+
+// -----
+
+// Erase duplicate barriers.
+// CHECK-LABEL: func @erase_barriers
+//       CHECK-NEXT: amdgpu.lds_barrier
+//       CHECK-NEXT: return
+func.func @erase_barriers() {
+  amdgpu.lds_barrier
+  amdgpu.lds_barrier
+  return
+}