[Mlir-commits] [mlir] [mlir] Update all-reduce (& vector tests) to use workgroup barriers (PR #178285)

Tue Jan 27 12:06:04 PST 2026

https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/178285

This commit updates the lowering of all-reduce operations to annotate the generated barriers with `memfence [#gpu.address_space<workgroup>]` so that these barriers do not force unrelated global memory operations to complete. It similarly sets up the warp synchronization function in the vector distribute tests, since they also only read/write shared memory.

In addition, this commit adds convenience builders for gpu.barrier, which will allow it to either fence on a given address space or on the address space of a provided memref.

>From b6f332c2c38edc7b562942868715a3356ff9aa6b Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Tue, 27 Jan 2026 19:42:47 +0000
Subject: [PATCH] [mlir] Update all-reduce (& vector tests) to use workgroup
 barriers

This commit updates the lowering of all-reduce operations to annotate
the generated barriers with `memfence [#gpu.address_space<workgroup>]`
so that these barriers do not force unrelated global memory operations
to complete. It similarly sets up the warp synchronization function in
the vector distribute tests, since they also only read/write shared
memory.

In addition, this commit adds convenience builders for gpu.barrier,
which will allow it to either fence on a given address space or on the
address space of a provided memref.
---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    |  5 +++-
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp        | 23 ++++++++++++++++++-
 .../GPU/Transforms/AllReduceLowering.cpp      |  4 ++--
 mlir/test/Dialect/GPU/all-reduce-add.mlir     |  4 ++--
 mlir/test/Dialect/GPU/all-reduce-maxf.mlir    |  4 ++--
 .../Vector/vector-warp-distribute.mlir        | 10 ++++----
 .../Dialect/Vector/TestVectorTransforms.cpp   |  2 +-
 7 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 3167388f57e7d..7891cf19ac921 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1472,7 +1472,10 @@ def GPU_BarrierOp : GPU_Op<"barrier">,
   }];
   let assemblyFormat = "(`memfence` $address_spaces^)? attr-dict";
   let hasCanonicalizer = 1;
-  let builders = [OpBuilder<(ins)>];
+  let builders = [OpBuilder<(
+                      ins CArg<"std::optional<::mlir::gpu::AddressSpace>",
+                               "std::nullopt">:$addressSpace)>,
+                  OpBuilder<(ins "Value":$memrefToFence)>];
 }
 
 def GPU_GPUModuleOp : GPU_Op<"module", [
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index ba8403a3a2d05..c029a49f2625f 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1514,7 +1514,28 @@ void BarrierOp::getCanonicalizationPatterns(RewritePatternSet &results,
 }
 
 void BarrierOp::build(mlir::OpBuilder &odsBuilder,
-                      mlir::OperationState &odsState) {}
+                      mlir::OperationState &odsState,
+                      std::optional<AddressSpace> addressSpace) {
+  ArrayAttr addressSpacesAttr;
+  if (addressSpace)
+    addressSpacesAttr = odsBuilder.getArrayAttr(
+        AddressSpaceAttr::get(odsBuilder.getContext(), addressSpace.value()));
+  build(odsBuilder, odsState, addressSpacesAttr);
+}
+
+/// Builds a barrier that causes memory operations affecting `memrefToFence` to
+/// be completed after the barrier is concluded. Currently, this means setting
+/// the fenced address spaces to those of the given memref if it is a gpu
+/// address space.
+void BarrierOp::build(OpBuilder &builder, OperationState &odsState,
+                      Value memrefToFence) {
+  std::optional<AddressSpace> addrSpaceToFence;
+  if (auto memrefType = dyn_cast<BaseMemRefType>(memrefToFence.getType()))
+    if (auto addrSpaceAttr = dyn_cast_if_present<gpu::AddressSpaceAttr>(
+            memrefType.getMemorySpace()))
+      addrSpaceToFence = addrSpaceAttr.getValue();
+  return build(builder, odsState, addrSpaceToFence);
+}
 
 //===----------------------------------------------------------------------===//
 // GPUFuncOp
diff --git a/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp
index 8c449144af3a9..d76c3194f025e 100644
--- a/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp
@@ -113,7 +113,7 @@ struct GpuAllReduceRewriter {
       Value index = create<arith::IndexCastOp>(indexType, subgroupId);
       create<memref::StoreOp>(subgroupReduce, buffer, index);
     });
-    create<gpu::BarrierOp>();
+    create<gpu::BarrierOp>(buffer);
 
     // Compute number of active subgroups.
     Value biasedBlockSize =
@@ -135,7 +135,7 @@ struct GpuAllReduceRewriter {
     });
 
     // Synchronize workgroup and load result from workgroup memory.
-    create<gpu::BarrierOp>();
+    create<gpu::BarrierOp>(buffer);
     Value result = create<memref::LoadOp>(valueType, buffer, zero);
 
     rewriter.replaceOp(reduceOp, result);
diff --git a/mlir/test/Dialect/GPU/all-reduce-add.mlir b/mlir/test/Dialect/GPU/all-reduce-add.mlir
index 2a24e1de3bf3f..bca9ea3f50bd5 100644
--- a/mlir/test/Dialect/GPU/all-reduce-add.mlir
+++ b/mlir/test/Dialect/GPU/all-reduce-add.mlir
@@ -104,7 +104,7 @@ gpu.module @kernels {
     // CHECK: ^bb20:
     // CHECK:   cf.br ^bb21
     // CHECK: ^bb21:
-    // CHECK:   gpu.barrier
+    // CHECK:   gpu.barrier memfence [#gpu.address_space<workgroup>]
     // CHECK:   [[VAL_72:%.*]] = arith.addi [[VAL_28]], [[VAL_2]] : i32
     // CHECK:   [[VAL_73:%.*]] = arith.divsi [[VAL_72]], [[VAL_5]] : i32
     // CHECK:   [[VAL_74:%.*]] = arith.cmpi slt, [[VAL_27]], [[VAL_73]] : i32
@@ -174,7 +174,7 @@ gpu.module @kernels {
     // CHECK: ^bb41:
     // CHECK:   cf.br ^bb42
     // CHECK: ^bb42:
-    // CHECK:   gpu.barrier
+    // CHECK:   gpu.barrier memfence [#gpu.address_space<workgroup>]
     %sum = gpu.all_reduce add %arg0 uniform {} : (f32) -> (f32)
     gpu.return
   }
diff --git a/mlir/test/Dialect/GPU/all-reduce-maxf.mlir b/mlir/test/Dialect/GPU/all-reduce-maxf.mlir
index a7d61fdfbd165..9c32249b2230f 100644
--- a/mlir/test/Dialect/GPU/all-reduce-maxf.mlir
+++ b/mlir/test/Dialect/GPU/all-reduce-maxf.mlir
@@ -104,7 +104,7 @@ gpu.module @kernels {
     // CHECK: ^bb20:
     // CHECK:   cf.br ^bb21
     // CHECK: ^bb21:
-    // CHECK:   gpu.barrier
+    // CHECK:   gpu.barrier memfence [#gpu.address_space<workgroup>]
     // CHECK:   [[VAL_82:%.*]] = arith.addi [[VAL_28]], [[VAL_2]] : i32
     // CHECK:   [[VAL_83:%.*]] = arith.divsi [[VAL_82]], [[VAL_5]] : i32
     // CHECK:   [[VAL_84:%.*]] = arith.cmpi slt, [[VAL_27]], [[VAL_83]] : i32
@@ -174,7 +174,7 @@ gpu.module @kernels {
     // CHECK: ^bb41:
     // CHECK:   cf.br ^bb42
     // CHECK: ^bb42:
-    // CHECK:   gpu.barrier
+    // CHECK:   gpu.barrier memfence [#gpu.address_space<workgroup>]
     %sum = gpu.all_reduce maxnumf %arg0 uniform {} : (f32) -> (f32)
     gpu.return
   }
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index 2d0330043db06..63c9d9b7a9bf8 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -39,7 +39,7 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index,
 //       CHECK-SCF-IF:   %[[s1:.*]] = affine.apply #[[$TIMES8]]()[%[[laneid]]]
 //       CHECK-SCF-IF:   vector.transfer_write %[[v1]], %[[buffer_v1]][%[[s1]]]
 
-//   CHECK-SCF-IF-DAG:   gpu.barrier
+//   CHECK-SCF-IF-DAG:   gpu.barrier memfence [#gpu.address_space<workgroup>]
 //   CHECK-SCF-IF-DAG:   %[[buffer_def_0:.*]] = memref.get_global @__shared_32xf32
 //   CHECK-SCF-IF-DAG:   %[[buffer_def_1:.*]] = memref.get_global @__shared_64xf32
 
@@ -58,7 +58,7 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index,
     gpu.yield %2, %3 : vector<32xf32>, vector<64xf32>
   }
 //       CHECK-SCF-IF:   }
-//       CHECK-SCF-IF:   gpu.barrier
+//       CHECK-SCF-IF:   gpu.barrier memfence [#gpu.address_space<workgroup>]
 //       CHECK-SCF-IF:   %[[o1:.*]] = affine.apply #[[$TIMES2]]()[%[[laneid]]]
 //       CHECK-SCF-IF:   %[[r1:.*]] = vector.transfer_read %[[buffer_def_1]][%[[o1]]], %{{.*}} {in_bounds = [true]} : memref<64xf32, 3>, vector<2xf32>
 //       CHECK-SCF-IF:   %[[r0:.*]] = vector.transfer_read %[[buffer_def_0]][%[[laneid]]], %{{.*}} {in_bounds = [true]} : memref<32xf32, 3>, vector<1xf32>
@@ -1230,7 +1230,7 @@ func.func @warp_execute_has_broadcast_semantics(%laneid: index, %s0: f32, %v0: v
       gpu.yield %rs0, %rv0, %rv1, %rv2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
   }
 
-  // CHECK-SCF-IF: gpu.barrier
+  // CHECK-SCF-IF: gpu.barrier memfence [#gpu.address_space<workgroup>]
   // CHECK-SCF-IF: %[[RV2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32>
   // CHECK-SCF-IF: %[[RV1:.*]] = vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32>
   // CHECK-SCF-IF: %[[RV0:.*]] = vector.transfer_read {{.*}}[]{{.*}} : memref<f32, 3>, vector<f32>
@@ -1252,7 +1252,7 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %
   // CHECK-SCF-IF:  vector.transfer_write %{{.*}}, %{{.*}}[%[[LANEID]], %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x1xf32>, memref<32x64x1xf32, 3>
   // CHECK-SCF-IF:  %[[RID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
   // CHECK-SCF-IF:  vector.transfer_write %{{.*}}, %{{.*}}[%[[C0]], %[[RID]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x2x128xf32>, memref<1x64x128xf32, 3>
-  // CHECK-SCF-IF:  gpu.barrier
+  // CHECK-SCF-IF:  gpu.barrier memfence [#gpu.address_space<workgroup>]
 
   // CHECK-SCF-IF: scf.if{{.*}}{
   %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32]
@@ -1273,7 +1273,7 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %
       gpu.yield %r0, %r1 : vector<32x64x1xf32>, vector<1x64x128xf32>
   }
 
-  //     CHECK-SCF-IF: gpu.barrier
+  //     CHECK-SCF-IF: gpu.barrier memfence [#gpu.address_space<workgroup>]
   //     CHECK-SCF-IF: %[[WID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
   // CHECK-SCF-IF-DAG: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[LANEID]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<1x64x1xf32>
   // CHECK-SCF-IF-DAG: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[WID]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x2x128xf32>
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index db941454f8d8c..3317ae8d11b0d 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -684,7 +684,7 @@ struct TestVectorDistribution
     options.warpAllocationFn = allocateGlobalSharedMemory;
     options.warpSyncronizationFn = [](Location loc, OpBuilder &builder,
                                       gpu::WarpExecuteOnLane0Op warpOp) {
-      gpu::BarrierOp::create(builder, loc);
+      gpu::BarrierOp::create(builder, loc, gpu::AddressSpace::Workgroup);
     };
     // Test on one pattern in isolation.
     if (warpOpToSCF) {