[Mlir-commits] [mlir] [mlir] Update all-reduce (& vector tests) to use workgroup barriers (PR #178285)
Krzysztof Drewniak
llvmlistbot at llvm.org
Tue Jan 27 12:06:04 PST 2026
https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/178285
This commit updates the lowering of all-reduce operations to annotate the generated barriers with `memfence [#gpu.address_space<workgroup>]` so that these barriers do not force unrelated global memory operations to complete. It similarly sets up the warp synchronization function in the vector distribute tests, since they also only read/write shared memory.
In addition, this commit adds convenience builders for gpu.barrier, which will allow it to either fence on a given address space or on the address space of a provided memref.
>From b6f332c2c38edc7b562942868715a3356ff9aa6b Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Tue, 27 Jan 2026 19:42:47 +0000
Subject: [PATCH] [mlir] Update all-reduce (& vector tests) to use workgroup
barriers
This commit updates the lowering of all-reduce operations to annotate
the generated barriers with `memfence [#gpu.address_space<workgroup>]`
so that these barriers do not force unrelated global memory operations
to complete. It similarly sets up the warp synchronization function in
the vector distribute tests, since they also only read/write shared
memory.
In addition, this commit adds convenience builders for gpu.barrier,
which will allow it to either fence on a given address space or on the
address space of a provided memref.
---
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 5 +++-
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 23 ++++++++++++++++++-
.../GPU/Transforms/AllReduceLowering.cpp | 4 ++--
mlir/test/Dialect/GPU/all-reduce-add.mlir | 4 ++--
mlir/test/Dialect/GPU/all-reduce-maxf.mlir | 4 ++--
.../Vector/vector-warp-distribute.mlir | 10 ++++----
.../Dialect/Vector/TestVectorTransforms.cpp | 2 +-
7 files changed, 38 insertions(+), 14 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 3167388f57e7d..7891cf19ac921 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1472,7 +1472,10 @@ def GPU_BarrierOp : GPU_Op<"barrier">,
}];
let assemblyFormat = "(`memfence` $address_spaces^)? attr-dict";
let hasCanonicalizer = 1;
- let builders = [OpBuilder<(ins)>];
+ let builders = [OpBuilder<(
+ ins CArg<"std::optional<::mlir::gpu::AddressSpace>",
+ "std::nullopt">:$addressSpace)>,
+ OpBuilder<(ins "Value":$memrefToFence)>];
}
def GPU_GPUModuleOp : GPU_Op<"module", [
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index ba8403a3a2d05..c029a49f2625f 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1514,7 +1514,28 @@ void BarrierOp::getCanonicalizationPatterns(RewritePatternSet &results,
}
void BarrierOp::build(mlir::OpBuilder &odsBuilder,
- mlir::OperationState &odsState) {}
+ mlir::OperationState &odsState,
+ std::optional<AddressSpace> addressSpace) {
+ ArrayAttr addressSpacesAttr;
+ if (addressSpace)
+ addressSpacesAttr = odsBuilder.getArrayAttr(
+ AddressSpaceAttr::get(odsBuilder.getContext(), addressSpace.value()));
+ build(odsBuilder, odsState, addressSpacesAttr);
+}
+
+/// Builds a barrier that causes memory operations affecting `memrefToFence` to
+/// be completed after the barrier is concluded. Currently, this means setting
+/// the fenced address spaces to those of the given memref if it is a gpu
+/// address space.
+void BarrierOp::build(OpBuilder &builder, OperationState &odsState,
+ Value memrefToFence) {
+ std::optional<AddressSpace> addrSpaceToFence;
+ if (auto memrefType = dyn_cast<BaseMemRefType>(memrefToFence.getType()))
+ if (auto addrSpaceAttr = dyn_cast_if_present<gpu::AddressSpaceAttr>(
+ memrefType.getMemorySpace()))
+ addrSpaceToFence = addrSpaceAttr.getValue();
+ return build(builder, odsState, addrSpaceToFence);
+}
//===----------------------------------------------------------------------===//
// GPUFuncOp
diff --git a/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp
index 8c449144af3a9..d76c3194f025e 100644
--- a/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp
@@ -113,7 +113,7 @@ struct GpuAllReduceRewriter {
Value index = create<arith::IndexCastOp>(indexType, subgroupId);
create<memref::StoreOp>(subgroupReduce, buffer, index);
});
- create<gpu::BarrierOp>();
+ create<gpu::BarrierOp>(buffer);
// Compute number of active subgroups.
Value biasedBlockSize =
@@ -135,7 +135,7 @@ struct GpuAllReduceRewriter {
});
// Synchronize workgroup and load result from workgroup memory.
- create<gpu::BarrierOp>();
+ create<gpu::BarrierOp>(buffer);
Value result = create<memref::LoadOp>(valueType, buffer, zero);
rewriter.replaceOp(reduceOp, result);
diff --git a/mlir/test/Dialect/GPU/all-reduce-add.mlir b/mlir/test/Dialect/GPU/all-reduce-add.mlir
index 2a24e1de3bf3f..bca9ea3f50bd5 100644
--- a/mlir/test/Dialect/GPU/all-reduce-add.mlir
+++ b/mlir/test/Dialect/GPU/all-reduce-add.mlir
@@ -104,7 +104,7 @@ gpu.module @kernels {
// CHECK: ^bb20:
// CHECK: cf.br ^bb21
// CHECK: ^bb21:
- // CHECK: gpu.barrier
+ // CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK: [[VAL_72:%.*]] = arith.addi [[VAL_28]], [[VAL_2]] : i32
// CHECK: [[VAL_73:%.*]] = arith.divsi [[VAL_72]], [[VAL_5]] : i32
// CHECK: [[VAL_74:%.*]] = arith.cmpi slt, [[VAL_27]], [[VAL_73]] : i32
@@ -174,7 +174,7 @@ gpu.module @kernels {
// CHECK: ^bb41:
// CHECK: cf.br ^bb42
// CHECK: ^bb42:
- // CHECK: gpu.barrier
+ // CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
%sum = gpu.all_reduce add %arg0 uniform {} : (f32) -> (f32)
gpu.return
}
diff --git a/mlir/test/Dialect/GPU/all-reduce-maxf.mlir b/mlir/test/Dialect/GPU/all-reduce-maxf.mlir
index a7d61fdfbd165..9c32249b2230f 100644
--- a/mlir/test/Dialect/GPU/all-reduce-maxf.mlir
+++ b/mlir/test/Dialect/GPU/all-reduce-maxf.mlir
@@ -104,7 +104,7 @@ gpu.module @kernels {
// CHECK: ^bb20:
// CHECK: cf.br ^bb21
// CHECK: ^bb21:
- // CHECK: gpu.barrier
+ // CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK: [[VAL_82:%.*]] = arith.addi [[VAL_28]], [[VAL_2]] : i32
// CHECK: [[VAL_83:%.*]] = arith.divsi [[VAL_82]], [[VAL_5]] : i32
// CHECK: [[VAL_84:%.*]] = arith.cmpi slt, [[VAL_27]], [[VAL_83]] : i32
@@ -174,7 +174,7 @@ gpu.module @kernels {
// CHECK: ^bb41:
// CHECK: cf.br ^bb42
// CHECK: ^bb42:
- // CHECK: gpu.barrier
+ // CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
%sum = gpu.all_reduce maxnumf %arg0 uniform {} : (f32) -> (f32)
gpu.return
}
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index 2d0330043db06..63c9d9b7a9bf8 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -39,7 +39,7 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index,
// CHECK-SCF-IF: %[[s1:.*]] = affine.apply #[[$TIMES8]]()[%[[laneid]]]
// CHECK-SCF-IF: vector.transfer_write %[[v1]], %[[buffer_v1]][%[[s1]]]
-// CHECK-SCF-IF-DAG: gpu.barrier
+// CHECK-SCF-IF-DAG: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK-SCF-IF-DAG: %[[buffer_def_0:.*]] = memref.get_global @__shared_32xf32
// CHECK-SCF-IF-DAG: %[[buffer_def_1:.*]] = memref.get_global @__shared_64xf32
@@ -58,7 +58,7 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index,
gpu.yield %2, %3 : vector<32xf32>, vector<64xf32>
}
// CHECK-SCF-IF: }
-// CHECK-SCF-IF: gpu.barrier
+// CHECK-SCF-IF: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK-SCF-IF: %[[o1:.*]] = affine.apply #[[$TIMES2]]()[%[[laneid]]]
// CHECK-SCF-IF: %[[r1:.*]] = vector.transfer_read %[[buffer_def_1]][%[[o1]]], %{{.*}} {in_bounds = [true]} : memref<64xf32, 3>, vector<2xf32>
// CHECK-SCF-IF: %[[r0:.*]] = vector.transfer_read %[[buffer_def_0]][%[[laneid]]], %{{.*}} {in_bounds = [true]} : memref<32xf32, 3>, vector<1xf32>
@@ -1230,7 +1230,7 @@ func.func @warp_execute_has_broadcast_semantics(%laneid: index, %s0: f32, %v0: v
gpu.yield %rs0, %rv0, %rv1, %rv2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
}
- // CHECK-SCF-IF: gpu.barrier
+ // CHECK-SCF-IF: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK-SCF-IF: %[[RV2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32>
// CHECK-SCF-IF: %[[RV1:.*]] = vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32>
// CHECK-SCF-IF: %[[RV0:.*]] = vector.transfer_read {{.*}}[]{{.*}} : memref<f32, 3>, vector<f32>
@@ -1252,7 +1252,7 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %
// CHECK-SCF-IF: vector.transfer_write %{{.*}}, %{{.*}}[%[[LANEID]], %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x1xf32>, memref<32x64x1xf32, 3>
// CHECK-SCF-IF: %[[RID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
// CHECK-SCF-IF: vector.transfer_write %{{.*}}, %{{.*}}[%[[C0]], %[[RID]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x2x128xf32>, memref<1x64x128xf32, 3>
- // CHECK-SCF-IF: gpu.barrier
+ // CHECK-SCF-IF: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK-SCF-IF: scf.if{{.*}}{
%r:2 = gpu.warp_execute_on_lane_0(%laneid)[32]
@@ -1273,7 +1273,7 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %
gpu.yield %r0, %r1 : vector<32x64x1xf32>, vector<1x64x128xf32>
}
- // CHECK-SCF-IF: gpu.barrier
+ // CHECK-SCF-IF: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK-SCF-IF: %[[WID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
// CHECK-SCF-IF-DAG: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[LANEID]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<1x64x1xf32>
// CHECK-SCF-IF-DAG: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[WID]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x2x128xf32>
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index db941454f8d8c..3317ae8d11b0d 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -684,7 +684,7 @@ struct TestVectorDistribution
options.warpAllocationFn = allocateGlobalSharedMemory;
options.warpSyncronizationFn = [](Location loc, OpBuilder &builder,
gpu::WarpExecuteOnLane0Op warpOp) {
- gpu::BarrierOp::create(builder, loc);
+ gpu::BarrierOp::create(builder, loc, gpu::AddressSpace::Workgroup);
};
// Test on one pattern in isolation.
if (warpOpToSCF) {
More information about the Mlir-commits
mailing list