[Mlir-commits] [mlir] [mlir][xegpu] Add support for distributing `gpu.barrier` (PR #145434)
Charitha Saumya
llvmlistbot at llvm.org
Tue Jun 24 09:28:11 PDT 2025
https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/145434
>From e014914470a2b932504f21c711a901b60bcb1270 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 23 Jun 2025 23:34:36 +0000
Subject: [PATCH 1/3] fix
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 30 +++++++++++++++++++
.../Dialect/XeGPU/subgroup-distribute.mlir | 19 ++++++++++++
2 files changed, 49 insertions(+)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index dabcae0bfe4b1..fd19a234dc083 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -455,6 +455,14 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
if (!operand)
return rewriter.notifyMatchFailure(
subgroupOp, "warp result is not a xegpu::LoadNd op");
+    // Make sure the load op is the last operation in the warp op body. This
+    // ensures that the load op is not sunk out of the warp op ahead of any
+    // barriers that follow it, which would break barrier synchronization.
+ auto yield = cast<gpu::YieldOp>(
+ subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
+ Operation *lastNode = yield->getPrevNode();
+ if (!dyn_cast_or_null<xegpu::LoadNdOp>(lastNode))
+ return failure();
auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
@@ -782,6 +790,27 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
}
};
+struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override {
+ auto yield = cast<gpu::YieldOp>(
+ subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
+ Operation *lastNode = yield->getPrevNode();
+ // The last node must be a gpu::BarrierOp.
+ auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
+ if (!barrierOp)
+ return failure();
+ // Simply move the barrier op outside of the warp op.
+ rewriter.setInsertionPointAfter(subgroupOp);
+ rewriter.create<gpu::BarrierOp>(
+ barrierOp.getLoc(), barrierOp->getResultTypes(),
+ barrierOp->getOperands(), barrierOp->getAttrs());
+ rewriter.eraseOp(barrierOp);
+ return success();
+ }
+};
+
} // namespace
namespace {
@@ -797,6 +826,7 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
patterns.add<CreateNdDescDistribution, StoreNdDistribution,
LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
UpdateNdOffsetDistribution>(patterns.getContext());
+ patterns.add<GpuBarrierDistribution>(patterns.getContext(), 10);
}
void XeGPUSubgroupDistributePass::runOnOperation() {
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index a59633b0cbd9a..3d91b2269bc4b 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -278,3 +278,22 @@ gpu.module @test {
gpu.return
}
}
+
+// -----
+// CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
+// CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
+// CHECK-NEXT: gpu.barrier
+// CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
+// CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf16>, !xegpu.tensor_desc<16xf16>
+gpu.module @test {
+ gpu.func @gpu_barrier(%arg0: memref<256xf16>, %arg1: memref<256xf16>) {
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16>
+ gpu.barrier
+ %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ xegpu.store_nd %1, %2 : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ gpu.return
+ }
+}
>From 18e51112f5553b13f436c1c2120e1dbf0822163c Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 23 Jun 2025 23:37:30 +0000
Subject: [PATCH 2/3] fix
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index fd19a234dc083..9501249cff0d0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -825,8 +825,8 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
RewritePatternSet &patterns) {
patterns.add<CreateNdDescDistribution, StoreNdDistribution,
LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
- UpdateNdOffsetDistribution>(patterns.getContext());
- patterns.add<GpuBarrierDistribution>(patterns.getContext(), 10);
+ UpdateNdOffsetDistribution, GpuBarrierDistribution>(
+ patterns.getContext());
}
void XeGPUSubgroupDistributePass::runOnOperation() {
>From f9d0e117b5077c319a6f919edab8b36cb5236dbf Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 24 Jun 2025 16:02:08 +0000
Subject: [PATCH 3/3] add comment
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 9501249cff0d0..42381ea2683e3 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -790,6 +790,8 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
}
};
+/// Sink a gpu::BarrierOp at the end of the enclosing
+/// `gpu.warp_execute_on_lane_0` region by moving it outside of the warp op.
struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
@@ -801,7 +803,7 @@ struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
if (!barrierOp)
return failure();
- // Simply move the barrier op outside of the warp op.
+ // Move the barrier op outside of the warp op.
rewriter.setInsertionPointAfter(subgroupOp);
rewriter.create<gpu::BarrierOp>(
barrierOp.getLoc(), barrierOp->getResultTypes(),
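Starting from the intermediate IR sketched under the first patch, the rewrite recreates the barrier right after the warp op and erases the one inside. Again an illustrative sketch only, with layouts elided and placeholder names:

  // After GpuBarrierDistribution: the barrier trails the warp op, and the
  // load is now the last op before the yield, so LoadNdDistribution can
  // distribute it in a later pattern application.
  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf16>) {
    %ld = xegpu.load_nd %tdesc : !xegpu.tensor_desc<16xf16> -> vector<16xf16>
    gpu.yield %ld : vector<16xf16>
  }
  gpu.barrier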