[Mlir-commits] [mlir] [mlir][xegpu] Add support for distributing `gpu.barrier` (PR #145434)

Charitha Saumya llvmlistbot at llvm.org
Tue Jun 24 09:28:11 PDT 2025


https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/145434

From e014914470a2b932504f21c711a901b60bcb1270 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 23 Jun 2025 23:34:36 +0000
Subject: [PATCH 1/3] fix

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 30 +++++++++++++++++++
 .../Dialect/XeGPU/subgroup-distribute.mlir    | 19 ++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index dabcae0bfe4b1..fd19a234dc083 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -455,6 +455,14 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
     if (!operand)
       return rewriter.notifyMatchFailure(
           subgroupOp, "warp result is not a xegpu::LoadNd op");
+    // Make sure the load op is the last operation in the warp op body. This
+    // ensures that the load op is not sunk earlier, violating any barrier
+    // synchronizations.
+    auto yield = cast<gpu::YieldOp>(
+        subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
+    Operation *lastNode = yield->getPrevNode();
+    if (!dyn_cast_or_null<xegpu::LoadNdOp>(lastNode))
+      return failure();
 
     auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
     xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
@@ -782,6 +790,27 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
   }
 };
 
+struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override {
+    auto yield = cast<gpu::YieldOp>(
+        subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
+    Operation *lastNode = yield->getPrevNode();
+    // The last node must be a gpu::BarrierOp.
+    auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
+    if (!barrierOp)
+      return failure();
+    // Simply move the barrier op outside of the warp op.
+    rewriter.setInsertionPointAfter(subgroupOp);
+    rewriter.create<gpu::BarrierOp>(
+        barrierOp.getLoc(), barrierOp->getResultTypes(),
+        barrierOp->getOperands(), barrierOp->getAttrs());
+    rewriter.eraseOp(barrierOp);
+    return success();
+  }
+};
+
 } // namespace
 
 namespace {
@@ -797,6 +826,7 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
   patterns.add<CreateNdDescDistribution, StoreNdDistribution,
                LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
                UpdateNdOffsetDistribution>(patterns.getContext());
+  patterns.add<GpuBarrierDistribution>(patterns.getContext(), 10);
 }
 
 void XeGPUSubgroupDistributePass::runOnOperation() {
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index a59633b0cbd9a..3d91b2269bc4b 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -278,3 +278,22 @@ gpu.module @test {
     gpu.return
   }
 }
+
+// -----
+// CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
+// CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
+// CHECK-NEXT: gpu.barrier
+// CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
+// CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf16>, !xegpu.tensor_desc<16xf16>
+gpu.module @test {
+  gpu.func @gpu_barrier(%arg0: memref<256xf16>, %arg1: memref<256xf16>) {
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    %1 = xegpu.load_nd %0  {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16>
+    gpu.barrier
+    %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    xegpu.store_nd %1, %2 : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    gpu.return
+  }
+}

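For context, here is a rough, hypothetical sketch of the intermediate IR these patterns operate on once the pass has wrapped the kernel body in a gpu.warp_execute_on_lane_0 region. The names (%laneid, %src, %c0) and shapes are illustrative only and are not taken from the patch:

  // Illustrative "before" state: the barrier is the last op ahead of the
  // gpu.yield terminator, so GpuBarrierDistribution matches, while
  // LoadNdDistribution's new guard rejects the load for now.
  %0 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf16>) {
    %t = xegpu.create_nd_tdesc %src[%c0] : memref<256xf16>
      -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
    %v = xegpu.load_nd %t : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16>
    gpu.barrier
    gpu.yield %v : vector<16xf16>
  }

  // Illustrative "after" state: the barrier has been recreated after the warp
  // op and erased from the region; the load is now the last op before the
  // yield, so LoadNdDistribution can distribute it without sinking it past
  // the barrier.
  %0 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf16>) {
    %t = xegpu.create_nd_tdesc %src[%c0] : memref<256xf16>
      -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
    %v = xegpu.load_nd %t : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16>
    gpu.yield %v : vector<16xf16>
  }
  gpu.barrier

This is only a sketch of the expected interplay between the two patterns; the authoritative before/after IR is the subgroup-distribute.mlir test above.
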
From 18e51112f5553b13f436c1c2120e1dbf0822163c Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 23 Jun 2025 23:37:30 +0000
Subject: [PATCH 2/3] fix

---
 mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index fd19a234dc083..9501249cff0d0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -825,8 +825,8 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
     RewritePatternSet &patterns) {
   patterns.add<CreateNdDescDistribution, StoreNdDistribution,
                LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
-               UpdateNdOffsetDistribution>(patterns.getContext());
-  patterns.add<GpuBarrierDistribution>(patterns.getContext(), 10);
+               UpdateNdOffsetDistribution, GpuBarrierDistribution>(
+      patterns.getContext());
 }
 
 void XeGPUSubgroupDistributePass::runOnOperation() {

From f9d0e117b5077c319a6f919edab8b36cb5236dbf Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 24 Jun 2025 16:02:08 +0000
Subject: [PATCH 3/3] add comment

---
 mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 9501249cff0d0..42381ea2683e3 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -790,6 +790,8 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
   }
 };
 
+/// Sink a gpu::BarrierOp at the end of the enclosing `gpu.warp_execute_on_lane_0`
+/// region. This simply moves the barrier op outside of the warp op.
 struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
@@ -801,7 +803,7 @@ struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
     auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
     if (!barrierOp)
       return failure();
-    // Simply move the barrier op outside of the warp op.
+    // Move the barrier op outside of the warp op.
     rewriter.setInsertionPointAfter(subgroupOp);
     rewriter.create<gpu::BarrierOp>(
         barrierOp.getLoc(), barrierOp->getResultTypes(),


