[Mlir-commits] [mlir] [MLIR][XeGPU] Switch to the new sg to wi pass (PR #188627)

Mon Mar 30 19:54:13 PDT 2026

https://github.com/nbpatel updated https://github.com/llvm/llvm-project/pull/188627

>From 231e1de28ef281d53044b6f311f1865365d472f0 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Wed, 25 Mar 2026 17:35:50 +0000
Subject: [PATCH 1/5] Update getDistVecTypeBasedOnLaneLayout

---
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   | 44 +++++++--------
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 56 +++++++++++++++++++
 2 files changed, 77 insertions(+), 23 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index f60635830cc74..7a3cda5e1c24e 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -110,29 +110,27 @@ xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
     return failure();
   assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
          "Expecting a valid layout.");
-  SmallVector<int64_t> effectiveLaneLayout =
-      layout.getEffectiveLaneLayoutAsInt();
-  assert(static_cast<size_t>(originalType.getRank()) >=
-             effectiveLaneLayout.size() &&
-         "Rank of the original vector type should be greater or equal to the "
-         "size of the lane layout to distribute the vector type.");
-  // TODO: replace the implementation with
-  //   auto distributedShape = layout.computeDistributedShape(
-  //       SmallVector<int64_t>(originalType.getShape()));
-  SmallVector<int64_t> distributedShape(originalType.getShape());
-  // Only distribute the last `laneLayout.size()` dimensions. The remaining
-  // dimensions are not distributed.
-  unsigned distributionStart =
-      originalType.getRank() - effectiveLaneLayout.size();
-  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
-    if (i < distributionStart)
-      continue;
-    // Check if the dimension can be distributed evenly.
-    if (dim % effectiveLaneLayout[i - distributionStart] != 0)
-      return failure();
-    distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
-  }
-  return VectorType::get(distributedShape, originalType.getElementType());
+
+  int64_t vectorRank = originalType.getRank();
+  int64_t layoutRank = layout.getRank();
+  assert(vectorRank >= layoutRank && "Vector rank must be >= layout rank.");
+
+  // When the vector has more dimensions than the layout, only the trailing
+  // dimensions are distributed. Leading dimensions are preserved as-is.
+  int64_t offset = vectorRank - layoutRank;
+  ArrayRef<int64_t> fullShape = originalType.getShape();
+  SmallVector<int64_t> trailingShape(fullShape.begin() + offset,
+                                     fullShape.end());
+  auto distributedShapeOrFailure =
+      layout.computeDistributedShape(trailingShape);
+  if (failed(distributedShapeOrFailure))
+    return failure();
+
+  SmallVector<int64_t> resultShape(fullShape.begin(),
+                                   fullShape.begin() + offset);
+  resultShape.append(distributedShapeOrFailure->begin(),
+                     distributedShapeOrFailure->end());
+  return VectorType::get(resultShape, originalType.getElementType());
 }
 
 std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 016b393e3d8bc..a3d9adb6a52ef 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -737,3 +737,59 @@ gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layou
   gpu.return
 }
 }
+
+// -----
+gpu.module @xevm_module {
+// CHECK-LABEL: gpu.func @elementwise_wrap_around_dim
+// CHECK: %[[SRC:.*]] = "some_op"()
+// CHECK: %[[NEG:.*]] = arith.negf %[[SRC]] : vector<16x1xf16>
+// CHECK: gpu.return
+gpu.func @elementwise_wrap_around_dim() {
+  %0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<16x1xf16>
+  %1 = arith.negf %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : vector<16x1xf16>
+  gpu.return
+}
+}
+
+// -----
+gpu.module @xevm_module {
+// CHECK-LABEL: gpu.func @constant_wrap_around_dim
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x1xf16>
+// CHECK: gpu.return
+gpu.func @constant_wrap_around_dim() {
+  %0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    dense<1.0> : vector<16x1xf16>
+  gpu.return
+}
+}
+
+// -----
+gpu.module @xevm_module {
+// CHECK-LABEL: gpu.func @scatter_load_chunksize_leading_dim
+// CHECK-SAME: (%[[ARG0:.*]]: memref<256xf16>)
+// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1x1xindex>
+// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1x1xi1>
+// CHECK: %[[OFF_FLAT:.*]] = vector.shape_cast %[[OFFSET]] : vector<1x1xindex> to vector<1xindex>
+// CHECK: %[[MASK_FLAT:.*]] = vector.shape_cast %[[MASK]] : vector<1x1xi1> to vector<1xi1>
+// CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[OFF_FLAT]]], %[[MASK_FLAT]] <{chunk_size = 8 : i64, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>}>
+// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<8xf16> to vector<1x1x8xf16>
+// CHECK: gpu.return
+gpu.func @scatter_load_chunksize_leading_dim(%input: memref<256xf16>) {
+  %offset = arith.constant
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    dense<12> : vector<1x16xindex>
+  %mask = arith.constant
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    dense<true> : vector<1x16xi1>
+  %0 = xegpu.load %input[%offset], %mask
+    <{chunk_size = 8, l1_hint = #xegpu.cache_hint<cached>,
+      l2_hint = #xegpu.cache_hint<cached>,
+      layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
+    : memref<256xf16>, vector<1x16xindex>, vector<1x16xi1> -> vector<1x16x8xf16>
+  gpu.return
+}
+}
+

>From a839252aea8311a1928ee3fbab943cfa90983e29 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Fri, 27 Mar 2026 19:01:25 +0000
Subject: [PATCH 2/5] Add test case

---
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 41 ++++++-------------
 1 file changed, 12 insertions(+), 29 deletions(-)

diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index a3d9adb6a52ef..adead0284eb8e 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -59,6 +59,18 @@ gpu.func @load_nd_transpose() {
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @load_nd_array_length
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<64xf16>
+// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<64xf16> to vector<2x32x1xf16>
+gpu.func @load_nd_array_length() {
+  %c0 = arith.constant 0 : index
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x32x16xf16>
+  gpu.return
+}
+
 // CHECK-LABEL: gpu.func @store_nd
 // CHECK: %[[C0:.*]] = arith.constant 0 : index
 // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
@@ -764,32 +776,3 @@ gpu.func @constant_wrap_around_dim() {
   gpu.return
 }
 }
-
-// -----
-gpu.module @xevm_module {
-// CHECK-LABEL: gpu.func @scatter_load_chunksize_leading_dim
-// CHECK-SAME: (%[[ARG0:.*]]: memref<256xf16>)
-// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1x1xindex>
-// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1x1xi1>
-// CHECK: %[[OFF_FLAT:.*]] = vector.shape_cast %[[OFFSET]] : vector<1x1xindex> to vector<1xindex>
-// CHECK: %[[MASK_FLAT:.*]] = vector.shape_cast %[[MASK]] : vector<1x1xi1> to vector<1xi1>
-// CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[OFF_FLAT]]], %[[MASK_FLAT]] <{chunk_size = 8 : i64, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>}>
-// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<8xf16> to vector<1x1x8xf16>
-// CHECK: gpu.return
-gpu.func @scatter_load_chunksize_leading_dim(%input: memref<256xf16>) {
-  %offset = arith.constant
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-    dense<12> : vector<1x16xindex>
-  %mask = arith.constant
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-    dense<true> : vector<1x16xi1>
-  %0 = xegpu.load %input[%offset], %mask
-    <{chunk_size = 8, l1_hint = #xegpu.cache_hint<cached>,
-      l2_hint = #xegpu.cache_hint<cached>,
-      layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
-    : memref<256xf16>, vector<1x16xindex>, vector<1x16xi1> -> vector<1x16x8xf16>
-  gpu.return
-}
-}
-

>From c96f1cac5032b5435699376e6a606979d1d2d502 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Mon, 30 Mar 2026 23:11:47 +0000
Subject: [PATCH 3/5] Add recoverTemporaryLayouts

---
 .../Transforms/XeGPUSgToWiDistributeExperimental.cpp      | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 981c250249e5f..bb173c81346a2 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
 #include "mlir/IR/Builders.h"
@@ -1519,9 +1520,14 @@ struct XeGPUSgToWiDistributeExperimentalPass
 
 void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
 
+  Operation *root = getOperation();
+  if (!xegpu::recoverTemporaryLayouts(root)) {
+    signalPassFailure();
+    return;
+  }
+
   // Verify if all XeGPU anchor ops and vector ops have result layouts.
   // TODO: This can be removed once the full layout refactoring is done.
-  Operation *root = getOperation();
   if (failed(verifyLayouts(root))) {
     LLVM_DEBUG(DBGS() << "XeGPUSgToWiDistributeExperimentalPass: layout "
                          "verification failed\n");

>From f77bfa23bb60eededfbfad0e361b9fbb444be1df Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Mon, 30 Mar 2026 23:15:10 +0000
Subject: [PATCH 4/5] Add comment

---
 .../XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp       | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index bb173c81346a2..99e3dbc955199 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -1520,6 +1520,7 @@ struct XeGPUSgToWiDistributeExperimentalPass
 
 void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
 
+  // Recover temporary operand layouts for use in patterns.
   Operation *root = getOperation();
   if (!xegpu::recoverTemporaryLayouts(root)) {
     signalPassFailure();

>From 1f48a4e262ce8e499f91d7a5df60f835ddc5b59b Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Tue, 31 Mar 2026 02:44:13 +0000
Subject: [PATCH 5/5] Switch to the new sg to wi pass

---
 mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index fbb7bb8aeb4bc..f29ffabc1d94d 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -89,7 +89,8 @@ void buildGPUPassPipeline(OpPassManager &pm,
     pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
     pm.addNestedPass<gpu::GPUModuleOp>(
         xegpu::createXeGPUPropagateLayout(laneLayoutOptions));
-    pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUSubgroupDistribute());
+    pm.addNestedPass<gpu::GPUModuleOp>(
+        xegpu::createXeGPUSgToWiDistributeExperimental());
     pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createLoopInvariantCodeMotionPass());