[Mlir-commits] [mlir] [MLIR][XeGPU] Switch to the new sg to wi pass (PR #188627)
Nishant Patel
llvmlistbot at llvm.org
Tue Mar 31 08:57:00 PDT 2026
https://github.com/nbpatel updated https://github.com/llvm/llvm-project/pull/188627
>From 231e1de28ef281d53044b6f311f1865365d472f0 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Wed, 25 Mar 2026 17:35:50 +0000
Subject: [PATCH 1/5] Update getDistVecTypeBasedOnLaneLayout
---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 44 +++++++--------
.../XeGPU/sg-to-wi-experimental-unit.mlir | 56 +++++++++++++++++++
2 files changed, 77 insertions(+), 23 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index f60635830cc74..7a3cda5e1c24e 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -110,29 +110,27 @@ xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
return failure();
assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
"Expecting a valid layout.");
- SmallVector<int64_t> effectiveLaneLayout =
- layout.getEffectiveLaneLayoutAsInt();
- assert(static_cast<size_t>(originalType.getRank()) >=
- effectiveLaneLayout.size() &&
- "Rank of the original vector type should be greater or equal to the "
- "size of the lane layout to distribute the vector type.");
- // TODO: replace the implementation with
- // auto distributedShape = layout.computeDistributedShape(
- // SmallVector<int64_t>(originalType.getShape()));
- SmallVector<int64_t> distributedShape(originalType.getShape());
- // Only distribute the last `laneLayout.size()` dimensions. The remaining
- // dimensions are not distributed.
- unsigned distributionStart =
- originalType.getRank() - effectiveLaneLayout.size();
- for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
- if (i < distributionStart)
- continue;
- // Check if the dimension can be distributed evenly.
- if (dim % effectiveLaneLayout[i - distributionStart] != 0)
- return failure();
- distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
- }
- return VectorType::get(distributedShape, originalType.getElementType());
+
+ int64_t vectorRank = originalType.getRank();
+ int64_t layoutRank = layout.getRank();
+ assert(vectorRank >= layoutRank && "Vector rank must be >= layout rank.");
+
+ // When the vector has more dimensions than the layout, only the trailing
+ // dimensions are distributed. Leading dimensions are preserved as-is.
+ int64_t offset = vectorRank - layoutRank;
+ ArrayRef<int64_t> fullShape = originalType.getShape();
+ SmallVector<int64_t> trailingShape(fullShape.begin() + offset,
+ fullShape.end());
+ auto distributedShapeOrFailure =
+ layout.computeDistributedShape(trailingShape);
+ if (failed(distributedShapeOrFailure))
+ return failure();
+
+ SmallVector<int64_t> resultShape(fullShape.begin(),
+ fullShape.begin() + offset);
+ resultShape.append(distributedShapeOrFailure->begin(),
+ distributedShapeOrFailure->end());
+ return VectorType::get(resultShape, originalType.getElementType());
}
std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 016b393e3d8bc..a3d9adb6a52ef 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -737,3 +737,59 @@ gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layou
gpu.return
}
}
+
+// -----
+gpu.module @xevm_module {
+// CHECK-LABEL: gpu.func @elementwise_wrap_around_dim
+// CHECK: %[[SRC:.*]] = "some_op"()
+// CHECK: %[[NEG:.*]] = arith.negf %[[SRC]] : vector<16x1xf16>
+// CHECK: gpu.return
+gpu.func @elementwise_wrap_around_dim() {
+ %0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<16x1xf16>
+ %1 = arith.negf %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<16x1xf16>
+ gpu.return
+}
+}
+
+// -----
+gpu.module @xevm_module {
+// CHECK-LABEL: gpu.func @constant_wrap_around_dim
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x1xf16>
+// CHECK: gpu.return
+gpu.func @constant_wrap_around_dim() {
+ %0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ dense<1.0> : vector<16x1xf16>
+ gpu.return
+}
+}
+
+// -----
+gpu.module @xevm_module {
+// CHECK-LABEL: gpu.func @scatter_load_chunksize_leading_dim
+// CHECK-SAME: (%[[ARG0:.*]]: memref<256xf16>)
+// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1x1xindex>
+// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1x1xi1>
+// CHECK: %[[OFF_FLAT:.*]] = vector.shape_cast %[[OFFSET]] : vector<1x1xindex> to vector<1xindex>
+// CHECK: %[[MASK_FLAT:.*]] = vector.shape_cast %[[MASK]] : vector<1x1xi1> to vector<1xi1>
+// CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[OFF_FLAT]]], %[[MASK_FLAT]] <{chunk_size = 8 : i64, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>}>
+// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<8xf16> to vector<1x1x8xf16>
+// CHECK: gpu.return
+gpu.func @scatter_load_chunksize_leading_dim(%input: memref<256xf16>) {
+ %offset = arith.constant
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ dense<12> : vector<1x16xindex>
+ %mask = arith.constant
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ dense<true> : vector<1x16xi1>
+ %0 = xegpu.load %input[%offset], %mask
+ <{chunk_size = 8, l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<cached>,
+ layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
+ : memref<256xf16>, vector<1x16xindex>, vector<1x16xi1> -> vector<1x16x8xf16>
+ gpu.return
+}
+}
+
>From a839252aea8311a1928ee3fbab943cfa90983e29 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Fri, 27 Mar 2026 19:01:25 +0000
Subject: [PATCH 2/5] Add test case
---
.../XeGPU/sg-to-wi-experimental-unit.mlir | 41 ++++++-------------
1 file changed, 12 insertions(+), 29 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index a3d9adb6a52ef..adead0284eb8e 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -59,6 +59,18 @@ gpu.func @load_nd_transpose() {
gpu.return
}
+// CHECK-LABEL: gpu.func @load_nd_array_length
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<64xf16>
+// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<64xf16> to vector<2x32x1xf16>
+gpu.func @load_nd_array_length() {
+ %c0 = arith.constant 0 : index
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x32x16xf16>
+ gpu.return
+}
+
// CHECK-LABEL: gpu.func @store_nd
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
@@ -764,32 +776,3 @@ gpu.func @constant_wrap_around_dim() {
gpu.return
}
}
-
-// -----
-gpu.module @xevm_module {
-// CHECK-LABEL: gpu.func @scatter_load_chunksize_leading_dim
-// CHECK-SAME: (%[[ARG0:.*]]: memref<256xf16>)
-// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1x1xindex>
-// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1x1xi1>
-// CHECK: %[[OFF_FLAT:.*]] = vector.shape_cast %[[OFFSET]] : vector<1x1xindex> to vector<1xindex>
-// CHECK: %[[MASK_FLAT:.*]] = vector.shape_cast %[[MASK]] : vector<1x1xi1> to vector<1xi1>
-// CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[OFF_FLAT]]], %[[MASK_FLAT]] <{chunk_size = 8 : i64, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>}>
-// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<8xf16> to vector<1x1x8xf16>
-// CHECK: gpu.return
-gpu.func @scatter_load_chunksize_leading_dim(%input: memref<256xf16>) {
- %offset = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<12> : vector<1x16xindex>
- %mask = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<true> : vector<1x16xi1>
- %0 = xegpu.load %input[%offset], %mask
- <{chunk_size = 8, l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<cached>,
- layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
- : memref<256xf16>, vector<1x16xindex>, vector<1x16xi1> -> vector<1x16x8xf16>
- gpu.return
-}
-}
-
>From c96f1cac5032b5435699376e6a606979d1d2d502 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Mon, 30 Mar 2026 23:11:47 +0000
Subject: [PATCH 3/5] Add recoverTemporaryLayouts
---
.../Transforms/XeGPUSgToWiDistributeExperimental.cpp | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 981c250249e5f..bb173c81346a2 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -14,6 +14,7 @@
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/Builders.h"
@@ -1519,9 +1520,14 @@ struct XeGPUSgToWiDistributeExperimentalPass
void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
+ Operation *root = getOperation();
+ if (!xegpu::recoverTemporaryLayouts(root)) {
+ signalPassFailure();
+ return;
+ }
+
// Verify if all XeGPU anchor ops and vector ops have result layouts.
// TODO: This can be removed once the full layout refactoring is done.
- Operation *root = getOperation();
if (failed(verifyLayouts(root))) {
LLVM_DEBUG(DBGS() << "XeGPUSgToWiDistributeExperimentalPass: layout "
"verification failed\n");
>From f77bfa23bb60eededfbfad0e361b9fbb444be1df Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Mon, 30 Mar 2026 23:15:10 +0000
Subject: [PATCH 4/5] Add comment
---
.../XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index bb173c81346a2..99e3dbc955199 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -1520,6 +1520,7 @@ struct XeGPUSgToWiDistributeExperimentalPass
void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
+ // Recover temporary operand layouts for usage in patterns.
Operation *root = getOperation();
if (!xegpu::recoverTemporaryLayouts(root)) {
signalPassFailure();
>From 1f48a4e262ce8e499f91d7a5df60f835ddc5b59b Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Tue, 31 Mar 2026 02:44:13 +0000
Subject: [PATCH 5/5] Switch to the new sg to wi pass
---
mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index fbb7bb8aeb4bc..f29ffabc1d94d 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -89,7 +89,8 @@ void buildGPUPassPipeline(OpPassManager &pm,
pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
pm.addNestedPass<gpu::GPUModuleOp>(
xegpu::createXeGPUPropagateLayout(laneLayoutOptions));
- pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUSubgroupDistribute());
+ pm.addNestedPass<gpu::GPUModuleOp>(
+ xegpu::createXeGPUSgToWiDistributeExperimental());
pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
pm.addNestedPass<gpu::GPUModuleOp>(createLoopInvariantCodeMotionPass());
More information about the Mlir-commits mailing list