[Mlir-commits] [mlir] [MLIR][XeGPU] Propagate layout from anchor ops before Wg To Sg & Blocking Pass (PR #179490)
Nishant Patel
llvmlistbot at llvm.org
Fri Feb 6 15:45:54 PST 2026
https://github.com/nbpatel updated https://github.com/llvm/llvm-project/pull/179490
>From 7862d9f8f2534619eb425edbaa9c04e415fcd0df Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Mon, 2 Feb 2026 21:31:27 +0000
Subject: [PATCH 1/6] Add layout propagation from anchor op to other ops at WG
IR
---
.../Transforms/XeGPUWgToSgDistribute.cpp | 11 ++++----
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 13 ++++++++++
.../XeGPU/xegpu-wg-to-sg-elemwise.mlir | 25 ++++++++++---------
.../XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir | 6 ++---
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 22 +++++-----------
5 files changed, 40 insertions(+), 37 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 45a002b63abd6..adaf4ff881d69 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1723,12 +1723,11 @@ struct XeGPUWgToSgDistributePass
void XeGPUWgToSgDistributePass::runOnOperation() {
- // TODO-LayoutRefactor: unify the local propagation for layout preprocessing
- // Operation *op = getOperation();
- // if (!xegpu::recoverTemporaryLayouts(op)) {
- // signalPassFailure();
- // return;
- // }
+ Operation *op = getOperation();
+ if (!xegpu::recoverTemporaryLayouts(op)) {
+ signalPassFailure();
+ return;
+ }
// Track existing UnrealizedConversionCastOps
SmallVector<Operation *> existingCastOps;
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 7e28c756f2d72..282d9253d7afe 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -166,6 +166,18 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
return layout;
}
+
+ // Handle scf.for results by looking at yielded values
+ if (auto forOp = dyn_cast<scf::ForOp>(defOp)) {
+ auto resultIndex = result.getResultNumber();
+ if (auto yieldOp =
+ dyn_cast<scf::YieldOp>(forOp.getBody()->getTerminator())) {
+ if (resultIndex < yieldOp.getOperands().size()) {
+ auto yieldedValue = yieldOp.getOperand(resultIndex);
+ return getDistributeLayoutAttr(yieldedValue);
+ }
+ }
+ }
}
if (auto arg = dyn_cast<BlockArgument>(value)) {
@@ -179,6 +191,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
return nullptr;
}
+
xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
index 6e9711442b92d..608f1dc2a1253 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
@@ -27,11 +27,11 @@ gpu.module @test_elementwise_ops {
%load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- // CHECK: math.exp {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>} : vector<12x8xf32>
+ // CHECK: math.exp {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>} : vector<12x8xf32>
%exp = math.exp %load_a
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
- // CHECK: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>} : vector<12x8xf32>
+ // CHECK: arith.negf {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>} : vector<12x8xf32>
%negf = arith.negf %load_a
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
@@ -50,12 +50,12 @@ gpu.module @test_elementwise_ops {
%load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- // CHECK: arith.addf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+ // CHECK: arith.addf {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
// CHECK-SAME: : vector<12x8xf32>
%addf = arith.addf %load_a, %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
- // CHECK: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+ // CHECK: math.powf {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
// CHECK-SAME: : vector<12x8xf32>
%powf = math.powf %load_a, %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -80,12 +80,12 @@ gpu.module @test_elementwise_ops {
%load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi1>
- // CHECK: arith.select {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+ // CHECK: arith.select {{.*}}, {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_2 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
// CHECK-SAME: : vector<12x8xi1>, vector<12x8xf32>
%select = arith.select %load_c, %load_a, %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xi1>, vector<24x32xf32>
- // CHECK: math.fma {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+ // CHECK: math.fma {{.*}}, {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_2 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
// CHECK-SAME: : vector<12x8xf32>
%fma = math.fma %load_a, %load_b, %load_a
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -105,12 +105,12 @@ gpu.module @test_elementwise_ops {
%load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi32>
- // CHECK: arith.truncf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+ // CHECK: arith.truncf {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
// CHECK-SAME: : vector<12x8xf32> to vector<12x8xf16>
%truncf = arith.truncf %load_a
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32> to vector<24x32xf16>
- // CHECK: arith.bitcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+ // CHECK: arith.bitcast {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
// CHECK-SAME: : vector<12x8xi32> to vector<12x8xf32>
%bitcast = arith.bitcast %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -140,12 +140,12 @@ gpu.module @test_elementwise_ops {
%load_d = xegpu.load_nd %tdesc_d {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi32>
- // CHECK: arith.cmpf ult, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+ // CHECK: arith.cmpf ult, {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
// CHECK-SAME: : vector<12x8xf32>
%cmpf = arith.cmpf ult, %load_a, %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
- // CHECK: arith.cmpi eq, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+ // CHECK: arith.cmpi eq, {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
// CHECK-SAME: : vector<12x8xi32>
%cmpi = arith.cmpi eq, %load_c, %load_d
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -166,12 +166,12 @@ gpu.module @test_elementwise_ops {
%load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-> vector<24x32xf32>
- // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32>
+ // CHECK-COUNT-12: arith.negf {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32>
// CHECK-NOT: arith.negf
%negf = arith.negf %load_a
{layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
: vector<24x32xf32>
- // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32>
+ // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32>
// CHECK-NOT: math.powf
%powf = math.powf %load_a, %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
@@ -179,3 +179,4 @@ gpu.module @test_elementwise_ops {
gpu.return
}
}
+
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
index 4b11270373f95..f794f8951cf94 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
@@ -26,14 +26,14 @@ gpu.module @test_distribution {
// CHECK-LABEL: store_nd_with_offset
gpu.func @store_nd_with_offset(%src: memref<256x128xf32>) {
- // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NOT: xegpu.store_nd
%tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
- xegpu.store_nd %load, %tdesc[0, 0]
+ xegpu.store_nd %load, %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -155,7 +155,7 @@ gpu.module @test_distribution {
%cst_0 = arith.constant {layout_result_0=#xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>, dims = [1]>} dense<0xFF800000> : vector<256xf32>
%block_id_x = gpu.block_id x
%0 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>>
- %1 = xegpu.load_nd %0[%block_id_x, 0] : !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>> -> vector<256x128xf32>
+ %1 = xegpu.load_nd %0[%block_id_x, 0] {layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>} : !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>> -> vector<256x128xf32>
%2 = vector.multi_reduction <maximumf>, %1, %cst_0 {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>, dims = [1]>} [1] : vector<256x128xf32> to vector<256xf32>
%3 = vector.shape_cast %2 {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>, layout_operand_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>, dims = [1]>} : vector<256xf32> to vector<256x1xf32>
%4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>} : vector<256x1xf32>to vector<256x128xf32>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 1fc2328d09046..b761a3c0f87b7 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -58,13 +58,13 @@ gpu.module @test_distribution {
// CHECK-LABEL: store_nd_with_offsets
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
gpu.func @store_nd_with_offsets(%src: memref<256x128xf32>) {
- //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
- xegpu.store_nd %load, %tdesc[0, 0]
+ xegpu.store_nd %load, %tdesc[0, 0]
: vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -130,17 +130,6 @@ gpu.module @test_distribution {
gpu.return
}
- // CHECK-LABEL: dpas_with_no_create_nd_desc
- gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) {
- // CHECK-NOT: vector<32x32xf32>
- %dpas = xegpu.dpas %a, %b
- {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32>
- gpu.return
- }
-
// CHECK-LABEL: broadcast_dim1
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x1xf32>
gpu.func @broadcast_dim1(%src: memref<256x1xf32>) {
@@ -209,14 +198,15 @@ gpu.module @test_distribution {
%9 = xegpu.dpas %arg4, %arg5, %arg6
{layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
- layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>,
+ layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
: vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
%10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
%11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>
}
// store_nd with offset
- xegpu.store_nd %8#2, %2[%0, %1] : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+ xegpu.store_nd %8#2, %2[%0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>} : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
gpu.return
}
@@ -726,7 +716,7 @@ gpu.module @test_distribution {
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} dense<0.0> : vector<128xf32>
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
- %load = xegpu.load_nd %tdesc
+ %load = xegpu.load_nd %tdesc {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
-> vector<256x128xf32>
%reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0]
>From 686ab215640db1502b584ed4eea8d6632387bf02 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Mon, 2 Feb 2026 21:36:23 +0000
Subject: [PATCH 2/6] Run recoverTemporaryLayouts before blocking
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index c00b7d42d48a6..6ab473704645d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -286,13 +286,10 @@ void XeGPUBlockingPass::runOnOperation() {
MLIRContext *ctx = &getContext();
Operation *op = getOperation();
- // TODO-LayoutRefactor: unify the local propagation for layout preprocessing
- // replace the function with recoverTemporaryLayouts
- // if (!xegpu::recoverTemporaryLayouts(op)) {
- // signalPassFailure();
- // return;
- // }
- xegpu::recoverTemporaryLayoutsDeprecated(op);
+ if (!xegpu::recoverTemporaryLayouts(op)) {
+ signalPassFailure();
+ return;
+ }
auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
xegpu::LayoutAttr layout) {
>From 34d7565a61f745a2460ec7e84a9146aed5a98a7b Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Wed, 4 Feb 2026 19:17:29 +0000
Subject: [PATCH 3/6] only set anchor layout after the pattern is transformed
---
.../Transforms/XeGPUWgToSgDistribute.cpp | 54 ++++++-------------
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 12 -----
.../XeGPU/xegpu-wg-to-sg-elemwise.mlir | 32 +++++------
.../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 2 +-
.../XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir | 6 +--
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 22 ++++----
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 6 +--
7 files changed, 44 insertions(+), 90 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index adaf4ff881d69..dab3eb7ae52a6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -510,8 +510,6 @@ struct WgToSgVectorBroadcastOp
for (auto operand : adaptor.getOperands().front()) {
auto newBroadcast = vector::BroadcastOp::create(rewriter, op.getLoc(),
newResultType, operand);
- xegpu::setTemporaryLayout(newBroadcast->getResult(0),
- layout.dropSgLayoutAndData());
newBroadcastOps.push_back(newBroadcast.getResult());
}
@@ -563,9 +561,17 @@ struct WgToSgElementwiseOp : public ConversionPattern {
OperationState state(op->getLoc(), op->getName());
state.addOperands(opOperands);
state.addTypes(newResultType);
- // Copy all attributes, but update "layout_result_0" to drop
- // sgLayout/sgData
- state.addAttributes(xegpu::dropSgLayoutAndDataOnAttrs(op->getAttrs()));
+ // Copy all attributes except the layout attribute.
+ SmallVector<NamedAttribute> attrs;
+ auto resultLayoutName = xegpu::getTemporaryLayoutName(op->getResult(0));
+ for (auto attr : op->getAttrs()) {
+ StringRef attrName = attr.getName().strref();
+ if (attrName == resultLayoutName ||
+ attrName.starts_with("layout_operand_"))
+ continue;
+ attrs.push_back(attr);
+ }
+ state.addAttributes(attrs);
Operation *newOp = rewriter.create(state);
newResults.push_back(newOp->getResult(0));
}
@@ -748,24 +754,17 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> {
Location loc = op.getLoc();
auto eltType = vecType.getElementType();
- auto setLayout = [&](Value val) {
- xegpu::setTemporaryLayout(llvm::dyn_cast<OpResult>(val),
- layout.dropSgLayoutAndData());
- };
-
if (vecAttr.isSplat()) {
// Splat: single value for all subgroups
Attribute singleVal = vecAttr.getSplatValue<Attribute>();
auto sgAttr = DenseElementsAttr::get(newType, singleVal);
auto cstOp = arith::ConstantOp::create(rewriter, loc, newType, sgAttr);
- setLayout(cstOp->getResult(0));
rewriter.replaceOp(op, cstOp);
return success();
} else if (sgShape == wgShape) { // if the entire vector is shared by all
// subgroups, don't distribute
auto newConstOp =
arith::ConstantOp::create(rewriter, op.getLoc(), vecType, vecAttr);
- setLayout(newConstOp->getResult(0));
rewriter.replaceOp(op, newConstOp);
return success();
} else {
@@ -867,9 +866,6 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> {
rewriter, loc, baseConstVec.getType(), mulOffset);
auto finalConst =
arith::AddIOp::create(rewriter, loc, baseConstVec, bcastOffset);
- setLayout(baseConstVec);
- setLayout(bcastOffset);
- setLayout(finalConst);
newConstOps.push_back(finalConst);
}
rewriter.replaceOpWithMultiple(op, {newConstOps});
@@ -925,7 +921,6 @@ struct WgToSgLoadGatherOpWithOffset
rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr,
op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(),
newLayout);
- newLoadOp.setAnchorLayout(newLayout);
newLoadOps.push_back(newLoadOp);
}
rewriter.replaceOpWithMultiple(op, {newLoadOps});
@@ -971,17 +966,10 @@ struct WgToSgStoreScatterOpWithOffset
auto chunkSizeAttr = rewriter.getI64IntegerAttr(chunkSize);
for (auto [val, offs, mask] : llvm::zip(
adaptor.getValue(), adaptor.getOffsets(), adaptor.getMask())) {
- auto store = xegpu::StoreScatterOp::create(
- rewriter, loc, val, op.getDest(), offs, mask, chunkSizeAttr,
- op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(),
- layout.dropSgLayoutAndData());
- // Update the layout attribute to drop sg_layout and sg_data.
- for (OpOperand &operand : store->getOpOperands()) {
- // Skip for operand one (memref)
- if (operand.getOperandNumber() == 1)
- continue;
- xegpu::setTemporaryLayout(operand, layout.dropSgLayoutAndData());
- }
+ xegpu::StoreScatterOp::create(rewriter, loc, val, op.getDest(), offs,
+ mask, chunkSizeAttr, op.getL1HintAttr(),
+ op.getL2HintAttr(), op.getL3HintAttr(),
+ layout.dropSgLayoutAndData());
}
rewriter.eraseOp(op);
return success();
@@ -1073,12 +1061,6 @@ struct WgToSgVectorStepOp : public OpConversionPattern<vector::StepOp> {
vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]);
auto finalSteps =
arith::AddIOp::create(rewriter, loc, steps, bcastOffset);
- xegpu::setTemporaryLayout(steps->getResult(0),
- layout.dropSgLayoutAndData());
- xegpu::setTemporaryLayout(bcastOffset->getResult(0),
- layout.dropSgLayoutAndData());
- xegpu::setTemporaryLayout(finalSteps->getResult(0),
- layout.dropSgLayoutAndData());
newOps.push_back(finalSteps);
}
@@ -1166,8 +1148,6 @@ struct WgToSgVectorShapeCastOp
for (auto src : adaptor.getSource()) {
auto newShapeCast = vector::ShapeCastOp::create(rewriter, op.getLoc(),
newResultType, src);
- xegpu::setTemporaryLayout(newShapeCast->getResult(0),
- layout.dropSgLayoutAndData());
newShapeCastOps.push_back(newShapeCast.getResult());
}
@@ -1611,8 +1591,6 @@ struct WgToSgVectorTransposeOp
for (auto src : adaptor.getVector()) {
auto newTranspose = vector::TransposeOp::create(
rewriter, op.getLoc(), newResultType, src, permutation);
- xegpu::setTemporaryLayout(newTranspose->getResult(0),
- layout.dropSgLayoutAndData());
newTransposeOps.push_back(newTranspose.getResult());
}
@@ -1681,8 +1659,6 @@ struct WgToSgVectorMaskOp : public OpConversionPattern<MaskOpType> {
auto newCreateMaskOp =
vector::CreateMaskOp::create(rewriter, loc, resultType, maskOperands);
- xegpu::setTemporaryLayout(newCreateMaskOp->getResult(0),
- layout.dropSgLayoutAndData());
newCreateMaskOps.push_back(newCreateMaskOp.getResult());
}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 282d9253d7afe..e3567b67da7fc 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -166,18 +166,6 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
return layout;
}
-
- // Handle scf.for results by looking at yielded values
- if (auto forOp = dyn_cast<scf::ForOp>(defOp)) {
- auto resultIndex = result.getResultNumber();
- if (auto yieldOp =
- dyn_cast<scf::YieldOp>(forOp.getBody()->getTerminator())) {
- if (resultIndex < yieldOp.getOperands().size()) {
- auto yieldedValue = yieldOp.getOperand(resultIndex);
- return getDistributeLayoutAttr(yieldedValue);
- }
- }
- }
}
if (auto arg = dyn_cast<BlockArgument>(value)) {
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
index 608f1dc2a1253..762530e5d189f 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
@@ -27,11 +27,11 @@ gpu.module @test_elementwise_ops {
%load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- // CHECK: math.exp {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>} : vector<12x8xf32>
+ // CHECK: math.exp {{.*}} : vector<12x8xf32>
%exp = math.exp %load_a
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
- // CHECK: arith.negf {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>} : vector<12x8xf32>
+ // CHECK: arith.negf {{.*}} : vector<12x8xf32>
%negf = arith.negf %load_a
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
@@ -50,13 +50,11 @@ gpu.module @test_elementwise_ops {
%load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- // CHECK: arith.addf {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<12x8xf32>
+ // CHECK: arith.addf {{.*}}, {{.*}} : vector<12x8xf32>
%addf = arith.addf %load_a, %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
- // CHECK: math.powf {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<12x8xf32>
+ // CHECK: math.powf {{.*}}, {{.*}} : vector<12x8xf32>
%powf = math.powf %load_a, %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
@@ -80,13 +78,11 @@ gpu.module @test_elementwise_ops {
%load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi1>
- // CHECK: arith.select {{.*}}, {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_2 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<12x8xi1>, vector<12x8xf32>
+ // CHECK: arith.select {{.*}}, {{.*}}, {{.*}} : vector<12x8xi1>, vector<12x8xf32>
%select = arith.select %load_c, %load_a, %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xi1>, vector<24x32xf32>
- // CHECK: math.fma {{.*}}, {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_2 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<12x8xf32>
+ // CHECK: math.fma {{.*}}, {{.*}}, {{.*}} : vector<12x8xf32>
%fma = math.fma %load_a, %load_b, %load_a
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
@@ -105,13 +101,11 @@ gpu.module @test_elementwise_ops {
%load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi32>
- // CHECK: arith.truncf {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<12x8xf32> to vector<12x8xf16>
+ // CHECK: arith.truncf {{.*}} : vector<12x8xf32> to vector<12x8xf16>
%truncf = arith.truncf %load_a
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32> to vector<24x32xf16>
- // CHECK: arith.bitcast {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<12x8xi32> to vector<12x8xf32>
+ // CHECK: arith.bitcast {{.*}} : vector<12x8xi32> to vector<12x8xf32>
%bitcast = arith.bitcast %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xi32> to vector<24x32xf32>
@@ -140,13 +134,11 @@ gpu.module @test_elementwise_ops {
%load_d = xegpu.load_nd %tdesc_d {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi32>
- // CHECK: arith.cmpf ult, {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<12x8xf32>
+ // CHECK: arith.cmpf ult, {{.*}}, {{.*}} : vector<12x8xf32>
%cmpf = arith.cmpf ult, %load_a, %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
- // CHECK: arith.cmpi eq, {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<12x8xi32>
+ // CHECK: arith.cmpi eq, {{.*}}, {{.*}} : vector<12x8xi32>
%cmpi = arith.cmpi eq, %load_c, %load_d
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xi32>
@@ -166,12 +158,12 @@ gpu.module @test_elementwise_ops {
%load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-> vector<24x32xf32>
- // CHECK-COUNT-12: arith.negf {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32>
+ // CHECK-COUNT-12: arith.negf {{.*}} : vector<2x2xf32>
// CHECK-NOT: arith.negf
%negf = arith.negf %load_a
{layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
: vector<24x32xf32>
- // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>, layout_operand_1 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32>
+ // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} : vector<2x2xf32>
// CHECK-NOT: math.powf
%powf = math.powf %load_a, %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 6b8b4f282b744..e89cb52ee02f5 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -116,7 +116,7 @@ gpu.module @test_round_robin_assignment {
%load = xegpu.load_nd %tdesc {layout = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>}
: !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
-> vector<128x1xf32>
- // CHECK-COUNT-2: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} : vector<16x1xf32> to vector<16x32xf32>
+ // CHECK-COUNT-2: vector.broadcast {{.*}} : vector<16x1xf32> to vector<16x32xf32>
// CHECK-NOT: vector.broadcast
%broadcast = vector.broadcast %load
{layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
index f794f8951cf94..ecdfdb9ad34c5 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
@@ -126,7 +126,7 @@ gpu.module @test_distribution {
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
-> vector<256x128xf32>
- // CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<32x16xf32> to vector<16x32xf32>
+ // CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] : vector<32x16xf32> to vector<16x32xf32>
// CHECK-NOT: vector.transpose
%trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1], order =[1, 0]>} : vector<256x128xf32> to vector<128x256xf32>
gpu.return
@@ -149,8 +149,8 @@ gpu.module @test_distribution {
}
// CHECK-LABEL: distribute_shapecast_expandunitdims_broadcast
- // CHECK: %[[CAST:.*]] = vector.shape_cast %[[REDUCE:.*]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : vector<8xf32> to vector<8x1xf32>
- // CHECK: %[[BCAST:.*]] = vector.broadcast %[[CAST]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : vector<8x1xf32> to vector<8x128xf32>
+ // CHECK: %[[CAST:.*]] = vector.shape_cast %[[REDUCE:.*]] : vector<8xf32> to vector<8x1xf32>
+ // CHECK: %[[BCAST:.*]] = vector.broadcast %[[CAST]] : vector<8x1xf32> to vector<8x128xf32>
gpu.func @distribute_shapecast_expandunitdims_broadcast(%arg0: memref<4096x128xf32>, %arg1: memref<4096x128xf32>) {
%cst_0 = arith.constant {layout_result_0=#xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>, dims = [1]>} dense<0xFF800000> : vector<256xf32>
%block_id_x = gpu.block_id x
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index b761a3c0f87b7..0967f0769c9e8 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -138,8 +138,7 @@ gpu.module @test_distribution {
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
-> vector<256x1xf32>
- // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<32x1xf32> to vector<32x32xf32>
+ // CHECK: vector.broadcast {{.*}} : vector<32x1xf32> to vector<32x32xf32>
%broadcast = vector.broadcast %load
{layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
: vector<256x1xf32> to vector<256x32xf32>
@@ -154,8 +153,7 @@ gpu.module @test_distribution {
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<1x128xf32>
- // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<1x32xf32> to vector<32x32xf32>
+ // CHECK: vector.broadcast {{.*}} : vector<1x32xf32> to vector<32x32xf32>
%broadcast = vector.broadcast %load
{layout_result_0 = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<1x128xf32> to vector<32x128xf32>
@@ -193,7 +191,7 @@ gpu.module @test_distribution {
// CHECK: [[b:%.+]] = xegpu.load_nd [[DESC_B]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16>
// CHECK: scf.yield [[a]], [[b]], [[c]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32>
%8:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %6, %arg5 = %7, %arg6 = %5)
- -> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) {
+ -> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) {
// load_nd with offset inside loop
%9 = xegpu.dpas %arg4, %arg5, %arg6
{layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
@@ -204,7 +202,9 @@ gpu.module @test_distribution {
%10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
%11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>
- }
+ } {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
+ layout_result_1 = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
+ layout_result_2 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
// store_nd with offset
xegpu.store_nd %8#2, %2[%0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>} : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
gpu.return
@@ -293,9 +293,9 @@ gpu.module @test_distribution {
// CHECK-LABEL: @store_scatter
// CHECK-SAME: %[[ARG0:.*]]: memref<256xf16>
gpu.func @store_scatter(%dest : memref<256xf16>) {
- // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16>
- // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex>
- // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1>
+ // CHECK: %[[VAL:.*]] = arith.constant dense<2.550000e+01> : vector<8xf16>
+ // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex>
+ // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1>
// CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}>
// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
%val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<25.5> : vector<256xf16>
@@ -479,7 +479,7 @@ gpu.module @test_distribution {
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>}
: !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
-> vector<256x32xf32>
- //CHECK: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<64x32xf32> to vector<32x64xf32>
+ //CHECK: vector.transpose {{.*}}, [1, 0] : vector<64x32xf32> to vector<32x64xf32>
%trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1], order =[1, 0]>} : vector<256x32xf32> to vector<32x256xf32>
gpu.return
}
@@ -635,7 +635,7 @@ gpu.module @test_distribution {
// CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32>
%3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32>
- // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : vector<32xf32> to vector<32x32xf32>
+ // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] : vector<32xf32> to vector<32x32xf32>
%4 = vector.broadcast %3 {layout_result_0 =
#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>} : vector<256xf32> to vector<256x256xf32>
gpu.return
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index 4f29a686d301f..5fd72f2d4a3b3 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -160,8 +160,7 @@ gpu.module @test_1_1_assignment {
%load = xegpu.load_nd %tdesc
: !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
-> vector<256x1xf32>
- // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<32x1xf32> to vector<32x32xf32>
+ // CHECK: vector.broadcast {{.*}} : vector<32x1xf32> to vector<32x32xf32>
%broadcast = vector.broadcast %load
{layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
: vector<256x1xf32> to vector<256x32xf32>
@@ -176,8 +175,7 @@ gpu.module @test_1_1_assignment {
%load = xegpu.load_nd %tdesc
: !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<1x128xf32>
- // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<1x32xf32> to vector<32x32xf32>
+ // CHECK: vector.broadcast {{.*}} : vector<1x32xf32> to vector<32x32xf32>
%broadcast = vector.broadcast %load
{layout_result_0 = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<1x128xf32> to vector<32x128xf32>
>From 684d613cf60254fa818ac3dee4e6e4371795fe99 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Wed, 4 Feb 2026 22:14:30 +0000
Subject: [PATCH 4/6] Clean up
---
.../lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 7 -------
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 2 +-
2 files changed, 1 insertion(+), 8 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index dab3eb7ae52a6..11f0270440f7b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1382,9 +1382,6 @@ struct WgToSgMultiDimReductionOp
for (auto localResult : localReductions) {
auto finalResult = vector::makeArithReduction(
rewriter, loc, op.getKind(), localResult, adaptor.getAcc()[0]);
- if (auto defOp = finalResult.getDefiningOp())
- xegpu::setDistributeLayoutAttr(defOp->getResult(0),
- layout.dropSgLayoutAndData());
results.push_back(finalResult);
}
rewriter.replaceOpWithMultiple(op, {results});
@@ -1528,10 +1525,6 @@ struct WgToSgMultiDimReductionOp
auto finalResult = vector::makeArithReduction(
rewriter, loc, op.getKind(), finalReduce.getResult(), accToAdd);
- if (auto defOp = finalResult.getDefiningOp())
- xegpu::setDistributeLayoutAttr(defOp->getResult(0),
- layout.dropSgLayoutAndData());
-
rewriter.replaceOp(op, finalResult);
return success();
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 0967f0769c9e8..b2fc7231dad60 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -64,7 +64,7 @@ gpu.module @test_distribution {
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
- xegpu.store_nd %load, %tdesc[0, 0]
+ xegpu.store_nd %load, %tdesc[0, 0]
: vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
>From 49f73e9b3b7e67e9006cd903a5757327fc568305 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Thu, 5 Feb 2026 00:01:43 +0000
Subject: [PATCH 5/6] Update tests in blocking to add result layouts
---
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 68 ++++++++++++---------
1 file changed, 38 insertions(+), 30 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 0b6e30e6f95f0..68f6e8e1ec955 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -34,7 +34,9 @@ gpu.module @test_kernel {
%b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b>
scf.yield %a_next_tdesc, %b_next_tdesc, %c
: !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
- }
+ } {layout_result_0 = #a,
+ layout_result_1 = #b,
+ layout_result_2 = #c}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
gpu.return
@@ -75,7 +77,9 @@ gpu.module @test_kernel {
%b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #l2>
scf.yield %a_next_tdesc, %b_next_tdesc, %c
: !xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>
- }
+ } {layout_result_0 = #l1,
+ layout_result_1 = #l2,
+ layout_result_2 = #l1}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
gpu.return
@@ -118,7 +122,9 @@ gpu.module @test_kernel {
%b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<16x32xf16, #l2>
scf.yield %a_next_tdesc, %b_next_tdesc, %c
: !xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>
- }
+ } {layout_result_0 = #l1,
+ layout_result_1 = #l2,
+ layout_result_2 = #l1}
//CHECK-COUNT-2: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1>
gpu.return
@@ -162,7 +168,9 @@ gpu.module @test_kernel {
%b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b>
scf.yield %a_next_tdesc, %b_next_tdesc, %c
: !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
- }
+ } {layout_result_0 = #a,
+ layout_result_1 = #b,
+ layout_result_2 = #c}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
gpu.return
@@ -252,7 +260,7 @@ gpu.module @test_kernel {
#r = #xegpu.layout<inst_data = [16]>
gpu.module @test_kernel {
gpu.func @reduce_dim_0(%a: memref<16x512xf32>, %b: memref<512xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
- %acc = arith.constant dense<0.0> : vector<64xf32>
+ %acc = arith.constant {layout_result_0 = #r} dense<0.0> : vector<64xf32>
%c64 = arith.constant 64 : index
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c64 : index
@@ -274,7 +282,7 @@ gpu.module @test_kernel {
gpu.func @reduce_dim_1(%a: memref<512x32xf32>, %b: memref<512xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
- %acc = arith.constant dense<0.0> : vector<32xf32>
+ %acc = arith.constant {layout_result_0 = #r} dense<0.0> : vector<32xf32>
%block_id_x = gpu.block_id x
%block_id_y = gpu.block_id y
@@ -324,7 +332,7 @@ gpu.module @test_kernel {
%m = arith.muli %block_id_x, %c32 : index
%0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
%1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32>
- %11 = vector.shape_cast %1 : vector<32xf32> to vector<32x1xf32>
+ %11 = vector.shape_cast %1 {layout_result_0 = #l} : vector<32xf32> to vector<32x1xf32>
// CHECK-COUNT-8: vector.broadcast {{.*}}: vector<16x1xf32> to vector<16x16xf32>
%2 = vector.broadcast %11 {layout_result_0 = #l} : vector<32x1xf32> to vector<32x64xf32>
%3 = xegpu.create_nd_tdesc %b[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<32x64xf32, #l>
@@ -358,7 +366,7 @@ gpu.module @test_kernel {
gpu.func @test_vector_constant_mask(%src: ui64, %dst: ui64) {
//CHECK: arith.constant dense<true> : vector<16xi1>
%mask = vector.constant_mask [32] {layout_result_0 = #l} : vector<32xi1>
- %cst = arith.constant dense<[
+ %cst = arith.constant {layout_result_0 = #l} dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -377,7 +385,7 @@ gpu.module @test_kernel {
%c16 = arith.constant 16 : index
//CHECK-COUNT-2: vector.create_mask {{.*}} : vector<16xi1>
%mask = vector.create_mask %c16 {layout_result_0 = #l} : vector<32xi1>
- %cst = arith.constant dense<[
+ %cst = arith.constant {layout_result_0 = #l} dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -418,7 +426,7 @@ gpu.module @test_kernel {
gpu.func @test_prefetch_load_store_update(%src: ui64) {
- %cst = arith.constant dense<[
+ %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -428,7 +436,7 @@ gpu.module @test_kernel {
%tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
xegpu.prefetch %tdesc {layout = #xegpu.layout<inst_data = [16]>}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- %delta = arith.constant dense<[
+ %delta = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
32, 32, 32, 32, 32, 32, 32, 32,
32, 32, 32, 32, 32, 32, 32, 64,
128, 128, 128, 128, 128, 128, 128, 128,
@@ -438,11 +446,11 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xindex>
%c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
+ %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>} : vector<32xi1>
%ld_vec = xegpu.load %new_tdesc, %mask {layout = #xegpu.layout<inst_data = [16]>}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
- %st_vec = arith.addf %ld_vec, %ld_vec : vector<32xf32>
+ %st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #xegpu.layout<inst_data = [16]>} : vector<32xf32>
xegpu.store %st_vec, %tdesc, %mask {layout = #xegpu.layout<inst_data = [16]>}:
vector<32xf32>,
!xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>,
@@ -465,7 +473,7 @@ gpu.module @test_kernel {
gpu.func @test_prefetch_load_store_update_chunk(%src: ui64) {
- %cst = arith.constant dense<[
+ %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -475,7 +483,7 @@ gpu.module @test_kernel {
%tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
xegpu.prefetch %tdesc {layout = #xegpu.layout<inst_data = [16, 2]>}: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- %delta = arith.constant dense<[
+ %delta = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
32, 32, 32, 32, 32, 32, 32, 32,
32, 32, 32, 32, 32, 32, 32, 64,
128, 128, 128, 128, 128, 128, 128, 128,
@@ -485,11 +493,11 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xindex>
%c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
+ %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>} : vector<32xi1>
%ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [16, 2]>}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xi1> -> vector<32x4xf32>
- %st_vec = arith.addf %ld_vec, %ld_vec : vector<32x4xf32>
+ %st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #xegpu.layout<inst_data = [16, 2]>} : vector<32x4xf32>
xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [16, 2]>}>:
vector<32x4xf32>,
!xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>,
@@ -521,7 +529,7 @@ gpu.module @test_kernel {
gpu.func @test_3d_scattered_tensor_desc(%src: ui64) {
- %cst = arith.constant dense<[
+ %cst = arith.constant {layout_result_0 = #l} dense<[
[0, 8, 16, 24, 32, 40, 48, 56],
[64, 72, 80, 88, 96, 104, 112, 120],
[128, 136, 144, 152, 160, 168, 176, 184],
@@ -531,7 +539,7 @@ gpu.module @test_kernel {
%tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<4x8xindex> -> !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>
xegpu.prefetch %tdesc {layout = #l}: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>
- %delta = arith.constant dense<[
+ %delta = arith.constant {layout_result_0 = #l} dense<[
[32, 32, 32, 32, 32, 32, 32, 32],
[32, 32, 32, 32, 32, 32, 32, 64],
[128, 128, 128, 128, 128, 128, 128, 128],
@@ -541,7 +549,7 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>, vector<4x8xindex>
%c4 = arith.constant 4: index
- %mask = vector.create_mask %c4, %c4: vector<4x8xi1>
+ %mask = vector.create_mask %c4, %c4 {layout_result_0 = #l}: vector<4x8xi1>
%ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #l}>: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>, vector<4x8xi1> -> vector<4x8x4xf32>
@@ -643,7 +651,7 @@ gpu.module @test_kernel {
// CHECK-LABEL: load_with_offsets
// CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
gpu.func @load_with_offsets(%src: ui64) -> vector<32xf32> {
- %cst = arith.constant dense<[
+ %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -651,7 +659,7 @@ gpu.module @test_kernel {
]> : vector<32xindex>
%c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
+ %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>}: vector<32xi1>
%ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
gpu.return %ld : vector<32xf32>
@@ -663,7 +671,7 @@ gpu.module @test_kernel {
// CHECK-LABEL: store_with_offsets
// CHECK-COUNT-2: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16xf32>, ui64, vector<16xindex>, vector<16xi1>
gpu.func @store_with_offsets(%src: ui64) {
- %cst = arith.constant dense<[
+ %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -671,9 +679,9 @@ gpu.module @test_kernel {
]> : vector<32xindex>
%c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
+ %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>}: vector<32xi1>
- %st_vec = arith.constant dense<1023.0>: vector<32xf32>
+ %st_vec = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<1023.0>: vector<32xf32>
xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>
gpu.return
@@ -690,7 +698,7 @@ gpu.module @test_kernel {
// CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
// CHECK-COUNT-4: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16x2xf32>
gpu.func @load_with_offsets_chunk(%src: ui64) -> vector<32x4xf32> {
- %cst = arith.constant dense<[
+ %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -698,7 +706,7 @@ gpu.module @test_kernel {
]> : vector<32xindex>
%c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
+ %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>}: vector<32xi1>
%ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
gpu.return %ld : vector<32x4xf32>
}
@@ -714,7 +722,7 @@ gpu.module @test_kernel {
// CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
// CHECK-COUNT-4: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16x2xf32>, ui64, vector<16xindex>, vector<16xi1>
gpu.func @store_with_offsets_chunk(%src: ui64) {
- %cst = arith.constant dense<[
+ %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -722,9 +730,9 @@ gpu.module @test_kernel {
]> : vector<32xindex>
%c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
+ %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>}: vector<32xi1>
- %st_vec = arith.constant dense<1023.>: vector<32x4xf32>
+ %st_vec = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16, 2]>} dense<1023.>: vector<32x4xf32>
xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1>
gpu.return
}
>From 31cc1660340280a7a454a5016397f020fa39e9ed Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Fri, 6 Feb 2026 22:21:55 +0000
Subject: [PATCH 6/6] Clean up
---
.../XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 18 ------------------
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 1 -
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 3 +--
3 files changed, 1 insertion(+), 21 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 11f0270440f7b..826b6cd96bf29 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1879,22 +1879,4 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
if (failed(
applyPartialConversion(getOperation(), target, std::move(patterns))))
return signalPassFailure();
-
- // Remove sg_layout and sg_data attributes from the Layout
- // attribute for each VectorType result of the operation.
- // For Structured Control Flow ops, the layout is simply removed,
- // since in 1:N case, the layout for new results are missing.
- // Layout propagation pass will activated.
- getOperation()->walk([](Operation *op) {
- for (OpResult result : op->getOpResults()) {
- std::string name = xegpu::getTemporaryLayoutName(result);
- if (auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
- op->removeAttr(name);
- if (!isa<scf::IfOp, scf::ForOp, scf::WhileOp, scf::ConditionOp>(op)) {
- if (auto newLayout = layout.dropSgLayoutAndData())
- op->setAttr(name, newLayout);
- }
- }
- }
- });
}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index e3567b67da7fc..7e28c756f2d72 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -179,7 +179,6 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
return nullptr;
}
-
xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index b2fc7231dad60..063adaf717f63 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -196,8 +196,7 @@ gpu.module @test_distribution {
%9 = xegpu.dpas %arg4, %arg5, %arg6
{layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
- layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>,
- layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
: vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
%10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
%11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
More information about the Mlir-commits mailing list