[Mlir-commits] [mlir] [MLIR][XeGPU] Preserve anchor layouts in recoverTemporaryLayout (PR #182186)
Nishant Patel
llvmlistbot at llvm.org
Fri Feb 27 10:43:48 PST 2026
https://github.com/nbpatel updated https://github.com/llvm/llvm-project/pull/182186
>From 54790e9c306bc1d1725afc91c6cb032a365b8591 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Wed, 18 Feb 2026 23:30:31 +0000
Subject: [PATCH 1/2] Fix recoverTemporaryLayout
---
.../lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp | 10 +++++++++-
.../test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 12 ++++++++++++
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index eb7fab3610218..75a9149e93c1a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -93,7 +93,15 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
// layout attributes to.
if (isa<BlockArgument>(operand.get()))
continue;
- auto layout = xegpu::getDistributeLayoutAttr(operand.get());
+ // First, try to get the layout from the consumer (operand) side. This
+ // preserves anchor layouts on ops like load/store, which may carry
+ // user-specified inst_data/lane fields that the defining op (e.g.,
+ // multi_reduction) does not have.
+ auto layout = xegpu::getDistributeLayoutAttr(operand);
+ // Fall back to the producer (Value) side if the consumer side has no
+ // layout.
+ if (!layout)
+ layout = xegpu::getDistributeLayoutAttr(operand.get());
if (!layout) {
op->emitWarning("Could not find layout attribute for operand ")
<< operand.getOperandNumber() << " of operation " << op->getName();
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index e2e94c5f0300f..199bbfd8a6d37 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -969,4 +969,16 @@ gpu.module @test_distribution {
gpu.return
}
+ // CHECK-LABEL: @preserve_anchor_layout
+ // CHECK: arith.constant dense<1.000000e+00> : vector<16x128xf32>
+ // CHECK: xegpu.store_nd %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] <{layout = #xegpu.layout<inst_data = [8, 16]>}>
+ gpu.func @preserve_anchor_layout(%dst: memref<256x128xf32>) {
+ %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128]>} dense<1.0> : vector<256x128xf32>
+ %tdesc = xegpu.create_nd_tdesc %dst : memref<256x128xf32>
+ -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128], inst_data = [8, 16]>>
+ xegpu.store_nd %val, %tdesc[0, 0] <{layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128], inst_data = [8, 16]>}>
+ : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128], inst_data = [8, 16]>>
+ gpu.return
+ }
+
}
>From 89c23cc17fba2eb6b5aeefb086504b2e55f9b8b1 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Fri, 27 Feb 2026 18:43:04 +0000
Subject: [PATCH 2/2] Use setTemporaryLayout in recoverTemporaryLayout
---
.../lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp | 12 ++----------
mlir/test/Dialect/XeGPU/subgroup-distribute.mlir | 12 ++++++------
.../test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 2 +-
3 files changed, 9 insertions(+), 17 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index b90f4256a3b57..352d85584e921 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -93,21 +93,13 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
// layout attributes to.
if (isa<BlockArgument>(operand.get()))
continue;
- // First, try to get the layout from the consumer (operand) side. This
- // preserves anchor layouts on ops like load/store, which may carry
- // user-specified inst_data/lane fields that the defining op (e.g.,
- // multi_reduction) does not have.
- auto layout = xegpu::getDistributeLayoutAttr(operand);
- // Fall back to the producer (Value) side if the consumer side has no
- // layout.
- if (!layout)
- layout = xegpu::getDistributeLayoutAttr(operand.get());
+ auto layout = xegpu::getDistributeLayoutAttr(operand.get());
if (!layout) {
op->emitWarning("Could not find layout attribute for operand ")
<< operand.getOperandNumber() << " of operation " << op->getName();
continue;
}
- xegpu::setDistributeLayoutAttr(operand, layout);
+ xegpu::setTemporaryLayout(operand, layout);
}
return WalkResult::advance();
});
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index dae00838fdcb6..5cf4ae64a0fd4 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -42,7 +42,7 @@ gpu.module @xevm_module{
%6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> ->
!xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %5, %6[%c0, %c0] : vector<8x16xf32>,
+ xegpu.store_nd %5, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
!xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -112,7 +112,7 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
scf.yield %9 : vector<8x16xf32>
} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>,
+ xegpu.store_nd %4, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
!xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -150,7 +150,7 @@ gpu.module @xevm_module{
} dense<12.> : vector<16x8xf16>
scf.yield %3 : vector<16x8xf16>
} { layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> }
- xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+ xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
gpu.return
}
}
@@ -175,7 +175,7 @@ gpu.module @xevm_module{
%3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
- xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+ xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
}
gpu.return
}
@@ -215,7 +215,7 @@ gpu.module @xevm_module{
: vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
%7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32>
-> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %6, %7[%c0, %c0] : vector<8x16xf32>,
+ xegpu.store_nd %6, %7[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
!xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
@@ -267,7 +267,7 @@ gpu.module @xevm_module{
{
layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
}
- xegpu.store_nd %3#0, %arg1[%c0, %c0]
+ xegpu.store_nd %3#0, %arg1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 86f4c2e3ffe9d..fe9e3683edf7c 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -64,7 +64,7 @@ gpu.module @test_distribution {
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
- xegpu.store_nd %load, %tdesc[0, 0]
+ xegpu.store_nd %load, %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
More information about the Mlir-commits mailing list