[Mlir-commits] [mlir] [MLIR][XeGPU] Preserve anchor layouts in recoverTemporaryLayout (PR #182186)

Fri Feb 27 10:43:48 PST 2026

https://github.com/nbpatel updated https://github.com/llvm/llvm-project/pull/182186

>From 54790e9c306bc1d1725afc91c6cb032a365b8591 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Wed, 18 Feb 2026 23:30:31 +0000
Subject: [PATCH 1/2] Fix recoverTemporaryLayout

---
 .../lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp | 10 +++++++++-
 .../test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 12 ++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index eb7fab3610218..75a9149e93c1a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -93,7 +93,15 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
       // layout attributes to.
       if (isa<BlockArgument>(operand.get()))
         continue;
-      auto layout = xegpu::getDistributeLayoutAttr(operand.get());
+      // First, try to get the layout from the consumer (operand) side. This
+      // preserves anchor layouts on ops like load/store, which may carry
+      // user-specified inst_data/lane fields that the defining op (e.g.,
+      // multi_reduction) does not have.
+      auto layout = xegpu::getDistributeLayoutAttr(operand);
+      // Fall back to the producer (Value) side if the consumer side has no
+      // layout.
+      if (!layout)
+        layout = xegpu::getDistributeLayoutAttr(operand.get());
       if (!layout) {
         op->emitWarning("Could not find layout attribute for operand ")
             << operand.getOperandNumber() << " of operation " << op->getName();
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index e2e94c5f0300f..199bbfd8a6d37 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -969,4 +969,16 @@ gpu.module @test_distribution {
     gpu.return
   }
 
+  // CHECK-LABEL: @preserve_anchor_layout
+  // CHECK: arith.constant dense<1.000000e+00> : vector<16x128xf32>
+  // CHECK: xegpu.store_nd %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] <{layout = #xegpu.layout<inst_data = [8, 16]>}>
+  gpu.func @preserve_anchor_layout(%dst: memref<256x128xf32>) {
+    %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128]>} dense<1.0> : vector<256x128xf32>
+    %tdesc = xegpu.create_nd_tdesc %dst : memref<256x128xf32>
+      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128], inst_data = [8, 16]>>
+    xegpu.store_nd %val, %tdesc[0, 0] <{layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128], inst_data = [8, 16]>}>
+      : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128], inst_data = [8, 16]>>
+    gpu.return
+  }
+
 }

>From 89c23cc17fba2eb6b5aeefb086504b2e55f9b8b1 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Fri, 27 Feb 2026 18:43:04 +0000
Subject: [PATCH 2/2] Use setTemporaryLayout in recoverTemporaryLayout

---
 .../lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp | 12 ++----------
 mlir/test/Dialect/XeGPU/subgroup-distribute.mlir     | 12 ++++++------
 .../test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir |  2 +-
 3 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index b90f4256a3b57..352d85584e921 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -93,21 +93,13 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
       // layout attributes to.
       if (isa<BlockArgument>(operand.get()))
         continue;
-      // First, try to get the layout from the consumer (operand) side. This
-      // preserves anchor layouts on ops like load/store, which may carry
-      // user-specified inst_data/lane fields that the defining op (e.g.,
-      // multi_reduction) does not have.
-      auto layout = xegpu::getDistributeLayoutAttr(operand);
-      // Fall back to the producer (Value) side if the consumer side has no
-      // layout.
-      if (!layout)
-        layout = xegpu::getDistributeLayoutAttr(operand.get());
+      auto layout = xegpu::getDistributeLayoutAttr(operand.get());
       if (!layout) {
         op->emitWarning("Could not find layout attribute for operand ")
             << operand.getOperandNumber() << " of operation " << op->getName();
         continue;
       }
-      xegpu::setDistributeLayoutAttr(operand, layout);
+      xegpu::setTemporaryLayout(operand, layout);
     }
     return WalkResult::advance();
   });
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index dae00838fdcb6..5cf4ae64a0fd4 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -42,7 +42,7 @@ gpu.module @xevm_module{
 
     %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> ->
       !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    xegpu.store_nd %5, %6[%c0, %c0] : vector<8x16xf32>,
+    xegpu.store_nd %5, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
       !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     gpu.return
   }
@@ -112,7 +112,7 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
     scf.yield %9 : vector<8x16xf32>
   } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
 
-  xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>,
+  xegpu.store_nd %4, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
     !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
@@ -150,7 +150,7 @@ gpu.module @xevm_module{
       } dense<12.> : vector<16x8xf16>
       scf.yield %3 : vector<16x8xf16>
     } { layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> }
-    xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+    xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
     gpu.return
   }
 }
@@ -175,7 +175,7 @@ gpu.module @xevm_module{
       %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
         layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
       } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-      xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+      xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
     }
     gpu.return
   }
@@ -215,7 +215,7 @@ gpu.module @xevm_module{
       : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
     %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32>
       -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    xegpu.store_nd %6, %7[%c0, %c0] : vector<8x16xf32>,
+    xegpu.store_nd %6, %7[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
       !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     gpu.return
 
@@ -267,7 +267,7 @@ gpu.module @xevm_module{
     {
       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
     }
-    xegpu.store_nd %3#0, %arg1[%c0, %c0]
+    xegpu.store_nd %3#0, %arg1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     gpu.return
   }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 86f4c2e3ffe9d..fe9e3683edf7c 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -64,7 +64,7 @@ gpu.module @test_distribution {
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
       : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
       -> vector<256x128xf32>
-    xegpu.store_nd %load, %tdesc[0, 0]
+    xegpu.store_nd %load, %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
     gpu.return
 }