[Mlir-commits] [mlir] [MLIR][XeGPU] Clean up the temporary layout usage in XeGPU test (PR #195739)

Tue May 5 09:24:59 PDT 2026

https://github.com/Jianhui-Li updated https://github.com/llvm/llvm-project/pull/195739

>From 485f1803518648383b45f05f6565e32e770c5d15 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 23 Apr 2026 22:24:35 +0000
Subject: [PATCH 01/11] remove recursive loop in getDistributeLayoutAttr. clean
 the layout before recover, need to fix test as next step

---
 .../Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp | 16 +---------------
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp      |  3 ++-
 2 files changed, 3 insertions(+), 16 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 7d48315eec6ff..8188cfa08779d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -33,20 +33,6 @@
 
 using namespace mlir;
 
-void xegpu::recoverTemporaryLayoutsDeprecated(Operation *op) {
-  op->walk([&](Operation *nestOp) {
-    for (OpOperand &opr : nestOp->getOpOperands()) {
-      auto layout = getDistributeLayoutAttr(opr.get());
-      setDistributeLayoutAttr(opr, layout);
-    }
-
-    for (OpResult result : nestOp->getOpResults()) {
-      auto layout = getDistributeLayoutAttr(result);
-      setDistributeLayoutAttr(result, layout);
-    }
-  });
-}
-
 SmallVector<NamedAttribute>
 xegpu::dropSgLayoutAndDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
   SmallVector<NamedAttribute> out;
@@ -293,7 +279,7 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
       }
     });
   };
-
+  removeTemporaryLayoutAttrs(rootOp);
   rootOp->walk([&](func::FuncOp func) {
     processFunc(func.getBody(), func.getSymName());
   });
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 2d1ce6eea17aa..13288a377e69a 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -162,7 +162,8 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
     if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
       OpOperand *tiedInit = loop.getTiedLoopInit(arg);
       if (tiedInit)
-        return getDistributeLayoutAttr(tiedInit->get());
+        // return getDistributeLayoutAttr(tiedInit->get());
+        return getTemporaryLayout(*tiedInit);
     }
   }
 

>From c47269160cef27bab664a0b257aef671084c8b69 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 1 May 2026 02:46:01 +0000
Subject: [PATCH 02/11] fix tests

---
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      |   2 +-
 .../Transforms/XeGPUWgToSgDistribute.cpp      |   9 +-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     |  10 ++
 .../Dialect/XeGPU/sg-to-wi-experimental.mlir  |   8 +-
 .../XeGPU/xegpu-wg-to-sg-elemwise.mlir        |  52 ++++++-
 .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir |  44 +++++-
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir   | 137 ++++++++++++++++++
 7 files changed, 248 insertions(+), 14 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 2d5e1a6397278..eda7b18bd978a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -134,7 +134,7 @@ static void propagateResultsToRegularOperands(Operation *op) {
       result.setType(typeWithLayout);
     }
   }
-  if (isa<VectorType>(resultType) && resLayout)
+  if (resLayout)
     xegpu::setTemporaryLayout(result, resLayout);
 
   for (OpOperand &opr : op->getOpOperands()) {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 8aa0758943cd1..af82effb9d379 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1562,7 +1562,6 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
 
   target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp,
                                vector::TransposeOp, vector::BroadcastOp,
-                               vector::MultiDimReductionOp,
                                vector::ConstantMaskOp, vector::CreateMaskOp>(
       [=](Operation *op) -> bool {
         // Check for either a SliceAttr or LayoutAttr on the result.
@@ -1570,7 +1569,13 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
             xegpu::getTemporaryLayout(dyn_cast<OpResult>(op->getResult(0)));
         return isLegal(layout);
       });
-
+  target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
+      [=](Operation *op) -> bool {
+        // Check the operand; the result may be a scalar with no layout.
+        auto layout =
+            xegpu::getTemporaryLayout(dyn_cast<vector::MultiDimReductionOp>(op)->getOpOperand(0));
+        return isLegal(layout);
+      });
   target.addDynamicallyLegalOp<xegpu::LoadGatherOp>(
       [=](xegpu::LoadGatherOp op) -> bool {
         auto layout = op.getLayoutAttr();
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 08b334ddec3fc..8d65ea497ad88 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -32,6 +32,11 @@ gpu.func @load_nd() {
   %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+  %anchor = xegpu.convert_layout %1
+    <{
+      input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }> : vector<16x16xf16>
   gpu.return
 }
 
@@ -116,6 +121,11 @@ gpu.func @dpas() {
      layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
      layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>  -> vector<8x16xf32>
+  %anchor = xegpu.convert_layout %4
+    <{
+      input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }> : vector<8x16xf32>
   gpu.return
 }
 
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
index c4401515414b6..e02bd9b0370ad 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -253,10 +253,16 @@ gpu.module @xevm_module{
        layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
     %5 = math.exp %4
+      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<8x16xf32>
     %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> ->
       !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    xegpu.store_nd %5, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
+    %anchor = xegpu.convert_layout %5
+      <{
+        input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+      }> : vector<8x16xf32>
+    xegpu.store_nd %anchor, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
       !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     gpu.return
   }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
index 53ce8d0e38949..94e8b7504a1d6 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
@@ -14,9 +14,14 @@ gpu.module @test_elementwise_ops {
       {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
       : vector<24x32xf32>
     // CHECK: arith.negf {{.*}} : vector<12x8xf32>
-    %negf = arith.negf %load_a
+    %negf = arith.negf %exp
       {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
       : vector<24x32xf32>
+    %anchor = xegpu.convert_layout %negf
+      <{
+        input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>,
+        target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>
+      }> : vector<24x32xf32>
     gpu.return
   }
 
@@ -32,9 +37,14 @@ gpu.module @test_elementwise_ops {
       {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xf32>
     // CHECK: arith.negf {{.*}} : vector<12x8xf32>
-    %negf = arith.negf %load_a
+    %negf = arith.negf %exp
       {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xf32>
+    %anchor = xegpu.convert_layout %negf
+      <{
+        input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>
+      }> : vector<24x32xf32>
     gpu.return
   }
 
@@ -55,9 +65,14 @@ gpu.module @test_elementwise_ops {
       {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xf32>
     // CHECK: math.powf {{.*}}, {{.*}} : vector<12x8xf32>
-    %powf = math.powf %load_a, %load_b
+    %powf = math.powf %addf, %load_b
       {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xf32>
+    %anchor = xegpu.convert_layout %powf
+      <{
+        input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>
+      }> : vector<24x32xf32>
     gpu.return
   }
 
@@ -83,9 +98,14 @@ gpu.module @test_elementwise_ops {
       {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xi1>, vector<24x32xf32>
     // CHECK: math.fma  {{.*}}, {{.*}}, {{.*}} : vector<12x8xf32>
-    %fma = math.fma %load_a, %load_b, %load_a
+    %fma = math.fma %load_a, %load_b, %select
       {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xf32>
+    %anchor = xegpu.convert_layout %fma
+      <{
+        input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>
+      }> : vector<24x32xf32>
     gpu.return
   }
 
@@ -105,10 +125,15 @@ gpu.module @test_elementwise_ops {
     %truncf = arith.truncf %load_a
       {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xf32> to vector<24x32xf16>
-    // CHECK: arith.bitcast {{.*}} : vector<12x8xi32> to vector<12x8xf32>
-    %bitcast = arith.bitcast %load_b
+    // CHECK: arith.bitcast {{.*}} : vector<12x8xf16> to vector<12x8xi16>
+    %bitcast = arith.bitcast %truncf
       {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : vector<24x32xi32> to vector<24x32xf32>
+      : vector<24x32xf16> to vector<24x32xi16>
+    %anchor = xegpu.convert_layout %bitcast
+      <{
+        input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>
+      }> : vector<24x32xi16>
     gpu.return
   }
 
@@ -142,6 +167,12 @@ gpu.module @test_elementwise_ops {
     %cmpi = arith.cmpi eq, %load_c, %load_d
       {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xi32>
+    %res = arith.select %cmpi, %cmpi, %cmpf : vector<24x32xi1>, vector<24x32xi1>
+    %anchor = xegpu.convert_layout %res
+      <{
+        input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>
+      }> : vector<24x32xi1>
     gpu.return
   }
 
@@ -165,9 +196,14 @@ gpu.module @test_elementwise_ops {
       : vector<24x32xf32>
     // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} : vector<2x2xf32>
     // CHECK-NOT: math.powf
-    %powf = math.powf %load_a, %load_b
+    %powf = math.powf %negf, %load_b
       {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
       : vector<24x32xf32>
+    %anchor = xegpu.convert_layout %powf
+      <{
+        input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>
+      }> : vector<24x32xf32>
     gpu.return
   }
 }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 17a5db6b8401d..fefe2091d458d 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -21,6 +21,11 @@ gpu.module @test_distribution {
     %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
       : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
       -> vector<256x128xf32>
+    %anchor = xegpu.convert_layout %load
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>
+      }> : vector<256x128xf32>
     gpu.return
   }
 
@@ -91,6 +96,11 @@ gpu.module @test_distribution {
     // CHECK-NOT: arith.addf
     %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>} [1]
       : vector<256x64xf32> to vector<256xf32>
+    %anchor = xegpu.convert_layout %reduce
+      <{
+        input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>,
+        target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>
+      }> : vector<256xf32>
     gpu.return
   }
 
@@ -130,6 +140,11 @@ gpu.module @test_distribution {
     // CHECK-NOT: vector.transpose
     %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
     : vector<256x128xf32> to vector<128x256xf32>
+    %anchor = xegpu.convert_layout %trans
+      <{
+        input_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1]>
+      }> : vector<128x256xf32>
       gpu.return
   }
 
@@ -138,6 +153,11 @@ gpu.module @test_distribution {
     // CHECK-COUNT-4: vector.create_mask {{.*}}, {{.*}} : vector<16x16xi1>
     // CHECK-NOT: vector.create_mask
     %constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>} : vector<256x128xi1>
+    %anchor = xegpu.convert_layout %constant_mask
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>
+      }> : vector<256x128xi1>
     gpu.return
   }
 
@@ -146,6 +166,11 @@ gpu.module @test_distribution {
     // CHECK-NOT: vector.create_mask
     %cst16 = arith.constant 16 : index
     %constant_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>} : vector<256x128xi1>
+    %anchor = xegpu.convert_layout %constant_mask
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>
+      }> : vector<256x128xi1>
     gpu.return
   }
 
@@ -160,8 +185,8 @@ gpu.module @test_distribution {
     %2 = vector.multi_reduction <maximumf>, %1, %cst_0 {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1], inst_data = [8, 1]>, dims = [1]>} [1] : vector<256x128xf32> to vector<256xf32>
     %3 = vector.shape_cast %2 {layout_result_0 =  #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1], inst_data = [8, 1]>, layout_operand_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1], inst_data = [8, 1]>, dims = [1]>} : vector<256xf32> to vector<256x1xf32>
     %4 = vector.broadcast %3 {layout_result_0 =  #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>} : vector<256x1xf32>to vector<256x128xf32>
-    %9 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>>
-    xegpu.store_nd %4, %9[%block_id_x, 0] : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>>
+    %9 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
+    xegpu.store_nd %4, %9[%block_id_x, 0] <{layout =#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>}>: vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
     gpu.return
   }
 
@@ -214,6 +239,11 @@ gpu.module @test_distribution {
     %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>} dense<true> : vector<8x256xi1>
     %val = xegpu.load %arg0[%offset], %mask <{layout = #xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>}> : memref<2048xf32, 1>, vector<8x256xindex>, vector<8x256xi1> -> vector<8x256xf32>
     %reduce = vector.multi_reduction <add>, %val, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>, dims = [1]>} [1] : vector<8x256xf32> to vector<8xf32>
+    %anchor = xegpu.convert_layout %reduce
+      <{
+        input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>, dims = [1]>,
+        target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>, dims = [1]>
+      }> : vector<8xf32>
     gpu.return
   }
 
@@ -240,6 +270,11 @@ gpu.module @test_distribution {
     // CHECK: %[[RES1:.*]] = vector.broadcast %[[ADD4]] : vector<4xindex> to vector<16x4xindex>
     %2 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>, dims = [0]>} : vector<8xindex>
     %bcast = vector.broadcast %2 {layout_result_0 = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>} : vector<8xindex> to vector<256x8xindex>
+    %anchor = xegpu.convert_layout %bcast
+      <{
+        input_layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>,
+        target_layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>
+      }> : vector<256x8xindex>
     gpu.return
   }
 
@@ -265,6 +300,11 @@ gpu.module @test_distribution {
     %broadcast = vector.broadcast %load
       {layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
       : vector<128x1xf32> to vector<128x64xf32>
+    %anchor = xegpu.convert_layout %broadcast
+      <{
+        input_layout = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>
+      }> : vector<128x64xf32>
     gpu.return
   }
 
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index f2cc05808ed12..c9aff190d84d7 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -43,6 +43,11 @@ gpu.module @test_distribution {
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
       : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
       -> vector<256x128xf32>
+    %anchor = xegpu.convert_layout %load
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>
+      }> : vector<256x128xf32>
     gpu.return
   }
 
@@ -133,6 +138,10 @@ gpu.module @test_distribution {
     %broadcast = vector.broadcast %load
       {layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
       : vector<256x1xf32> to vector<256x32xf32>
+    %anchor = xegpu.convert_layout %broadcast
+      <{input_layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>}>
+      : vector<256x32xf32>
     gpu.return
   }
 
@@ -148,6 +157,11 @@ gpu.module @test_distribution {
     %broadcast = vector.broadcast %load
       {layout_result_0 = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<1x128xf32> to vector<32x128xf32>
+    %anchor = xegpu.convert_layout %broadcast
+      <{
+        input_layout = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>
+      }> : vector<32x128xf32>
     gpu.return
   }
 
@@ -232,6 +246,11 @@ gpu.module @test_distribution {
         : !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
         -> vector<128x64xf32>
       %exp = math.exp %load {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
+      %anchor = xegpu.convert_layout %exp
+        <{
+          input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>,
+          target_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>
+        }> : vector<128x64xf32>
     }{sg_id_range = #xegpu.range<[2, 18]>}
     gpu.return
   }
@@ -261,6 +280,11 @@ gpu.module @test_distribution {
           : !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
           -> vector<128x64xf32>
         %exp = math.exp %ld {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
+        %anchor = xegpu.convert_layout %exp
+          <{
+            input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>,
+            target_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>
+          }> : vector<128x64xf32>
     }
   } {sg_id_range = #xegpu.range<[3, 19]>}
   gpu.return
@@ -374,6 +398,10 @@ gpu.module @test_distribution {
     // CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [0] : vector<4x4xf32> to vector<4xf32>
     %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>} [0]
       : vector<4x128xf32> to vector<128xf32>
+    %anchor = xegpu.convert_layout %reduce
+      <{input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>, 
+      target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>}>
+      : vector<128xf32>
     gpu.return
   }
 
@@ -388,6 +416,10 @@ gpu.module @test_distribution {
     // CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32>
     %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>} [1]
       : vector<256x64xf32> to vector<256xf32>
+    %anchor = xegpu.convert_layout %reduce
+      <{input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>, 
+      target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>}>
+      : vector<256xf32>
     gpu.return
   }
 
@@ -400,6 +432,11 @@ gpu.module @test_distribution {
       // CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [3] : vector<1x1x1x32xf16> to vector<1x1x1xf16>
       %reduce = vector.multi_reduction <add>, %load, %cst_acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} [3]
       : vector<4x2x6x32xf16> to vector<4x2x6xf16>
+      %anchor = xegpu.convert_layout %reduce
+        <{
+          input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>,
+          target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>
+        }> : vector<4x2x6xf16>
       gpu.return
     }
 
@@ -427,6 +464,11 @@ gpu.module @test_distribution {
       -> vector<32x32xf32>
     %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>, dims = [0, 1]>} [0, 1]
       : vector<32x32xf32> to f32
+    %anchor = xegpu.convert_layout %reduce
+        <{
+          input_layout =  #xegpu.slice<#xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>, dims = [0, 1]>,
+          target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>, dims = [0, 1]>
+        }> : f32
     gpu.return
   }
 
@@ -446,6 +488,11 @@ gpu.module @test_distribution {
     //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
     //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
     %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex>
+    %anchor = xegpu.convert_layout %step
+      <{
+        input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>,
+        target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>
+      }> : vector<128xindex>
     gpu.return
   }
 
@@ -461,6 +508,11 @@ gpu.module @test_distribution {
     //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex>
     //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex>
     %step = vector.step {layout_result_0 = #xegpu.layout<sg_layout = [16], sg_data = [8]>}: vector<128xindex>
+    %anchor = xegpu.convert_layout %step
+      <{
+        input_layout = #xegpu.layout<sg_layout = [16], sg_data = [8]>,
+        target_layout = #xegpu.layout<sg_layout = [16], sg_data = [8]>
+      }> : vector<128xindex>
     gpu.return
   }
 
@@ -478,6 +530,11 @@ gpu.module @test_distribution {
     %muli = arith.muli %cst, %step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, dims = [0, 1, 2]>} : vector<128xindex>
     //CHECK: vector.shape_cast {{.*}} : vector<32xindex> to vector<1x1x1x32xindex>
     %shape_cast = vector.shape_cast %muli {layout_result_0 = #xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, layout_operand_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, dims = [0, 1, 2]>} : vector<128xindex> to vector<1x1x1x128xindex>
+    %anchor = xegpu.convert_layout %shape_cast
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>
+      }> : vector<1x1x1x128xindex>
     gpu.return
   }
 
@@ -486,6 +543,11 @@ gpu.module @test_distribution {
     %muli = arith.muli %arg0, %arg1 : index
     // CHECK: vector.broadcast {{.*}} : index to vector<1x1x1x32xindex>
     %broadcast = vector.broadcast %muli {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} : index to vector<4x2x6x32xindex>
+    %anchor = xegpu.convert_layout %broadcast
+      <{
+        input_layout = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>,
+        target_layout = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>
+      }> : vector<4x2x6x32xindex>
     gpu.return
   }
 
@@ -499,6 +561,11 @@ gpu.module @test_distribution {
     //CHECK: vector.transpose {{.*}}, [1, 0] : vector<64x32xf32> to vector<32x64xf32>
     %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<256x32xf32> to vector<32x256xf32>
+    %anchor = xegpu.convert_layout %trans
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1]>
+      }> : vector<32x256xf32>
       gpu.return
   }
 
@@ -568,6 +635,11 @@ gpu.module @test_distribution {
   gpu.func @scalar_broadcast(%arg0: index) {
     // CHECK: vector.broadcast {{.*}} : index to vector<1x1x1xindex>
     %broadcast = vector.broadcast %arg0 {layout_result_0 = #xegpu.layout<sg_layout = [4, 8, 1], sg_data = [1, 1, 1]>} : index to vector<4x1x1xindex>
+    %anchor = xegpu.convert_layout %broadcast
+      <{
+        input_layout = #xegpu.layout<sg_layout = [4, 8, 1], sg_data = [1, 1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [4, 8, 1], sg_data = [1, 1, 1]>
+      }> : vector<4x1x1xindex>
     gpu.return
   }
 
@@ -582,6 +654,11 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[MIN:.*]] = arith.minsi %[[MAX]], %[[C16:.*]] : index
     // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MIN]] : vector<16xi1>
     %constant_mask = vector.constant_mask [8] {layout_result_0 = #xegpu.layout<sg_layout = [2], sg_data = [16]>} : vector<32xi1>
+    %anchor = xegpu.convert_layout %constant_mask
+      <{
+        input_layout = #xegpu.layout<sg_layout = [2], sg_data = [16]>,
+        target_layout = #xegpu.layout<sg_layout = [2], sg_data = [16]>
+      }> : vector<32xi1>
     gpu.return
   }
 
@@ -603,6 +680,11 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[MINCOL:.*]] = arith.minsi %[[MAXCOL]], %[[C32:.*]] : index
     // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MINROW]], %[[MINCOL]] : vector<32x32xi1>
     %constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xi1>
+    %anchor = xegpu.convert_layout %constant_mask
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>
+      }> : vector<256x128xi1>
     gpu.return
   }
 
@@ -618,6 +700,11 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MIN]] : vector<16xi1>
     %cst8 = arith.constant 8 : index
     %constant_mask = vector.create_mask %cst8 {layout_result_0 = #xegpu.layout<sg_layout = [2], sg_data = [16]>} : vector<32xi1>
+    %anchor = xegpu.convert_layout %constant_mask
+      <{
+        input_layout = #xegpu.layout<sg_layout = [2], sg_data = [16]>,
+        target_layout = #xegpu.layout<sg_layout = [2], sg_data = [16]>
+      }> : vector<32xi1>
     gpu.return
   }
 
@@ -640,6 +727,11 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MINROW]], %[[MINCOL]] : vector<32x32xi1>
     %cst16 = arith.constant 16 : index
     %constant_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xi1>
+    %anchor = xegpu.convert_layout %constant_mask
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>
+      }> : vector<256x128xi1>
     gpu.return
   }
 
@@ -656,6 +748,11 @@ gpu.module @test_distribution {
     // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] : vector<32xf32> to vector<32x32xf32>
     %4 = vector.broadcast %3 {layout_result_0 =
         #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>} : vector<256xf32> to vector<256x256xf32>
+    %anchor = xegpu.convert_layout %4
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>
+      }> : vector<256x256xf32>
     gpu.return
   }
 
@@ -684,6 +781,11 @@ gpu.module @test_distribution {
     %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>} dense<true> : vector<1x32x32xi1>
     %14 = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>} : memref<?xf32>, vector<1x32x32xindex>, vector<1x32x32xi1> -> vector<1x32x32xf32>
     %15 = vector.multi_reduction <add>, %14, %cst_3 {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>} [1] : vector<1x32x32xf32> to vector<1x32xf32>
+    %anchor = xegpu.convert_layout %15
+      <{
+        input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>,
+        target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>
+      }> : vector<1x32xf32>
     gpu.return
   }
 
@@ -721,6 +823,11 @@ gpu.module @test_distribution {
       -> vector<256x128xf32>
     %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0]
       : vector<256x128xf32> to vector<128xf32>
+    %anchor = xegpu.convert_layout %reduce
+      <{
+        input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>,
+        target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>
+      }> : vector<128xf32>
     gpu.return
   }
 
@@ -749,6 +856,11 @@ gpu.module @test_distribution {
     %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>} dense<true> : vector<2x2x128x128xi1>
     %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>} : memref<?xf32>, vector<2x2x128x128xindex>, vector<2x2x128x128xi1> -> vector<2x2x128x128xf32>
     %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>} [2, 3] : vector<2x2x128x128xf32> to vector<2x2xf32>
+    %anchor = xegpu.convert_layout %reduce
+      <{
+        input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>,
+        target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>
+      }> : vector<2x2xf32>
     gpu.return
   }
 
@@ -777,6 +889,11 @@ gpu.module @test_distribution {
     %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>} dense<true> : vector<32x32x128x128xi1>
     %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>} : memref<?xf32>, vector<32x32x128x128xindex>, vector<32x32x128x128xi1> -> vector<32x32x128x128xf32>
     %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>} [2, 3] : vector<32x32x128x128xf32> to vector<32x32xf32>
+    %anchor = xegpu.convert_layout %reduce
+      <{
+        input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>,
+        target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>
+      }> : vector<32x32xf32>
     gpu.return
   }
 
@@ -790,6 +907,11 @@ gpu.module @test_distribution {
     %load =  xegpu.load_nd %tdesc[0, 0] <{layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [32, 16],lane_layout = [1, 16], lane_data = [1, 1]>}>
       : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
       -> vector<256x128xf32>
+    %anchor = xegpu.convert_layout %load
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>
+      }> : vector<256x128xf32>
     gpu.return
   }
 
@@ -858,6 +980,11 @@ gpu.module @test_distribution {
     %1 = xegpu.load_nd %0[0, 0] {layout = #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>} : !xegpu.tensor_desc<128x256xf32, #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>> -> vector<128x256xf32>
     %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>,
                                    target_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 32], inst_data = [16, 16]>}> : vector<128x256xf32>
+    %anchor = xegpu.convert_layout %2
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 32], inst_data = [16, 16]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 32], inst_data = [16, 16]>
+      }> : vector<128x256xf32>
     gpu.return
   }
 
@@ -896,6 +1023,11 @@ gpu.module @test_distribution {
     %1 = xegpu.load %arg0[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [8, 4, 16], sg_data = [1, 32, 16], inst_data = [1, 16, 16]>} : memref<?xf32>, vector<8x128x256xindex>, vector<8x128x256xi1> -> vector<8x128x256xf32>
     %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [8, 4, 16], sg_data = [1, 32, 16], inst_data = [1, 16, 16]>,
                                    target_layout = #xegpu.layout<sg_layout = [8, 8, 8], sg_data = [1, 16, 32], inst_data = [1, 16, 16]>}> : vector<8x128x256xf32>
+    %anchor = xegpu.convert_layout %2
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 8, 8], sg_data = [1, 16, 32], inst_data = [1, 16, 16]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 8, 8], sg_data = [1, 16, 32], inst_data = [1, 16, 16]>
+      }> : vector<8x128x256xf32>
     gpu.return
   }
 
@@ -945,6 +1077,11 @@ gpu.module @test_distribution {
     %bcast2 = vector.broadcast %scast2 {layout_result_0 =
         #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 16]>, layout_operand_0 =
         #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 1, 32, 16, 16]>} : vector<256x16x1x256x16x16xf32> to vector<256x16x16x256x16x16xf32>
+    %anchor = xegpu.convert_layout %bcast2
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 16]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 16]>
+      }> : vector<256x16x16x256x16x16xf32>
     gpu.return
   }
 

>From eb8574e16ce2f32a741aa67550fad4282c7cb433 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 1 May 2026 16:46:17 +0000
Subject: [PATCH 03/11] fix tests

---
 .../XeGPU/Transforms/XeGPULayoutImpl.h        |  10 +
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 113 +++++++
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 318 ++++++------------
 .../Dialect/XeGPU/sg-to-wi-experimental.mlir  |  14 +-
 mlir/test/Dialect/XeGPU/xegpu-blocking.mlir   |   9 +-
 .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir |  14 +-
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir   |  40 ++-
 7 files changed, 285 insertions(+), 233 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index 2dd8d9f610faf..cafd3f392ff72 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -117,6 +117,16 @@ inferInsertStridedSliceSourceLayout(DistributeLayoutAttr resLayout,
                                     ArrayRef<int64_t> resShape,
                                     ArrayRef<int64_t> srcShape);
 
+/// Infers the source layout attribute for an insert operation,
+/// using the same logic as inferInsertStridedSliceSourceLayout.
+DistributeLayoutAttr inferInsertSourceLayout(DistributeLayoutAttr resLayout,
+                                             ArrayRef<int64_t> resShape,
+                                             ArrayRef<int64_t> srcShape);
+
+DistributeLayoutAttr inferExtractSourceLayout(DistributeLayoutAttr resLayout,
+                                              ArrayRef<int64_t> resShape,
+                                              ArrayRef<int64_t> srcShape);
+
 /// Infers the layout attribute for mask and offset operand for Chunked load
 /// and store, given the anchor layout attribute for the value being load/store.
 DistributeLayoutAttr
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index eda7b18bd978a..5cd1a8e9c83ec 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -286,6 +286,11 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
   rootOp->walk([&](gpu::GPUFuncOp func) {
     processFunc(func.getBody(), func.getName());
   });
+  // Dump the root op here for debugging purposes.
+
+  llvm::dbgs() << "After recovering temporary layout attributes for function: "
+               << rootOp->getName() << "\n";
+  rootOp->dump();
 
   return true;
 }
@@ -470,6 +475,88 @@ xegpu::DistributeLayoutAttr xegpu::inferInsertStridedSliceSourceLayout(
   return resLayout;
 }
 
+/// Infers the source layout attribute for an insert operation
+/// given the result layout attribute, result shape, and source shape. Removes
+/// leading dimensions from the result layout to match the source shape size.
+xegpu::DistributeLayoutAttr
+xegpu::inferInsertSourceLayout(xegpu::DistributeLayoutAttr resLayout,
+                               ArrayRef<int64_t> resShape,
+                               ArrayRef<int64_t> srcShape) {
+
+  int srcShapeSize = srcShape.size();
+  int resShapeSize = resShape.size();
+  int dimDiff = resShapeSize - srcShapeSize;
+
+  if (dimDiff > 0) {
+    // Assert that the leading dimensions being sliced off are not distributed
+    // (i.e., sg_layout and lane_layout for those dimensions are all 1).
+    auto resSgLayout = resLayout.getEffectiveSgLayoutAsInt();
+    auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
+    for (int i = 0; i < dimDiff; i++) {
+      assert((resSgLayout.size() == 0 || resSgLayout[i] == 1) &&
+             (resLaneLayout.size() == 0 || resLaneLayout[i] == 1) &&
+             "Leading dimensions being sliced off must not be distributed");
+    }
+    return resLayout.dropDims(llvm::to_vector(llvm::seq<int64_t>(0, dimDiff)));
+  }
+  return resLayout;
+}
+
+/// Infers the source layout attribute for an extract operation
+/// given the result layout attribute, result shape, and source shape. Adds
+/// unit leading dimensions to the result layout to match the source rank.
+xegpu::DistributeLayoutAttr
+xegpu::inferExtractSourceLayout(xegpu::DistributeLayoutAttr resLayout,
+                                ArrayRef<int64_t> resShape,
+                                ArrayRef<int64_t> srcShape) {
+
+  int srcShapeSize = srcShape.size();
+  int resShapeSize = resShape.size();
+  int dimDiff = srcShapeSize - resShapeSize;
+  auto context = resLayout.getContext();
+  // Construct the source layout by adding unit dimensions to the front of
+  // the result layout.
+
+  SmallVector<int64_t> sgLayout(srcShapeSize, 1);
+  SmallVector<int64_t> sgData(srcShapeSize, 1);
+  SmallVector<int64_t> instData(srcShapeSize, 1);
+  SmallVector<int64_t> laneLayout(srcShapeSize, 1);
+  SmallVector<int64_t> laneData(srcShapeSize, 1);
+
+  if (dimDiff > 0) {
+    auto resSgLayout = resLayout.getEffectiveSgLayoutAsInt();
+    auto resSgData = resLayout.getEffectiveSgDataAsInt();
+    auto resInstData = resLayout.getEffectiveInstDataAsInt();
+    auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
+    auto resLaneData = resLayout.getEffectiveLaneDataAsInt();
+
+    for (int i = 0; i < resShapeSize; i++) {
+      sgLayout[dimDiff + i] = (resSgLayout.size() == 0) ? 1 : resSgLayout[i];
+      sgData[dimDiff + i] = (resSgData.size() == 0) ? 1 : resSgData[i];
+      instData[dimDiff + i] = (resInstData.size() == 0) ? 1 : resInstData[i];
+      laneLayout[dimDiff + i] =
+          (resLaneLayout.size() == 0) ? 1 : resLaneLayout[i];
+      laneData[dimDiff + i] = (resLaneData.size() == 0) ? 1 : resLaneData[i];
+    }
+
+    auto toAttr = [&](ArrayRef<int64_t> v) -> DenseI32ArrayAttr {
+      if (v.empty())
+        return DenseI32ArrayAttr();
+      SmallVector<int32_t> v32(v.begin(), v.end());
+      return DenseI32ArrayAttr::get(context, v32);
+    };
+    auto srcLayout = xegpu::LayoutAttr::get(
+        context, resSgLayout.empty() ? nullptr : toAttr(sgLayout),
+        resSgData.empty() ? nullptr : toAttr(sgData),
+        resInstData.empty() ? nullptr : toAttr(instData),
+        resLaneLayout.empty() ? nullptr : toAttr(laneLayout),
+        resLaneData.empty() ? nullptr : toAttr(laneData), nullptr);
+    // TODO: add layout attribute interface: expandDims
+    return srcLayout;
+  }
+  return resLayout;
+}
+
 /// Infers the source layout attribute for a shape cast operation given the
 /// result layout attribute, result shape, and source shape.
 xegpu::DistributeLayoutAttr
@@ -1573,6 +1660,32 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
       return resLayout;
   }
 
+  // For vector::InsertOp, infer source layout from result layout using
+  // shapes.
+  if (auto insert = dyn_cast<vector::InsertOp>(op)) {
+    VectorType resVecTy = dyn_cast<VectorType>(insert.getResult().getType());
+    VectorType valueToStoreTy =
+        dyn_cast<VectorType>(insert.getValueToStore().getType());
+
+    if (idx == 0) {
+      return xegpu::inferInsertSourceLayout(resLayout, resVecTy.getShape(),
+                                            valueToStoreTy.getShape());
+    }
+    if (idx == 1)
+      return resLayout;
+  }
+
+  // For vector::ExtractOp, infer source layout from result layout using
+  // shapes.
+  if (auto extract = dyn_cast<vector::ExtractOp>(op)) {
+    VectorType srcVecTy = dyn_cast<VectorType>(extract.getSource().getType());
+    VectorType resVecTy = dyn_cast<VectorType>(extract.getResult().getType());
+    if (!srcVecTy || !resVecTy)
+      return nullptr;
+    return xegpu::inferExtractSourceLayout(resLayout, resVecTy.getShape(),
+                                           srcVecTy.getShape());
+  }
+
   // For vector::TransposeOp, infer source layout from result layout using
   // permutation.
   if (auto transpose = dyn_cast<vector::TransposeOp>(op)) {
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 8d65ea497ad88..057c9b80926a5 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -9,7 +9,7 @@ gpu.module @xevm_module {
 gpu.func @create_nd_tdesc(%arg0: memref<256x256xf16>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16>
-    -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    -> !xegpu.tensor_desc<16x16xf16>
   gpu.return
 }
 
@@ -19,7 +19,7 @@ gpu.func @create_nd_tdesc(%arg0: memref<256x256xf16>) {
 gpu.func @cerate_nd_tedesc_nonmemref_source(%arg0: ui64) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0, shape : [256, 256], strides : [256, 1] : ui64
-    -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    -> !xegpu.tensor_desc<16x16xf16>
   gpu.return
 }
 
@@ -29,9 +29,9 @@ gpu.func @cerate_nd_tedesc_nonmemref_source(%arg0: ui64) {
 // CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<16xf16> to vector<16x1xf16>
 gpu.func @load_nd() {
   %c0 = arith.constant 0 : index
-  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
   %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+    : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
   %anchor = xegpu.convert_layout %1
     <{
       input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -46,9 +46,9 @@ gpu.func @load_nd() {
 // CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<16xf16> to vector<16x1xf16>
 gpu.func @load_nd_packed() {
   %c0 = arith.constant 0 : index
-  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
   %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
-    : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+    : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
   gpu.return
 }
 
@@ -58,9 +58,9 @@ gpu.func @load_nd_packed() {
 // CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<8xf32> to vector<1x8xf32>
 gpu.func @load_nd_transpose() {
   %c0 = arith.constant 0 : index
-  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x8xf32>
   %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
-    : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xf32>
+    : !xegpu.tensor_desc<16x8xf32> -> vector<16x8xf32>
   gpu.return
 }
 
@@ -70,9 +70,9 @@ gpu.func @load_nd_transpose() {
 // CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<64xf16> to vector<2x32x1xf16>
 gpu.func @load_nd_array_length() {
   %c0 = arith.constant 0 : index
-  %0 = "some_op"() : () -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
   %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x32x16xf16>
+    : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<2x32x16xf16>
   gpu.return
 }
 
@@ -84,12 +84,12 @@ gpu.func @load_nd_array_length() {
 // CHECK: xegpu.store_nd %[[CAST3]], %{{.*}}[%[[C0]], %[[C0]]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
 gpu.func @store_nd() {
   %c0 = arith.constant 0 : index
-  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
+  %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
   %2 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+    : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
   xegpu.store_nd %2, %1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
   gpu.return
 }
 
@@ -108,14 +108,13 @@ gpu.func @store_nd() {
 // CHECK: gpu.return
 gpu.func @dpas() {
   %c0 = arith.constant 0 : index
-  %0 = "some_op"() : () -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-  %5 = arith.constant  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    dense<0.0> : vector<8x16xf32>
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<8x16xf16>
+  %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
+  %5 = arith.constant dense<0.0> : vector<8x16xf32>
   %2 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+    : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %3 = xegpu.load_nd %1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
-    : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+    : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
   %4 = xegpu.dpas %2, %3, %5
     {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
      layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -138,13 +137,11 @@ gpu.func @dpas() {
 // CHECK: gpu.return
 gpu.func @elementwise() {
   %c0 = arith.constant 0 : index
-  %0 = arith.constant  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    dense<1.0> : vector<16x16xf32>
-  %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %0 = arith.constant dense<1.0> : vector<16x16xf32>
+  %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32>
   %2 = xegpu.load_nd %1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
+    : !xegpu.tensor_desc<16x16xf32> -> vector<16x16xf32>
   %3 = arith.addf %0, %2
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : vector<16x16xf32>
   %cl3 = xegpu.convert_layout %3
     <{
@@ -158,8 +155,7 @@ gpu.func @elementwise() {
 // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x1xf32>
 // CHECK: gpu.return
 gpu.func @arith_constant() {
-  %0 = arith.constant  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    dense<1.0> : vector<16x16xf32>
+  %0 = arith.constant dense<1.0> : vector<16x16xf32>
   %cl0 = xegpu.convert_layout %0
     <{
       input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -174,9 +170,9 @@ gpu.func @arith_constant() {
 // CHECK: gpu.return
 gpu.func @prefetch_nd() {
   %c0 = arith.constant 0 : index
-  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
   xegpu.prefetch_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    : !xegpu.tensor_desc<16x16xf16>
   gpu.return
 }
 
@@ -187,12 +183,8 @@ gpu.func @prefetch_nd() {
 // CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
 // CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<8xf16> to vector<1x8xf16>
 gpu.func @scatter_load_chunksize(%src: memref<256xf16>) {
-  %offset = arith.constant
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-    dense<12> : vector<16xindex>
-  %mask = arith.constant
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-    dense<true> : vector<16xi1>
+  %offset = arith.constant dense<12> : vector<16xindex>
+  %mask = arith.constant dense<true> : vector<16xi1>
   %0 = xegpu.load %src[%offset], %mask
     <{chunk_size = 8, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
     : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
@@ -209,12 +201,8 @@ gpu.func @scatter_load_chunksize(%src: memref<256xf16>) {
 // CHECK: xegpu.store %[[C2]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
 // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
 gpu.func @scatter_store_chunksize(%src: memref<256xf16>) {
-  %offset = arith.constant
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-    dense<12> : vector<16xindex>
-  %mask = arith.constant
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-    dense<true> : vector<16xi1>
+  %offset = arith.constant dense<12> : vector<16xindex>
+  %mask = arith.constant dense<true> : vector<16xi1>
   %0 = xegpu.load %src[%offset], %mask
     <{chunk_size = 8, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
     : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
@@ -230,12 +218,8 @@ gpu.func @scatter_store_chunksize(%src: memref<256xf16>) {
 // CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]]
 // CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
 gpu.func @scatter_load(%src: memref<256xf16>) {
-  %offset = arith.constant
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-    dense<12> : vector<16xindex>
-  %mask = arith.constant
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-    dense<true> : vector<16xi1>
+  %offset = arith.constant dense<12> : vector<16xindex>
+  %mask = arith.constant dense<true> : vector<16xi1>
   %0 = xegpu.load %src[%offset], %mask
     <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}>
     : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
@@ -250,12 +234,8 @@ gpu.func @scatter_load(%src: memref<256xf16>) {
 // CHECK: xegpu.store %[[LOAD]], %arg0[%[[OFFSET]]], %[[MASK]]
 // CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
 gpu.func @scatter_store(%src: memref<256xf16>) {
-  %offset = arith.constant
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-    dense<12> : vector<16xindex>
-  %mask = arith.constant
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-    dense<true> : vector<16xi1>
+  %offset = arith.constant dense<12> : vector<16xindex>
+  %mask = arith.constant dense<true> : vector<16xi1>
   %0 = xegpu.load %src[%offset], %mask
     <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}>
     : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
@@ -279,12 +259,8 @@ gpu.func @scatter_store(%src: memref<256xf16>) {
 // CHECK: xegpu.store %[[CAST2]], %arg0[%[[V3]]], %[[V4]]
 // CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
 gpu.func @scatter_ops_with_leading_dims(%src: memref<256xf16>) {
-  %mask = arith.constant
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}
-    dense<1> : vector<1x1x16xi1>
-  %offset = arith.constant
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}
-    dense<12> : vector<1x1x16xindex>
+  %mask = arith.constant dense<1> : vector<1x1x16xi1>
+  %offset = arith.constant dense<12> : vector<1x1x16xindex>
   %0 = xegpu.load %src[%offset], %mask
     <{layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}>
     : memref<256xf16>, vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf16>
@@ -316,8 +292,13 @@ gpu.func @scatter_ops_with_leading_dims(%src: memref<256xf16>) {
 // CHECK:     %[[FINAL:.*]] = arith.addf %[[ADD4]], %[[CST]] : f32
 gpu.func @vector_reduction() {
   %acc = arith.constant 1.0 : f32
-  %0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : () -> vector<32xf32>
+  %0 = "some_op"() : () -> vector<32xf32>
   %2 = vector.reduction <add>, %0, %acc : vector<32xf32> into f32
+  %anchor = xegpu.convert_layout %2
+    <{
+      input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>,
+      target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>
+    }> : f32
   gpu.return
 }
 
@@ -372,16 +353,9 @@ gpu.func @vector_reduction() {
 // CHECK: %[[V19:.*]] = vector.insert %[[V18]], %[[V9]] [1] : f32 into vector<2xf32>
 gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
-  %src = arith.constant
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      dense<0.0>  : vector<2x16xf32>
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-      dense<0.0>  : vector<2xf32>
-    %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
-      }
+  %src = arith.constant dense<0.0>  : vector<2x16xf32>
+  %acc = arith.constant dense<0.0>  : vector<2xf32>
+  %1 = vector.multi_reduction <add>, %src, %acc
       [1] : vector<2x16xf32> to vector<2xf32>
   %cl1 = xegpu.convert_layout %1
     <{
@@ -442,16 +416,9 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
 // CHECK: %[[V19:.*]] = vector.insert %[[V18]], %[[V9]] [1] : f32 into vector<2xf32>
 gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
-    %src = arith.constant
-      {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
-      dense<0.0> : vector<16x2xf32>
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-      dense<0.0>  : vector<2xf32>
-    %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>
-      }
+  %src = arith.constant dense<0.0> : vector<16x2xf32>
+  %acc = arith.constant dense<0.0>  : vector<2xf32>
+  %1 = vector.multi_reduction <add>, %src, %acc
       [0] : vector<16x2xf32> to vector<2xf32>
   %cl1 = xegpu.convert_layout %1
     <{
@@ -472,16 +439,9 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index)
 // CHECK:         gpu.return
 gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
-    %src = arith.constant
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      dense<0.0>  : vector<4x16xf32>
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
-      dense<0.0>  : vector<16xf32>
-    %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
-      }
+  %src = arith.constant dense<0.0>  : vector<4x16xf32>
+  %acc = arith.constant dense<0.0>  : vector<16xf32>
+  %1 = vector.multi_reduction <add>, %src, %acc
       [0] : vector<4x16xf32> to vector<16xf32>
   %cl1 = xegpu.convert_layout %1
     <{
@@ -502,16 +462,9 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index)
 // CHECK:         gpu.return
 gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
-    %src = arith.constant
-      {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
-      dense<0.0>  : vector<16x12xf32>
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>}
-      dense<0.0>  : vector<16xf32>
-    %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>
-      }
+  %src = arith.constant dense<0.0>  : vector<16x12xf32>
+  %acc = arith.constant dense<0.0>  : vector<16xf32>
+  %1 = vector.multi_reduction <add>, %src, %acc
       [1] : vector<16x12xf32> to vector<16xf32>
   %cl1 = xegpu.convert_layout %1
     <{
@@ -528,12 +481,8 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
 // CHECK-NEXT:    gpu.return
 gpu.func @vector_transpose() {
   %cst = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
     : () -> (vector<16x2xf32>)
   %transpose = vector.transpose %cst, [1, 0]
-    {
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    }
     : vector<16x2xf32> to vector<2x16xf32>
   %transpose2 = xegpu.convert_layout %transpose
     <{
@@ -549,14 +498,13 @@ gpu.func @vector_transpose() {
 // CHECK-NEXT:    %[[BC:.*]] = vector.bitcast %[[CAST]] : vector<4x2xi8> to vector<4x1xi16>
 // CHECK-NEXT:    gpu.return
 gpu.func @vector_bitcast() {
-  %cst = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
-    : () -> (vector<4x32xi8>)
-  %bitcast = vector.bitcast %cst
-    {
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    }
-    : vector<4x32xi8> to vector<4x16xi16>
+  %cst = "some_op"() : () -> (vector<4x32xi8>)
+  %bitcast = vector.bitcast %cst : vector<4x32xi8> to vector<4x16xi16>
+  %anchor = xegpu.convert_layout %bitcast
+    <{
+      input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }> : vector<4x16xi16>
   gpu.return
 }
 
@@ -570,7 +518,6 @@ gpu.func @vector_bitcast() {
 //       CHECK:   gpu.return
 gpu.func @create_mask_1d(%m0: index) {
   %mask = vector.create_mask %m0
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
     : vector<16xi1>
   %mask_cl = xegpu.convert_layout %mask
     <{
@@ -590,7 +537,6 @@ gpu.func @create_mask_1d(%m0: index) {
 //       CHECK:   gpu.return
 gpu.func @constant_mask_1d() {
   %mask = vector.constant_mask [4]
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
     : vector<16xi1>
   %mask_cl = xegpu.convert_layout %mask
     <{
@@ -616,7 +562,6 @@ gpu.func @constant_mask_1d() {
 //       CHECK:   gpu.return
 gpu.func @create_mask_2d(%m0: index, %m1: index) {
   %mask = vector.create_mask %m0, %m1
-    {layout_result_0 = #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>}
     : vector<8x4xi1>
   %mask_cl = xegpu.convert_layout %mask
     <{
@@ -643,7 +588,6 @@ gpu.func @create_mask_2d(%m0: index, %m1: index) {
 //       CHECK:   gpu.return
 gpu.func @constant_mask_2d() {
   %mask = vector.constant_mask [2, 3]
-    {layout_result_0 = #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>}
     : vector<8x4xi1>
       %mask_cl = xegpu.convert_layout %mask
         <{
@@ -653,7 +597,6 @@ gpu.func @constant_mask_2d() {
       gpu.return
 }
 
-
 // CHECK-LABEL: gpu.func @vector_multi_reduction_3d_leading_unit_dim_lane_local
 // CHECK-DAG:     %[[SRC:.*]] = arith.constant dense<0.000000e+00> : vector<1x16x2xf32>
 // CHECK-DAG:     %[[ACC:.*]] = arith.constant dense<0.000000e+00> : vector<1x2xf32>
@@ -669,16 +612,9 @@ gpu.func @constant_mask_2d() {
 // CHECK:         vector.insert %[[R1]], %[[I0]] [0, 1] : f32 into vector<1x2xf32>
 // CHECK:         gpu.return
 gpu.func @vector_multi_reduction_3d_leading_unit_dim_lane_local() {
-    %src = arith.constant
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}
-      dense<0.0>  : vector<1x16x32xf32>
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>}
-      dense<0.0>  : vector<1x32xf32>
+    %src = arith.constant dense<0.0>  : vector<1x16x32xf32>
+    %acc = arith.constant dense<0.0>  : vector<1x32xf32>
     %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>
-      }
       [1] : vector<1x16x32xf32> to vector<1x32xf32>
   %cl1 = xegpu.convert_layout %1
     <{
@@ -707,16 +643,9 @@ gpu.func @vector_multi_reduction_3d_leading_unit_dim_lane_local() {
 // CHECK:         vector.insert %[[WITH_ACC1]], %[[INS0]] [0, 1] : f32 into vector<1x2xf32>
 // CHECK:         gpu.return
 gpu.func @vector_multi_reduction_3d_leading_unit_dim_cross_lane() {
-    %src = arith.constant
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [1, 1, 1]>}
-      dense<0.0>  : vector<1x16x2xf32>
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16, 1], lane_data = [1, 1, 1]>, dims = [1]>}
-      dense<0.0>  : vector<1x2xf32>
+    %src = arith.constant dense<0.0>  : vector<1x16x2xf32>
+    %acc = arith.constant dense<0.0>  : vector<1x2xf32>
     %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16, 1], lane_data = [1, 1, 1]>, dims = [1]>
-      }
       [1] : vector<1x16x2xf32> to vector<1x2xf32>
   %cl1 = xegpu.convert_layout %1
     <{
@@ -729,11 +658,8 @@ gpu.func @vector_multi_reduction_3d_leading_unit_dim_cross_lane() {
 // CHECK-LABEL: gpu.func @vector_extract_from_2d
 // CHECK: %[[EXT:.*]] = vector.extract %{{.*}}[0] : vector<1xf32> from vector<4x1xf32>
 gpu.func @vector_extract_from_2d() {
-  %src = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : () -> vector<4x16xf32>
+  %src = "some_op"() : () -> vector<4x16xf32>
   %0 = vector.extract %src[0]
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
     : vector<16xf32> from vector<4x16xf32>
   %cl0 = xegpu.convert_layout %0
     <{
@@ -747,10 +673,8 @@ gpu.func @vector_extract_from_2d() {
 // CHECK: %[[EXT:.*]] = vector.extract %{{.*}}[2] : vector<1xf32> from vector<8x1xf32>
 gpu.func @vector_extract_from_2d_offset2() {
   %src = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : () -> vector<8x16xf32>
   %0 = vector.extract %src[2]
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
     : vector<16xf32> from vector<8x16xf32>
   %cl0 = xegpu.convert_layout %0
     <{
@@ -764,13 +688,10 @@ gpu.func @vector_extract_from_2d_offset2() {
 // CHECK: %[[INS:.*]] = vector.insert %{{.*}}, %{{.*}}[0] : vector<1xf32> into vector<4x1xf32>
 gpu.func @vector_insert_into_2d() {
   %val = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
     : () -> vector<16xf32>
   %dst = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : () -> vector<4x16xf32>
   %0 = vector.insert %val, %dst[0]
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : vector<16xf32> into vector<4x16xf32>
   %cl0 = xegpu.convert_layout %0
     <{
@@ -784,13 +705,10 @@ gpu.func @vector_insert_into_2d() {
 // CHECK: %[[INS:.*]] = vector.insert %{{.*}}, %{{.*}}[2] : vector<1xf32> into vector<8x1xf32>
 gpu.func @vector_insert_into_2d_offset2() {
   %val = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
     : () -> vector<16xf32>
   %dst = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : () -> vector<8x16xf32>
   %0 = vector.insert %val, %dst[2]
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : vector<16xf32> into vector<8x16xf32>
   %cl0 = xegpu.convert_layout %0
     <{
@@ -804,10 +722,8 @@ gpu.func @vector_insert_into_2d_offset2() {
 // CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
 gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted() {
   %0 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : () -> vector<24x16xf32>
-  %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 16], strides = [1, 1],
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+  %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 16], strides = [1, 1]
     }
     : vector<24x16xf32> to vector<8x16xf32>
   %cl1 = xegpu.convert_layout %1
@@ -822,10 +738,8 @@ gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted() {
 // CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 3], sizes = [8, 1], strides = [1, 1]} : vector<24x4xf32> to vector<8x1xf32>
 gpu.func @vector_extract_strided_slice_inner_distributed() {
   %0 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : () -> vector<24x64xf32>
-  %1 = vector.extract_strided_slice %0 { offsets = [8, 48], sizes = [8, 16], strides = [1, 1],
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+  %1 = vector.extract_strided_slice %0 { offsets = [8, 48], sizes = [8, 16], strides = [1, 1]
     }
     : vector<24x64xf32> to vector<8x16xf32>
   %cl1 = xegpu.convert_layout %1
@@ -840,10 +754,8 @@ gpu.func @vector_extract_strided_slice_inner_distributed() {
 // CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [1, 0], sizes = [1, 16], strides = [1, 1]} : vector<2x16xf32> to vector<1x16xf32>
 gpu.func @vector_extract_strided_slice_outer_distributed() {
   %0 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
     : () -> vector<32x16xf32>
-  %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [16], strides = [1],
-      layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
+  %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [16], strides = [1]
     }
     : vector<32x16xf32> to vector<16x16xf32>
   %cl1 = xegpu.convert_layout %1
@@ -858,10 +770,8 @@ gpu.func @vector_extract_strided_slice_outer_distributed() {
 // CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [1], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32>
 gpu.func @vector_extract_strided_slice_1d() {
   %0 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
     : () -> vector<64xf32>
-  %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [32], strides = [1],
-      layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+  %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [32], strides = [1]
     }
     : vector<64xf32> to vector<32xf32>
   %cl1 = xegpu.convert_layout %1
@@ -876,10 +786,8 @@ gpu.func @vector_extract_strided_slice_1d() {
 // CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
 gpu.func @vector_extract_strided_slice_partial_offsets() {
   %0 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : () -> vector<24x16xf32>
-  %1 = vector.extract_strided_slice %0 { offsets = [8], sizes = [8], strides = [1],
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+  %1 = vector.extract_strided_slice %0 { offsets = [8], sizes = [8], strides = [1]
     }
     : vector<24x16xf32> to vector<8x16xf32>
   %cl1 = xegpu.convert_layout %1
@@ -894,13 +802,10 @@ gpu.func @vector_extract_strided_slice_partial_offsets() {
 // CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
 gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted() {
   %0 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : () -> vector<16x16xf32>
   %1 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : () -> vector<64x16xf32>
-  %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1],
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+  %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1]
     }
     : vector<16x16xf32> into vector<64x16xf32>
   %cl2 = xegpu.convert_layout %2
@@ -915,13 +820,10 @@ gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted() {
 // CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 1], strides = [1, 1]} : vector<16x1xf32> into vector<64x2xf32>
 gpu.func @vector_insert_strided_slice_inner_distributed() {
   %0 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : () -> vector<16x16xf32>
   %1 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : () -> vector<64x32xf32>
-  %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 16], strides = [1, 1],
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+  %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 16], strides = [1, 1]
     }
     : vector<16x16xf32> into vector<64x32xf32>
   %cl2 = xegpu.convert_layout %2
@@ -936,13 +838,10 @@ gpu.func @vector_insert_strided_slice_inner_distributed() {
 // CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [2, 4], strides = [1, 1]} : vector<1x16xf32> into vector<3x32xf32>
 gpu.func @vector_insert_strided_slice_outer_distributed() {
   %0 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
     : () -> vector<16x16xf32>
   %1 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
     : () -> vector<48x32xf32>
-  %2 = vector.insert_strided_slice %0, %1 { offsets = [32, 4], strides = [1, 1],
-      layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
+  %2 = vector.insert_strided_slice %0, %1 { offsets = [32, 4], strides = [1, 1]
     }
     : vector<16x16xf32> into vector<48x32xf32>
   %cl2 = xegpu.convert_layout %2
@@ -957,13 +856,10 @@ gpu.func @vector_insert_strided_slice_outer_distributed() {
 // CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
 gpu.func @vector_insert_strided_slice_1d() {
   %0 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
     : () -> vector<16xf32>
   %1 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
     : () -> vector<48xf32>
-  %2 = vector.insert_strided_slice %0, %1 { offsets = [16], strides = [1],
-      layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+  %2 = vector.insert_strided_slice %0, %1 { offsets = [16], strides = [1]
     }
     : vector<16xf32> into vector<48xf32>
   %cl2 = xegpu.convert_layout %2
@@ -978,13 +874,10 @@ gpu.func @vector_insert_strided_slice_1d() {
 // CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [13, 0], strides = [1]} : vector<1xf32> into vector<64x1xf32>
 gpu.func @vector_insert_strided_slice_different_ranks() {
   %0 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
     : () -> vector<16xf32>
   %1 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : () -> vector<64x16xf32>
-  %2 = vector.insert_strided_slice %0, %1 { offsets = [13, 0], strides = [1],
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+  %2 = vector.insert_strided_slice %0, %1 { offsets = [13, 0], strides = [1]
     }
     : vector<16xf32> into vector<64x16xf32>
   %cl2 = xegpu.convert_layout %2
@@ -999,10 +892,8 @@ gpu.func @vector_insert_strided_slice_different_ranks() {
 // CHECK-NOT: xegpu.convert_layout
 gpu.func @convert_layout_removed_when_compatible() {
   %0 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
     : () -> vector<16xf32>
   %2 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>}
     : () -> vector<1xf32>
   %1 = xegpu.convert_layout %0
     <{input_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
@@ -1106,9 +997,9 @@ gpu.module @xevm_module {
 // CHECK: %[[NEG:.*]] = arith.negf %[[SRC]] : vector<16x1xf16>
 // CHECK: gpu.return
 gpu.func @elementwise_wrap_around_dim() {
-  %0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+  %0 = "some_op"()
     : () -> vector<16x1xf16>
-  %1 = arith.negf %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+  %1 = arith.negf %0
     : vector<16x1xf16>
    %cl1 = xegpu.convert_layout %1
      <{
@@ -1128,7 +1019,7 @@ gpu.module @xevm_module {
 // CHECK:         %[[REM2:.*]] = arith.remui %[[REM]], %[[C16]]{{.*}} : index
 // CHECK:         %[[VEC:.*]] = vector.from_elements %[[REM2]] : vector<1xindex>
 gpu.func @vector_step_slice() {
-  %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 2]>} : vector<16xindex>
+  %0 = vector.step : vector<16xindex>
   %cl0 = xegpu.convert_layout %0
     <{
       input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 2]>,
@@ -1143,7 +1034,7 @@ gpu.module @xevm_module {
 // CHECK-LABEL: gpu.func @vector_step_slice_unit
 // CHECK:         %[[VEC:.*]] = vector.from_elements %{{.*}} : vector<1xindex>
 gpu.func @vector_step_slice_unit() {
-  %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 3]>} : vector<1xindex>
+  %0 = vector.step : vector<1xindex>
   %cl0 = xegpu.convert_layout %0
     <{
       input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 3]>,
@@ -1165,7 +1056,7 @@ gpu.module @xevm_module {
 // CHECK:         %[[V3:.*]] = arith.addi %[[V2]], %{{.*}} : index
 // CHECK:         %[[VEC:.*]] = vector.from_elements %[[V0]], %[[V1]], %[[V2]], %[[V3]] : vector<4xindex>
 gpu.func @vector_step_slice_multi_dist() {
-  %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [2, 4, 2], lane_data = [1, 2, 1]>, dims = [0, 2]>} : vector<16xindex>
+  %0 = vector.step : vector<16xindex>
   %cl0 = xegpu.convert_layout %0
     <{
       input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [2, 4, 2], lane_data = [1, 2, 1]>, dims = [0, 2]>,
@@ -1181,12 +1072,8 @@ gpu.module @xevm_module {
 // CHECK:         %[[SC:.*]] = vector.shape_cast %{{.*}} : vector<1xf32> to vector<1x1xf32>
 gpu.func @vector_shapecast_rank_increasing() {
   %cst = "some_op"()
-    {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
     : () -> (vector<16xf32>)
   %cast = vector.shape_cast %cst
-    {
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    }
     : vector<16xf32> to vector<1x16xf32>
   %cast_cl = xegpu.convert_layout %cast
     <{
@@ -1203,12 +1090,8 @@ gpu.module @xevm_module {
 // CHECK:         %[[SC:.*]] = vector.shape_cast %{{.*}} : vector<1x1xf32> to vector<1xf32>
 gpu.func @vector_shapecast_rank_reducing() {
   %cst = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : () -> (vector<1x16xf32>)
   %cast = vector.shape_cast %cst
-    {
-      layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
-    }
     : vector<1x16xf32> to vector<16xf32>
   %cast_cl = xegpu.convert_layout %cast
     <{
@@ -1225,12 +1108,8 @@ gpu.module @xevm_module {
 // CHECK:         %[[SC:.*]] = vector.shape_cast %{{.*}} : vector<1xf32> to vector<1x1xf32>
 gpu.func @vector_shapecast_rank_increasing_without_slicing_layout() {
   %cst = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
     : () -> (vector<16xf32>)
   %cast = vector.shape_cast %cst
-    {
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    }
     : vector<16xf32> to vector<1x16xf32>
   %cast_cl = xegpu.convert_layout %cast
     <{
@@ -1248,9 +1127,13 @@ gpu.module @xevm_module {
 // CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<16xf16> to vector<1xf16>
 // CHECK: %[[BCAST:.*]] = vector.broadcast %[[CAST]] : vector<1xf16> to vector<16x1xf16>
 gpu.func @vector_broadcast_1d_to_2d(%laneid: index) {
-  %0 = "some_op"() {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : () -> vector<16xf16>
-  %1 = vector.broadcast %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x16xf16>
-  "some_use"(%1) : (vector<16x16xf16>) -> ()
+  %0 = "some_op"() : () -> vector<16xf16>
+  %1 = vector.broadcast %0 : vector<16xf16> to vector<16x16xf16>
+  %anchor = xegpu.convert_layout %1
+    <{
+      input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }> : vector<16x16xf16>
   gpu.return
 }
 }
@@ -1261,8 +1144,7 @@ gpu.module @xevm_module {
 // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x1xf16>
 // CHECK: gpu.return
 gpu.func @constant_wrap_around_dim() {
-  %0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    dense<1.0> : vector<16x1xf16>
+  %0 = arith.constant dense<1.0> : vector<16x1xf16>
   %cl0 = xegpu.convert_layout %0
     <{
       input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -1279,9 +1161,14 @@ gpu.module @xevm_module {
 // CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<16x16xf16> to vector<16x1xf16>
 // CHECK: %[[BCAST:.*]] = vector.broadcast %[[CAST]] : vector<16x1xf16> to vector<1x16x1xf16>
 gpu.func @vector_broadcast_2d_to_3d(%laneid: index) {
-  %0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<16x16xf16>
-  %1 = vector.broadcast %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>} : vector<16x16xf16> to vector<1x16x16xf16>
-  "some_use"(%1) : (vector<1x16x16xf16>) -> ()
+  %0 = "some_op"() : () -> vector<16x16xf16>
+  %1 = vector.broadcast %0 : vector<16x16xf16> to vector<1x16x16xf16>
+  %2 = xegpu.convert_layout %1
+    <{
+      input_layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>,
+      target_layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>
+    }> : vector<1x16x16xf16>
+  "some_use"(%2) : (vector<1x16x16xf16>) -> ()
   gpu.return
 }
 }
@@ -1292,8 +1179,8 @@ gpu.module @xevm_module {
 // CHECK: %[[SRC:.*]] = "some_op"()
 // CHECK-NOT: vector.broadcast
 gpu.func @vector_broadcast_2d_to_2d_noop(%laneid: index) {
-  %0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<16x1xf16>
-  %1 = vector.broadcast %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
+  %0 = "some_op"() : () -> vector<16x1xf16>
+  %1 = vector.broadcast %0 : vector<16x1xf16> to vector<16x16xf16>
   %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>
   "some_use"(%2) : (vector<16x16xf16>) -> ()
   gpu.return
@@ -1308,8 +1195,9 @@ gpu.module @xevm_module {
 // CHECK: %[[BCAST:.*]] = vector.broadcast %[[SRC]] : f16 to vector<16x1xf16>
 gpu.func @vector_broadcast_scalar_to_vector(%laneid: index) {
   %0 = "some_op"() : () -> f16
-  %1 = vector.broadcast %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
-  "some_use"(%1) : (vector<16x16xf16>) -> ()
+  %1 = vector.broadcast %0 : f16 to vector<16x16xf16>
+  %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>
+  "some_use"(%2) : (vector<16x16xf16>) -> ()
   gpu.return
 }
 }
@@ -1349,13 +1237,9 @@ gpu.module @xevm_module {
 // CHECK:     %[[FINAL:.*]] = arith.addf %[[ADD5]], %[[ACC]] : f32
 gpu.func @vector_multi_reduction_1d_to_scalar() {
     %src = "some_op"()
-      {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
       : () -> vector<32xf32>
     %acc = arith.constant 0.0 : f32
     %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>
-      }
       [0] : vector<32xf32> to f32
   %cl1 = xegpu.convert_layout %1
     <{
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
index e02bd9b0370ad..952d54a43ae38 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -503,12 +503,16 @@ gpu.module @xevm_module {
       %2 = vector.broadcast %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
       %3 = vector.multi_reduction <add>, %1, %2 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
       %4 = vector.reduction <add>, %3 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} : vector<16xf32> into f32
-      %5 = vector.broadcast %4 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
-      %cst_0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<0> : vector<16xindex>
-      %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+      %anchor = xegpu.convert_layout %4
+        <{
+          input_layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, dims=[0]>,
+          target_layout =  #xegpu.slice<#xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, dims=[0]>
+        }>
+        : f32
+      %5 = vector.broadcast %anchor : f32 to vector<16xf32>
+      %cst_0 = arith.constant  dense<0> : vector<16xindex>
+      %cst_1 = arith.constant  dense<true> : vector<16xi1>
       xegpu.store %5, %arg1[%cst_0], %cst_1 <{layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}> : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
     gpu.return
   }
 }
-
-
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index c2aac8fa6cf0b..cd7f8b9f69ff2 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -292,11 +292,11 @@ gpu.module @test_kernel  {
     %m = arith.muli %block_id_x, %c32 : index
     %0 = xegpu.create_nd_tdesc %a : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
     %1 = xegpu.load_nd %0[0] {layout = #r}: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32>
-    %11 = vector.shape_cast %1 {layout_result_0 = #l} :  vector<32xf32> to vector<32x1xf32>
+    %11 = vector.shape_cast %1 :  vector<32xf32> to vector<32x1xf32>
     // CHECK-COUNT-8: vector.broadcast {{.*}}: vector<16x1xf32> to vector<16x16xf32>
-    %2 = vector.broadcast  %11 {layout_result_0 = #l} : vector<32x1xf32> to vector<32x64xf32>
+    %2 = vector.broadcast  %11 : vector<32x1xf32> to vector<32x64xf32>
     %3 = xegpu.create_nd_tdesc %b : memref<16x512xf32> -> !xegpu.tensor_desc<32x64xf32, #l>
-    xegpu.store_nd %2, %3[0, 0] : vector<32x64xf32>, !xegpu.tensor_desc<32x64xf32, #l>
+    xegpu.store_nd %2, %3[0, 0] {layout = #l} : vector<32x64xf32>, !xegpu.tensor_desc<32x64xf32, #l>
     gpu.return
   }
 }
@@ -480,7 +480,8 @@ gpu.module @test_kernel {
 
   gpu.func @convert_layout(%B: vector<8x32x2xf16>) -> vector<8x32x2xf16> {
     %b = xegpu.convert_layout %B <{input_layout = #lb, target_layout = #b}> : vector<8x32x2xf16>
-    %e = math.exp %b {layout_result_0 = #b} : vector<8x32x2xf16>
+    %e = math.exp %b : vector<8x32x2xf16>
+    %anchor = xegpu.convert_layout %e <{input_layout = #b, target_layout = #b}> : vector<8x32x2xf16>
     gpu.return %e : vector<8x32x2xf16>
   }
 }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index fefe2091d458d..2f43f9a840173 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -125,7 +125,12 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[T15:.*]] = arith.addi %[[T13]], %[[T14]] : index
     // CHECK-DAG: %[[T16:.*]] = vector.broadcast %[[T15]] : index to vector<2x1xindex>
     // CHECK-DAG: %[[T17:.*]] = arith.addi %[[CST]], %[[T16]] : vector<2x1xindex>
-    %cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [2, 1]>} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex>
+    %cst_2 = arith.constant dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex>
+    %anchor = xegpu.convert_layout %cst_2
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [2, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [2, 1]>
+      }> : vector<32x1xindex>    
     gpu.return
   }
 
@@ -250,7 +255,12 @@ gpu.module @test_distribution {
   // CHECK-LABEL: splat_constant
   gpu.func @splat_constant() {
     // CHECK-COUNT-2: %[[CST:.*]] = arith.constant dense<0> : vector<4xindex>
-    %cst_2 = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4], order = [0, 1]>, dims = [0]>}  dense<0> : vector<8xindex>
+    %cst_2 = arith.constant dense<0> : vector<8xindex>
+    %anchor = xegpu.convert_layout %cst_2
+      <{
+        input_layout =  #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4], order = [0, 1]>, dims = [0]>,
+        target_layout =  #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4], order = [0, 1]>, dims = [0]>
+      }> : vector<8xindex>
     gpu.return
   }
 
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index c9aff190d84d7..f9697d83baf58 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -519,7 +519,12 @@ gpu.module @test_distribution {
   // CHECK-LABEL: constant_with_slice_attr
   gpu.func @constant_with_slice_attr() {
     //CHECK: [[cst:%.+]] = arith.constant dense<10> : vector<1xindex>
-    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 1]>, dims = [1, 2, 3]>} dense<10> : vector<4xindex>
+    %cst = arith.constant dense<10> : vector<4xindex>
+    %anchor = xegpu.convert_layout %cst
+      <{
+        input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 1]>, dims = [1, 2, 3]>,
+        target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 1]>, dims = [1, 2, 3]>
+      }> : vector<4xindex>
     gpu.return
   }
 
@@ -582,6 +587,11 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[T7:.*]] = vector.broadcast %[[T6]] : index to vector<1x1xindex>
     // CHECK-DAG: %[[T8:.*]] = arith.addi %[[CST]], %[[T7]] : vector<1x1xindex>
     %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 1]>} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex>
+    %anchor = xegpu.convert_layout %cst
+      <{
+        input_layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 1]>,
+        target_layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 1]>
+      }> : vector<32x1xindex>
     gpu.return
   }
 
@@ -602,7 +612,7 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[MUL6]] : index
     // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<2x2xindex>
     // CHECK-DAG: %[[ADDCST:.*]] = arith.addi %[[BASECST]], %[[BCAST]] : vector<2x2xindex>
-    %cst_8x8 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2]>} dense<[
+    %cst_8x8 = arith.constant dense<[
          [0, 16, 32, 48, 64, 80, 96, 112],
          [8, 24, 40, 56, 72, 88, 104, 120],
          [16, 32, 48, 64, 80, 96, 112, 128],
@@ -612,6 +622,11 @@ gpu.module @test_distribution {
          [48, 64, 80, 96, 112, 128, 144, 160],
          [56, 72, 88, 104, 120, 136, 152, 168]
       ]> : vector<8x8xindex>
+    %anchor = xegpu.convert_layout %cst_8x8
+      <{
+        input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2]>,
+        target_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2]>
+      }> : vector<8x8xindex>
       gpu.return
   }
 
@@ -625,9 +640,19 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[MUL]] : index
     // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<1xindex>
     // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[CST]], %[[BCAST]] : vector<1xindex>
-    %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [1]>} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496]> : vector<32xindex>
+    %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496]> : vector<32xindex>
+    %anchor = xegpu.convert_layout %cst
+      <{
+        input_layout = #xegpu.layout<sg_layout = [32], sg_data = [1]>,
+        target_layout = #xegpu.layout<sg_layout = [32], sg_data = [1]>
+      }> : vector<32xindex>
     // CHECK: arith.constant dense<{{\[}}{{\[}}0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15{{\]}}{{\]}}> : vector<1x16xindex>
-    %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 16]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
+    %cst_1 = arith.constant dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
+    %anchor_1 = xegpu.convert_layout %cst_1
+      <{
+        input_layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 16]>,
+        target_layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 16]>
+      }> : vector<1x16xindex>
     gpu.return
   }
 
@@ -1232,7 +1257,12 @@ gpu.module @test_distribution {
   // CHECK-LABEL: distribute_constant
   gpu.func @distribute_constant() {
     // CHECK: arith.constant dense<1.000000e+00> : vector<32x32xf32>
-    %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} dense<1.0> : vector<256x128xf32>
+    %cst = arith.constant dense<1.0> : vector<256x128xf32>
+    %anchor = xegpu.convert_layout %cst
+      <{
+        input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>,
+        target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>
+      }> : vector<256x128xf32>    
     gpu.return
   }
 

>From 707d1d4ae18cefc50c6876015c9a226affa3b2bf Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 1 May 2026 19:43:57 +0000
Subject: [PATCH 04/11] fix tests

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        |  15 +-
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp |   7 +
 .../XeGPU/propagate-layout-inst-data.mlir     |   6 +-
 .../XeGPU/propagate-layout-subgroup.mlir      |   4 +-
 mlir/test/Dialect/XeGPU/propagate-layout.mlir |   4 +-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     |   5 -
 .../Dialect/XeGPU/sg-to-wi-experimental.mlir  | 166 ++++++++----------
 7 files changed, 101 insertions(+), 106 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 4fe15c625ea49..da48bff7b5048 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -419,11 +419,20 @@ LogicalResult LoadNdOp::verify() {
     }
   }
 
+  // Handle array_length. Two result shape conventions are accepted:
+  //   * Legacy: leading array_length dimension prepended, e.g. descriptor
+  //     16x16 with array_length=2 -> [2, 16, 16].
+  //   * Stacked 2D: array blocks stacked along the non-FCD (first) dimension,
+  //     e.g. descriptor 16x16 with array_length=2 -> [32, 16].
   auto array_len = tdescTy.getArrayLength();
-  if (array_len > 1)
-    tdescShape.insert(tdescShape.begin(), array_len);
+  SmallVector<int64_t> stackedShape(tdescShape);
+  SmallVector<int64_t> prependedShape(tdescShape);
+  if (array_len > 1 && !tdescShape.empty()) {
+    stackedShape[0] *= array_len;
+    prependedShape.insert(prependedShape.begin(), array_len);
+  }
 
-  if (tdescShape != valueShape)
+  if (valueShape != stackedShape && valueShape != prependedShape)
     return emitOpError() << "Result shape " << makeString(valueShape)
                          << " is not consistent with tensor descriptor "
                          << tdescTy;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index a5776ebce2e95..1a3bc28cec002 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1403,6 +1403,9 @@ struct ResolveLayoutConflicts {
 } // namespace
 
 LogicalResult ResolveLayoutConflicts::run() {
+  // Dump the IR before resolving layout conflicts for debugging purposes.
+  DBGS() << "IR before resolving layout conflicts:\n";
+  parentOp->dump();
   // Scan all operations in the parent op and resolve layout conflicts at
   // tensor descriptor and vector use points.
   auto r = parentOp->walk([&](Operation *op) -> WalkResult {
@@ -1445,6 +1448,10 @@ LogicalResult ResolveLayoutConflicts::run() {
     return WalkResult::advance();
   });
 
+  // Dump the IR after resolving layout conflicts for debugging purposes.
+  DBGS() << "IR after resolving layout conflicts:\n";
+  parentOp->dump();
+
   return r.wasInterrupted() ? failure() : success();
 }
 
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 6448db93c3f40..0d73985502e3f 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -248,8 +248,8 @@ func.func @insert_strided_slice_inst_data_with_packing(%arg0: memref<8x64xi8>) {
   %cst_small = arith.constant dense<1> : vector<4x64xi8>
   %cst_large = arith.constant dense<0> : vector<8x64xi8>
   %insert = vector.insert_strided_slice %cst_small, %cst_large {offsets = [0, 0], strides = [1, 1]} : vector<4x64xi8> into vector<8x64xi8>
-  %tdesc = xegpu.create_nd_tdesc %arg0 : memref<8x64xi8> -> !xegpu.tensor_desc<8x64xi8, #xegpu.layout<inst_data = [8, 64]>>
-  xegpu.store_nd %insert, %tdesc[0, 0] <{layout = #xegpu.layout<inst_data = [8, 64]>}>: vector<8x64xi8>, !xegpu.tensor_desc<8x64xi8, #xegpu.layout<inst_data = [8, 64]>>
+  %tdesc = xegpu.create_nd_tdesc %arg0 : memref<8x64xi8> -> !xegpu.tensor_desc<8x64xi8>
+  xegpu.store_nd %insert, %tdesc[0, 0] <{layout = #xegpu.layout<inst_data = [8, 64]>}>: vector<8x64xi8>, !xegpu.tensor_desc<8x64xi8>
   return
 }
 }
@@ -333,7 +333,7 @@ func.func @vector_shape_cast_expand_and_merge(%arg0: memref<256xf16>, %arg1: mem
 
     %4 = vector.shape_cast %2 : vector<2x4x32xf16> to vector<1x256xf16>
     %5 = vector.shape_cast %4 : vector<1x256xf16> to vector<256xf16>
-    xegpu.store %5, %arg1[%0], %cst <{layout = #xegpu.layout<inst_data = [32] >}> : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1>
+    xegpu.store %5, %arg1[%0], %cst <{layout = #xegpu.layout<inst_data = [32]>}> : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1>
     return
   }
 }
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
index d4ad9087149c1..2c28073ee1c01 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
@@ -133,9 +133,9 @@ gpu.module @test {
     %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
     %load = xegpu.load_nd %tdesc_src[0, 0] : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
     %reduce = vector.multi_reduction <add>, %load, %cst [1] : vector<32x64xf32> to vector<32xf32>
-    %tdesc_dst = xegpu.create_nd_tdesc %dst : memref<32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<sg_layout = [32], sg_data = [1]>>
+    %tdesc_dst = xegpu.create_nd_tdesc %dst : memref<32xf32> -> !xegpu.tensor_desc<32xf32>
     xegpu.store_nd %reduce, %tdesc_dst[0] <{layout = #xegpu.layout<sg_layout = [32], sg_data = [1]>}>
-      : vector<32xf32>, !xegpu.tensor_desc<32xf32, #xegpu.layout<sg_layout = [32], sg_data = [1]>>
+      : vector<32xf32>, !xegpu.tensor_desc<32xf32>
     gpu.return
   }
 }
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 72d066d516540..8c1f85435c771 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -960,8 +960,8 @@ func.func @insert_strided_slice_with_slice_layout(%arg0: memref<8x16xf32>) {
   %cst_small8 = vector.extract_strided_slice %cst_large_new {offsets = [0], sizes = [8], strides = [1]} : vector<16xf32> to vector<8xf32>
   %cst_small16x8 = vector.broadcast %cst_small8 : vector<8xf32> to vector<16x8xf32>
   %cst_small8x16 = vector.transpose %cst_small16x8, [1, 0] : vector<16x8xf32> to vector<8x16xf32>
-  %tdesc = xegpu.create_nd_tdesc %arg0 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  xegpu.store_nd %cst_small8x16, %tdesc[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %tdesc = xegpu.create_nd_tdesc %arg0 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %cst_small8x16, %tdesc[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
 }
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 057c9b80926a5..d018c32bca694 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -32,11 +32,6 @@ gpu.func @load_nd() {
   %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
   %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-  %anchor = xegpu.convert_layout %1
-    <{
-      input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    }> : vector<16x16xf16>
   gpu.return
 }
 
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
index 952d54a43ae38..ec553aa33f49b 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -37,37 +37,33 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
   %0 = arith.muli %block_id_x, %c8 : index
   %1 = arith.muli %block_id_y, %c16 : index
   %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32>
   %3 = xegpu.load_nd %2[%0, %1]
-    {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-    layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+    {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
   %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
-      -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<8x16xbf16>
   %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
-      -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+      -> !xegpu.tensor_desc<16x16xbf16>
 
   %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
     %7 = xegpu.load_nd %5[%0, %arg3]
-      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16>
     %8 = xegpu.load_nd %6[%arg3, %1]
-      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
-      : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+      : !xegpu.tensor_desc<16x16xbf16> -> vector<16x16xbf16>
 
     %9 = xegpu.dpas %7, %8, %arg4
       {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
        layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
 
     scf.yield %9 : vector<8x16xf32>
-  } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+  }
   xegpu.store_nd %4, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}: vector<8x16xf32>,
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32>
   gpu.return
 }
 
@@ -105,44 +101,40 @@ gpu.func @gemm_with_preop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024
   %c16 = arith.constant 16 : index
   %c8 = arith.constant 8 : index
   %c1024 = arith.constant 1024 : index
-  %cst = arith.constant  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.0> : vector<8x16xbf16>
+  %cst = arith.constant dense<1.0> : vector<8x16xbf16>
   %block_id_x = gpu.block_id x
   %block_id_y = gpu.block_id y
   %0 = arith.muli %block_id_x, %c8 : index
   %1 = arith.muli %block_id_y, %c16 : index
   %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32>
   %3 = xegpu.load_nd %2[%0, %1]
-    {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-    layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+    {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
   %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
-      -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<8x16xbf16>
   %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
-      -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+      -> !xegpu.tensor_desc<16x16xbf16>
 
   %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
     %7 = xegpu.load_nd %5[%0, %arg3]
-      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
-    %preop = arith.addf %7, %cst {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xbf16>
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16>
+    %preop = arith.addf %7, %cst : vector<8x16xbf16>
     %8 = xegpu.load_nd %6[%arg3, %1]
-      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
-      : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+      : !xegpu.tensor_desc<16x16xbf16> -> vector<16x16xbf16>
 
     %9 = xegpu.dpas %preop, %8, %arg4
       {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
        layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
 
     scf.yield %9 : vector<8x16xf32>
-  } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+  }
   xegpu.store_nd %4, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}: vector<8x16xf32>,
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32>
   gpu.return
 }
 
@@ -181,38 +173,34 @@ gpu.func @gemm_with_postop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x102
   %0 = arith.muli %block_id_x, %c8 : index
   %1 = arith.muli %block_id_y, %c16 : index
   %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32>
   %3 = xegpu.load_nd %2[%0, %1]
-    {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-    layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+    {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
   %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
-      -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<8x16xbf16>
   %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
-      -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+      -> !xegpu.tensor_desc<16x16xbf16>
 
   %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
     %7 = xegpu.load_nd %5[%0, %arg3]
-      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16>
     %8 = xegpu.load_nd %6[%arg3, %1]
-      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
-      : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+      : !xegpu.tensor_desc<16x16xbf16> -> vector<16x16xbf16>
 
     %9 = xegpu.dpas %7, %8, %arg4
       {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
        layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
 
     scf.yield %9 : vector<8x16xf32>
-  } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-  %postop = math.exp %4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>
+  }
+  %postop = math.exp %4 : vector<8x16xf32>
   xegpu.store_nd %postop, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}: vector<8x16xf32>,
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32>
   gpu.return
 }
 }
@@ -237,15 +225,15 @@ gpu.module @xevm_module{
     %c0 = arith.constant 0 : index
     %cst = arith.constant dense<0.0> : vector<8x16xf32>
     %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
-      -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<8x16xf16>
     %1 = xegpu.load_nd %0[%c0, %c0]
       {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
-      !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+      !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
     %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16>
-      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+      -> !xegpu.tensor_desc<16x16xf16>
     %3 = xegpu.load_nd %2[%c0, %c0]
       {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
-      : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+      : !xegpu.tensor_desc<16x16xf16>
       -> vector<16x16xf16>
     %4 = xegpu.dpas %1, %3, %cst
       {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -253,17 +241,16 @@ gpu.module @xevm_module{
        layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
     %5 = math.exp %4
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<8x16xf32>
     %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> ->
-      !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      !xegpu.tensor_desc<8x16xf32>
     %anchor = xegpu.convert_layout %5
       <{
         input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
         target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
       }> : vector<8x16xf32>
     xegpu.store_nd %anchor, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
-      !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      !xegpu.tensor_desc<8x16xf32>
     gpu.return
   }
 }
@@ -287,8 +274,8 @@ gpu.module @xevm_module{
 // CHECK-SAME:        vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
 gpu.module @xevm_module{
   gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) {
-    %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
-    %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+    %1 = arith.constant dense<1>: vector<16xi1>
+    %offset = arith.constant dense<12> : vector<16xindex>
     %loaded = scf.if %pred -> (vector<16x8xf16>) {
       %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
         layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
@@ -345,33 +332,30 @@ gpu.module @xevm_module{
 gpu.module @xevm_module{
   gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) {
     %c0 = arith.constant 0 : index
-    %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.0> : vector<8x16xf32>
+    %cst = arith.constant dense<0.0> : vector<8x16xf32>
     %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
-      -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<8x16xf16>
     %1 = xegpu.load_nd %0[%c0, %c0]
-      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
     %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32>
-      -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>>
+      -> !xegpu.tensor_desc<16x8xi32>
     %3 = xegpu.load_nd %2[%c0, %c0]
-      {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>,
-       layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
-      : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>> -> vector<16x8xi32>
-    %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2], order = [0, 1]>}
+      {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
+      : !xegpu.tensor_desc<16x8xi32> -> vector<16x8xi32>
+    %4 = vector.bitcast %3
       : vector<16x8xi32> to vector<16x16xf16>
-    %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+    %5 = vector.transpose %4, [1, 0]
       : vector<16x16xf16> to vector<16x16xf16>
     %6 = xegpu.dpas %1, %5, %cst
       {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
        layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
     %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32>
-      -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<8x16xf32>
     xegpu.store_nd %6, %7[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
-      !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      !xegpu.tensor_desc<8x16xf32>
     gpu.return
   }
 }
@@ -448,15 +432,15 @@ gpu.module @xevm_module{
 gpu.module @xevm_module{
    gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) {
     %c0 = arith.constant 0 : index
-    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} dense<0.000000e+00> : vector<16xf16>
+    %cst = arith.constant dense<0.000000e+00> : vector<16xf16>
     %tdesc0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16>
-      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<16x16xf16>
     %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
-      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
-    %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
-    %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x16xf16>
-    xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<16x16xf16>
+    %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+    %1 = vector.multi_reduction <add>, %0, %cst [0] : vector<16x16xf16> to vector<16xf16>
+    %2 = vector.broadcast %1 : vector<16xf16> to vector<16x16xf16>
+    xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
     gpu.return
   }
 }
@@ -470,10 +454,10 @@ gpu.module @xevm_module{
     %9 = gpu.block_id x
     %10 = arith.index_cast %9 : index to i16
     %11 = arith.bitcast %10 : i16 to f16
-    %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
+    %2 = vector.broadcast %11 : f16 to vector<16x16xf16>
     %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
-      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<16x16xf16>
+    xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
     gpu.return
   }
 }
@@ -497,12 +481,12 @@ gpu.module @xevm_module {
     // CHECK: %[[VEC_RED:.*]] = vector.broadcast %{{.*}} : f32 to vector<1xf32>
     // CHECK: xegpu.store %[[VEC_RED]], %arg1[%[[CST]]], %[[CST_0]] : vector<1xf32>, memref<256xf32>, vector<1xindex>, vector<1xi1>
   gpu.func @vector_reduce_2d(%arg0: memref<4x16xf32>, %arg1: memref<256xf32>) {
-      %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
-      %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-      %1 = xegpu.load_nd %0[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<4x16xf32>
-      %2 = vector.broadcast %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
-      %3 = vector.multi_reduction <add>, %1, %2 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
-      %4 = vector.reduction <add>, %3 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} : vector<16xf32> into f32
+      %cst = arith.constant 1.000000e+00 : f32
+      %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32>
+      %1 = xegpu.load_nd %0[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<4x16xf32> -> vector<4x16xf32>
+      %2 = vector.broadcast %cst : f32 to vector<16xf32>
+      %3 = vector.multi_reduction <add>, %1, %2 [0] : vector<4x16xf32> to vector<16xf32>
+      %4 = vector.reduction <add>, %3 : vector<16xf32> into f32
       %anchor = xegpu.convert_layout %4
         <{
           input_layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, dims=[0]>,

>From 061a338d82f0d150afd75a867e176d10ddde2112 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 1 May 2026 22:53:59 +0000
Subject: [PATCH 05/11] fix tests

---
 mlir/test/Dialect/XeGPU/xegpu-blocking.mlir   |  80 ++++++-------
 .../XeGPU/xegpu-wg-to-sg-elemwise.mlir        | 110 +++++-------------
 .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 107 +++++++++--------
 3 files changed, 125 insertions(+), 172 deletions(-)

diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index cd7f8b9f69ff2..a9ba4306b3014 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -30,7 +30,7 @@ gpu.module @test_kernel {
       %c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
       scf.yield %c
         : vector<16x32xf32>
-    } {layout_result_0 = #c}
+    }
     //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
     xegpu.store_nd %out, %c_tdesc[0, 0] {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
     gpu.return
@@ -64,10 +64,10 @@ gpu.module @test_kernel {
       //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
       %b = xegpu.load_nd %b_tdesc[%k, %c0] {layout = #l2}: !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
       //CHECK-COUNT-8: xegpu.dpas {{.*}}
-      %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+      %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
       scf.yield %c
         : vector<16x32xf32>
-    } {layout_result_0 = #l1}
+    }
     //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
     xegpu.store_nd %out, %c_tdesc[0, 0] {layout = #l1}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
     gpu.return
@@ -105,10 +105,10 @@ gpu.module @test_kernel {
       %a = xegpu.load_nd %a_tdesc[%c0, %a_off] {layout = #l1}: !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16>
       //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
       %b = xegpu.load_nd %b_tdesc[%a_off, %c0] {layout = #l2}: !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
-      %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32>
+      %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32>
       scf.yield %c
         : vector<8x32xf32>
-    } {layout_result_0 = #l1}
+    }
     //CHECK-COUNT-2: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
     xegpu.store_nd %out, %c_tdesc[0, 0] {layout = #l1}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1>
     gpu.return
@@ -143,12 +143,12 @@ gpu.module @test_kernel {
       //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
       %b = xegpu.load_nd %b_tdesc[%k, %c0] {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
       //CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16>
-      %e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16>
+      %e = math.exp %a : vector<16x32xf16>
       //CHECK-COUNT-8: xegpu.dpas {{.*}}
-      %c = xegpu.dpas %e, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+      %c = xegpu.dpas %e, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
       scf.yield %c
         : vector<16x32xf32>
-    } {layout_result_0 = #c}
+    }
     //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
     xegpu.store_nd %out, %c_tdesc[0, 0] {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
     gpu.return
@@ -176,7 +176,7 @@ gpu.module @test_kernel {
       %b = xegpu.load_nd %b_tdesc[%c0, %k] {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
 
       //CHECK-COUNT-4: arith.addf {{.*}} : vector<8x16xf16>
-      %c = arith.addf %a, %b {layout_result_0 = #l} : vector<16x32xf16>
+      %c = arith.addf %a, %b : vector<16x32xf16>
 
       //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
       xegpu.store_nd %c, %c_tdesc[%c0, %k] {layout = #l}: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l>
@@ -206,7 +206,7 @@ gpu.module @test_kernel {
       %b = xegpu.load_nd %b_tdesc[%k] {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
 
       //CHECK-COUNT-4: arith.addf {{.*}} : vector<8xf16>
-      %c = arith.addf %a, %b {layout_result_0 = #l} : vector<32xf16>
+      %c = arith.addf %a, %b : vector<32xf16>
 
       //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8xf16>, !xegpu.tensor_desc<8xf16>
       xegpu.store_nd %c, %c_tdesc[%k] {layout = #l}: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l>
@@ -220,7 +220,7 @@ gpu.module @test_kernel {
 #r = #xegpu.slice<#xegpu.layout<inst_data = [16, 16]>, dims = [0]>
 gpu.module @test_kernel  {
   gpu.func @reduce_dim_0(%a: memref<16x512xf32>, %b: memref<512xf32>)  kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
-    %acc = arith.constant {layout_result_0 = #r} dense<0.0> : vector<64xf32>
+    %acc = arith.constant dense<0.0> : vector<64xf32>
     %c64 = arith.constant 64 : index
     %block_id_x = gpu.block_id x
     %m = arith.muli %block_id_x, %c64 : index
@@ -228,7 +228,7 @@ gpu.module @test_kernel  {
     %1 = xegpu.load_nd %0[0, 0] {layout = #l}: !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32>
     // CHECK: vector.multi_reduction <add>, {{.*}}, [[ACC:%[0-9A-Za-z]+]] [0] : vector<16x16xf32> to vector<16xf32>
     // CHECK-COUNT-3: vector.multi_reduction <add>, {{.*}}, [[ACC]] [0] : vector<16x16xf32> to vector<16xf32>
-    %2 = vector.multi_reduction <add>, %1, %acc {layout_result_0 = #r} [0]: vector<16x64xf32> to vector<64xf32>
+    %2 = vector.multi_reduction <add>, %1, %acc [0]: vector<16x64xf32> to vector<64xf32>
     %3 = xegpu.create_nd_tdesc %b : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
     xegpu.store_nd %2, %3[0] {layout = #r}: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r>
     gpu.return
@@ -242,7 +242,7 @@ gpu.module @test_kernel   {
   gpu.func @reduce_dim_1(%a: memref<512x32xf32>, %b: memref<512xf32>)  kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
     %c1 = arith.constant 1 : index
     %c32 = arith.constant 32 : index
-    %acc = arith.constant {layout_result_0 = #r} dense<0.0> : vector<32xf32>
+    %acc = arith.constant dense<0.0> : vector<32xf32>
 
     %block_id_x = gpu.block_id x
     %block_id_y = gpu.block_id y
@@ -255,7 +255,7 @@ gpu.module @test_kernel   {
     // CHECK: vector.multi_reduction <add>, {{.*}}, [[INIT:%[0-9A-Za-z]+]] [1] : vector<16x16xf32> to vector<16xf32>
     // CHECK-COUNT-1: vector.multi_reduction <add>, {{.*}}, [[INIT]] [1] : vector<16x16xf32> to vector<16xf32>
 
-    %2 = vector.multi_reduction <add>, %1, %acc {layout_result_0 = #r} [1]: vector<32x128xf32> to vector<32xf32>
+    %2 = vector.multi_reduction <add>, %1, %acc [1]: vector<32x128xf32> to vector<32xf32>
     %3 = xegpu.create_nd_tdesc %b : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
     xegpu.store_nd %2, %3[0] {layout = #r}: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r>
     gpu.return
@@ -274,7 +274,7 @@ gpu.module @test_kernel   {
     %0 = xegpu.create_nd_tdesc %a : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
     %1 = xegpu.load_nd %0[0] {layout = #r}: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32>
     // CHECK-COUNT-4: vector.broadcast {{.*}} : vector<16xf32> to vector<16x16xf32>
-    %2 = vector.broadcast  %1 {layout_result_0 = #l} : vector<64xf32> to vector<16x64xf32>
+    %2 = vector.broadcast  %1 : vector<64xf32> to vector<16x64xf32>
     %3 = xegpu.create_nd_tdesc %b : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l>
     xegpu.store_nd %2, %3[0, 0] {layout = #l}: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l>
     gpu.return
@@ -313,7 +313,7 @@ gpu.module @test_kernel   {
     %0 = xegpu.create_nd_tdesc %a : memref<512x8xf32> -> !xegpu.tensor_desc<32x8xf32, #l>
     %1 = xegpu.load_nd %0[0, 0] {layout = #l}: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32>
     // CHECK-COUNT-2: vector.transpose {{.*}}  [1, 0] : vector<16x8xf32> to vector<8x16xf32>
-    %2 = vector.transpose  %1, [1, 0] {layout_result_0 = #t} : vector<32x8xf32> to vector<8x32xf32>
+    %2 = vector.transpose  %1, [1, 0] : vector<32x8xf32> to vector<8x32xf32>
     %3 = xegpu.create_nd_tdesc %b : memref<8x512xf32> -> !xegpu.tensor_desc<8x32xf32, #t>
     xegpu.store_nd %2, %3[0, 0] {layout = #t}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t>
     gpu.return
@@ -325,8 +325,8 @@ gpu.module @test_kernel   {
 gpu.module @test_kernel {
   gpu.func @test_vector_constant_mask(%src: ui64, %dst: ui64) {
     //CHECK: arith.constant dense<true> : vector<16xi1>
-    %mask = vector.constant_mask [32] {layout_result_0 = #l} : vector<32xi1>
-    %cst = arith.constant {layout_result_0 = #l} dense<[
+    %mask = vector.constant_mask [32] : vector<32xi1>
+    %cst = arith.constant dense<[
       0,   8,  16,  24,  32,  40,  48,  56,
       64,  72,  80,  88,  96, 104, 112, 120,
       128, 136, 144, 152, 160, 168, 176, 184,
@@ -344,8 +344,8 @@ gpu.module @test_kernel {
   gpu.func @test_vector_create_mask(%src: ui64, %dst: ui64) {
     %c16 = arith.constant 16 : index
     //CHECK-COUNT-2: vector.create_mask {{.*}} : vector<16xi1>
-    %mask = vector.create_mask %c16 {layout_result_0 = #l} : vector<32xi1>
-    %cst = arith.constant {layout_result_0 = #l} dense<[
+    %mask = vector.create_mask %c16 : vector<32xi1>
+    %cst = arith.constant dense<[
       0,   8,  16,  24,  32,  40,  48,  56,
       64,  72,  80,  88,  96, 104, 112, 120,
       128, 136, 144, 152, 160, 168, 176, 184,
@@ -365,8 +365,8 @@ gpu.module @test_kernel {
     //CHECK: [[cst:%.+]] = arith.constant dense<16> : vector<16xindex>
     //CHECK: [[step:%.+]] = vector.step : vector<16xindex>
     //CHECK: arith.addi [[step]], [[cst]] : vector<16xindex>
-    %step = vector.step {layout_result_0 = #l} : vector<32xindex>
-    %mask = vector.create_mask %c16 {layout_result_0 = #l} : vector<32xi1>
+    %step = vector.step : vector<32xindex>
+    %mask = vector.create_mask %c16 : vector<32xi1>
     %ld = xegpu.load %src[%step], %mask {chunk_size = 1, layout = #l, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
     xegpu.store %ld, %dst[%step], %mask {chunk_size = 1, layout = #l, l1_hint = #xegpu.cache_hint<cached>} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>
     gpu.return
@@ -403,7 +403,7 @@ gpu.module @test_kernel {
     %a = xegpu.load_nd %a_tdesc[0, 0] {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
     %b = xegpu.load_nd %b_tdesc[0, 0] {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
     %a1 = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16>
-    %c = xegpu.dpas %a1, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+    %c = xegpu.dpas %a1, %b {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
     %c_tdesc = xegpu.create_nd_tdesc %C : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c>
     xegpu.store_nd %c, %c_tdesc[0, 0] {layout = #c}: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c>
     gpu.return
@@ -421,7 +421,7 @@ gpu.module @test_kernel {
     %c0 = arith.constant 0 : index
     %a_tdesc = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #a>
     %a = xegpu.load_nd %a_tdesc[0, 0] {layout = #a}: !xegpu.tensor_desc<16x16xf16, #a> -> vector<16x16xf16>
-    %a_reduce = vector.multi_reduction <add>, %a, %acc {layout_operand_0 = #a, layout_result_0 = #xegpu.slice<#a, dims = [0, 1]>} [0, 1] : vector<16x16xf16> to f16
+    %a_reduce = vector.multi_reduction <add>, %a, %acc [0, 1] : vector<16x16xf16> to f16
     %13 = xegpu.convert_layout %a_reduce <{input_layout = #xegpu.slice<#a, dims = [0, 1]>, target_layout = #xegpu.slice<#a, dims = [0, 1]>}> : f16
     memref.store %13, %arg1[%c0] : memref<4xf16>
     gpu.return
@@ -514,7 +514,7 @@ gpu.module @test_kernel {
   // CHECK-LABEL: load_with_offsets
   // CHECK-COUNT-2: xegpu.load  {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
   gpu.func @load_with_offsets(%src: ui64) -> vector<32xf32> {
-      %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
+      %cst = arith.constant dense<[
       0,   8,  16,  24,  32,  40,  48,  56,
       64,  72,  80,  88,  96, 104, 112, 120,
       128, 136, 144, 152, 160, 168, 176, 184,
@@ -522,7 +522,7 @@ gpu.module @test_kernel {
       ]> : vector<32xindex>
 
       %c17 = arith.constant 17: index
-      %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>}: vector<32xi1>
+      %mask = vector.create_mask %c17 : vector<32xi1>
       %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
 
       gpu.return %ld : vector<32xf32>
@@ -534,7 +534,7 @@ gpu.module @test_kernel {
   // CHECK-LABEL: store_with_offsets
   // CHECK-COUNT-2: xegpu.store  {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16xf32>, ui64, vector<16xindex>, vector<16xi1>
   gpu.func @store_with_offsets(%src: ui64) {
-      %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
+      %cst = arith.constant dense<[
       0,   8,  16,  24,  32,  40,  48,  56,
       64,  72,  80,  88,  96, 104, 112, 120,
       128, 136, 144, 152, 160, 168, 176, 184,
@@ -542,9 +542,9 @@ gpu.module @test_kernel {
       ]> : vector<32xindex>
 
       %c17 = arith.constant 17: index
-      %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>}: vector<32xi1>
+      %mask = vector.create_mask %c17 : vector<32xi1>
 
-      %st_vec = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<1023.0>: vector<32xf32>
+      %st_vec = arith.constant dense<1023.0>: vector<32xf32>
       xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>
 
       gpu.return
@@ -561,7 +561,7 @@ gpu.module @test_kernel {
   // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
   // CHECK-COUNT-4: xegpu.load  {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16x2xf32>
    gpu.func @load_with_offsets_chunk(%src: ui64) -> vector<32x4xf32> {
-    %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
+    %cst = arith.constant dense<[
         0,   8,  16,  24,  32,  40,  48,  56,
         64,  72,  80,  88,  96, 104, 112, 120,
         128, 136, 144, 152, 160, 168, 176, 184,
@@ -569,7 +569,7 @@ gpu.module @test_kernel {
     ]> : vector<32xindex>
 
     %c17 = arith.constant 17: index
-    %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>}: vector<32xi1>
+    %mask = vector.create_mask %c17 : vector<32xi1>
     %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
     gpu.return %ld : vector<32x4xf32>
    }
@@ -585,7 +585,7 @@ gpu.module @test_kernel {
   // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
   // CHECK-COUNT-4: xegpu.store  {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16x2xf32>, ui64, vector<16xindex>, vector<16xi1>
   gpu.func @store_with_offsets_chunk(%src: ui64) {
-    %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
+    %cst = arith.constant dense<[
       0,   8,  16,  24,  32,  40,  48,  56,
       64,  72,  80,  88,  96, 104, 112, 120,
       128, 136, 144, 152, 160, 168, 176, 184,
@@ -593,9 +593,9 @@ gpu.module @test_kernel {
     ]> : vector<32xindex>
 
     %c17 = arith.constant 17: index
-    %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>}: vector<32xi1>
+    %mask = vector.create_mask %c17 : vector<32xi1>
 
-    %st_vec = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16, 2]>} dense<1023.>: vector<32x4xf32>
+    %st_vec = arith.constant dense<1023.>: vector<32x4xf32>
     xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1>
     gpu.return
   }
@@ -614,14 +614,14 @@ gpu.module @test_kernel {
   // CHECK: [[ins_0:%.+]] = vector.insert_strided_slice [[ld_0]], [[cst]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x16xf32> into vector<1x1x32xf32>
   // CHECK: [[ins_1:%.+]] = vector.insert_strided_slice [[ld_1]], [[ins_0]] {offsets = [0, 0, 16], strides = [1, 1, 1]} : vector<1x1x16xf32> into vector<1x1x32xf32>
   gpu.func @preserve_unit_dim_of_load_inst_data(%src: ui64) -> vector<1x1x32xf32> {
-      %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<[[
+      %cst = arith.constant dense<[[
       [0,   8,  16,  24,  32,  40,  48,  56,
       64,  72,  80,  88,  96, 104, 112, 120,
       128, 136, 144, 152, 160, 168, 176, 184,
       192, 200, 208, 216, 224, 232, 240, 248]
       ]]> : vector<1x1x32xindex>
 
-      %mask = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<true> : vector<1x1x32xi1>
+      %mask = arith.constant dense<true> : vector<1x1x32xi1>
       %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [1, 1, 16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
 
       gpu.return %ld : vector<1x1x32xf32>
@@ -656,7 +656,7 @@ gpu.module @test_kernel {
     %a = xegpu.load_nd %a_tdesc[%c0, %c0] {layout = #l}: !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
     %b = xegpu.load_nd %b_tdesc[%c0, %c0] {layout = #l}: !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
 
-    %result = arith.addf %a, %b {layout_result_0 = #l} : vector<1x32xf32>
+    %result = arith.addf %a, %b : vector<1x32xf32>
     xegpu.store_nd %result, %c_tdesc[%c0, %c0] {layout = #l}: vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #l>
     gpu.return
   }
@@ -685,14 +685,14 @@ gpu.module @test_kernel {
     // CHECK: xegpu.store [[v5]], [[arg2]]{{\[}}[[c2]]], [[c0]]
     // CHECK-SAME: vector<1x1x16xf32>, ui64, vector<1x1x16xindex>, vector<1x1x16xi1>
   gpu.func @load_add_store_leading_unit_dims(%A: ui64, %B: ui64, %C: ui64) {
-    %cst = arith.constant {layout_result_0 = #inst_data} dense<[
+    %cst = arith.constant dense<[
       [[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
         128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]]
     ]> : vector<1x1x32xindex>
-    %mask = arith.constant {layout_result_0 = #inst_data} dense<true> : vector<1x1x32xi1>
+    %mask = arith.constant dense<true> : vector<1x1x32xi1>
     %a = xegpu.load %A[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
     %b = xegpu.load %B[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
-    %addf = arith.addf %a, %b {layout_result_0 = #inst_data} : vector<1x1x32xf32>
+    %addf = arith.addf %a, %b : vector<1x1x32xf32>
     xegpu.store %addf, %C[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : vector<1x1x32xf32>, ui64, vector<1x1x32xindex>, vector<1x1x32xi1>
     gpu.return
   }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
index 94e8b7504a1d6..3e8d183242a91 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
@@ -4,18 +4,14 @@ gpu.module @test_elementwise_ops {
 
   // CHECK-LABEL: unary_ops_sg_layout_only
   gpu.func @unary_ops_sg_layout_only(%a: memref<24x32xf32>) {
-    %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>>
+    %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
     %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>>
-      -> vector<24x32xf32>
+      : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
     // CHECK: math.exp {{.*}} : vector<12x8xf32>
     %exp = math.exp %load_a
-      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
       : vector<24x32xf32>
     // CHECK: arith.negf {{.*}} : vector<12x8xf32>
     %negf = arith.negf %exp
-      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
       : vector<24x32xf32>
     %anchor = xegpu.convert_layout %negf
       <{
@@ -27,18 +23,14 @@ gpu.module @test_elementwise_ops {
 
   // CHECK-LABEL: unary_ops
   gpu.func @unary_ops(%a: memref<24x32xf32>) {
-    %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
     %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
+      : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
     // CHECK: math.exp {{.*}} : vector<12x8xf32>
     %exp = math.exp %load_a
-      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xf32>
     // CHECK: arith.negf {{.*}} : vector<12x8xf32>
     %negf = arith.negf %exp
-      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xf32>
     %anchor = xegpu.convert_layout %negf
       <{
@@ -50,24 +42,16 @@ gpu.module @test_elementwise_ops {
 
   // CHECK-LABEL: binary_ops
   gpu.func @binary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>) {
-    %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32> 
+    %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
     %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
+      : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
     %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
+      : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
     // CHECK: arith.addf {{.*}}, {{.*}} : vector<12x8xf32>
-    %addf = arith.addf %load_a, %load_b
-      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : vector<24x32xf32>
+    %addf = arith.addf %load_a, %load_b : vector<24x32xf32>
     // CHECK: math.powf {{.*}}, {{.*}} : vector<12x8xf32>
-    %powf = math.powf %addf, %load_b
-      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : vector<24x32xf32>
+    %powf = math.powf %addf, %load_b : vector<24x32xf32>
     %anchor = xegpu.convert_layout %powf
       <{
         input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
@@ -78,28 +62,20 @@ gpu.module @test_elementwise_ops {
 
   // CHECK-LABEL: ternary_ops
   gpu.func @ternary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi1>) {
-    %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    %tdesc_c = xegpu.create_nd_tdesc %c : memref<24x32xi1>
-      -> !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
+    %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
+    %tdesc_c = xegpu.create_nd_tdesc %c : memref<24x32xi1> -> !xegpu.tensor_desc<24x32xi1>
     %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
+      : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
     %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
+      : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
     %load_c = xegpu.load_nd %tdesc_c[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xi1>
+      : !xegpu.tensor_desc<24x32xi1> -> vector<24x32xi1>
     // CHECK: arith.select {{.*}}, {{.*}}, {{.*}} : vector<12x8xi1>, vector<12x8xf32>
     %select = arith.select %load_c, %load_a, %load_b
-      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xi1>, vector<24x32xf32>
     // CHECK: math.fma  {{.*}}, {{.*}}, {{.*}} : vector<12x8xf32>
     %fma = math.fma %load_a, %load_b, %select
-      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xf32>
     %anchor = xegpu.convert_layout %fma
       <{
@@ -111,23 +87,17 @@ gpu.module @test_elementwise_ops {
 
   // CHECK-LABEL: type_conversion_ops
   gpu.func @type_conversion_ops(%a: memref<24x32xf32>, %b: memref<24x32xi32>) {
-    %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xi32>
-      -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
+    %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xi32> -> !xegpu.tensor_desc<24x32xi32>
     %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
+      : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
     %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xi32>
+      : !xegpu.tensor_desc<24x32xi32> -> vector<24x32xi32>
     // CHECK: arith.truncf {{.*}} : vector<12x8xf32> to vector<12x8xf16>
     %truncf = arith.truncf %load_a
-      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xf32> to vector<24x32xf16>
     // CHECK: arith.bitcast {{.*}} : vector<12x8xf16> to vector<12x8xi16>
     %bitcast = arith.bitcast %truncf
-      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xf16> to vector<24x32xi16>
     %anchor = xegpu.convert_layout %bitcast
       <{
@@ -139,33 +109,23 @@ gpu.module @test_elementwise_ops {
 
   // CHECK-LABEL: comparison_ops
   gpu.func @comparison_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi32>, %d: memref<24x32xi32>) {
-    %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    %tdesc_c = xegpu.create_nd_tdesc %c : memref<24x32xi32>
-      -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    %tdesc_d = xegpu.create_nd_tdesc %d : memref<24x32xi32>
-      -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
+    %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
+    %tdesc_c = xegpu.create_nd_tdesc %c : memref<24x32xi32> -> !xegpu.tensor_desc<24x32xi32>
+    %tdesc_d = xegpu.create_nd_tdesc %d : memref<24x32xi32> -> !xegpu.tensor_desc<24x32xi32>
     %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
+      : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
     %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
+      : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
     %load_c = xegpu.load_nd %tdesc_c[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xi32>
+      : !xegpu.tensor_desc<24x32xi32> -> vector<24x32xi32>
     %load_d = xegpu.load_nd %tdesc_d[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xi32>
+      : !xegpu.tensor_desc<24x32xi32> -> vector<24x32xi32>
     // CHECK: arith.cmpf ult, {{.*}}, {{.*}} : vector<12x8xf32>
     %cmpf = arith.cmpf ult, %load_a, %load_b
-      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xf32>
     // CHECK: arith.cmpi eq, {{.*}}, {{.*}} : vector<12x8xi32>
     %cmpi = arith.cmpi eq, %load_c, %load_d
-      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
       : vector<24x32xi32>
     %res = arith.select %cmpi, %cmpi, %cmpf : vector<24x32xi1>, vector<24x32xi1>
     %anchor = xegpu.convert_layout %res
@@ -179,25 +139,19 @@ gpu.module @test_elementwise_ops {
   // 1 to N decomposition of elementwise operations
   // CHECK-LABEL: elementwise_ops_rr_assignment
   gpu.func @elementwise_ops_rr_assignment(%a: memref<24x32xf32>, %b: memref<24x32xf32>) {
-     %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+     %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
+    %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
     %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
+      : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
     %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
+      : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
     // CHECK-COUNT-12: arith.negf {{.*}} : vector<2x2xf32>
     // CHECK-NOT: arith.negf
     %negf = arith.negf %load_a
-      {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
       : vector<24x32xf32>
     // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} : vector<2x2xf32>
     // CHECK-NOT: math.powf
     %powf = math.powf %negf, %load_b
-      {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
       : vector<24x32xf32>
     %anchor = xegpu.convert_layout %powf
       <{
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 2f43f9a840173..b1a6d81bc1140 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -17,9 +17,9 @@ gpu.module @test_distribution {
     // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
     // CHECK-NOT: xegpu.load_nd
     %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
-      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<256x128xf32>
     %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+      : !xegpu.tensor_desc<256x128xf32>
       -> vector<256x128xf32>
     %anchor = xegpu.convert_layout %load
       <{
@@ -34,23 +34,23 @@ gpu.module @test_distribution {
     // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     // CHECK-NOT: xegpu.store_nd
     %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
-      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<256x128xf32>
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+      : !xegpu.tensor_desc<256x128xf32>
       -> vector<256x128xf32>
     xegpu.store_nd %load, %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
-      : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+      : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32>
     gpu.return
   }
 
   // CHECK-LABEL: prefetch_nd
   gpu.func @prefetch_nd(%src: memref<256x128xf32>) {
-    // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     // CHECK-NOT: xegpu.prefetch_nd
     %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-    xegpu.prefetch_nd %tdesc[0, 0]
-      : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<256x128xf32>
+    xegpu.prefetch_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+      : !xegpu.tensor_desc<256x128xf32>
     gpu.return
   }
 
@@ -64,14 +64,14 @@ gpu.module @test_distribution {
     // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
     // CHECK-NOT: xegpu.dpas
     %tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16>
-      -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<256x128xf16>
     %load_a =  xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+      : !xegpu.tensor_desc<256x128xf16>
       -> vector<256x128xf16>
     %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x256xf16>
-      -> !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+      -> !xegpu.tensor_desc<128x256xf16>
     %load_b =  xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>}
-      : !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+      : !xegpu.tensor_desc<128x256xf16>
       -> vector<128x256xf16>
     %dpas = xegpu.dpas %load_a, %load_b
        {layout_a = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -84,17 +84,17 @@ gpu.module @test_distribution {
   // CHECK-LABEL: vector_reduce_dim_1
   gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) {
     // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32>
-    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>} dense<1.0> : vector<256xf32>
+    %cst = arith.constant dense<1.0> : vector<256xf32>
     %tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32>
-      -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>>
+      -> !xegpu.tensor_desc<256x64xf32>
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>}
-      : !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>>
+      : !xegpu.tensor_desc<256x64xf32>
       -> vector<256x64xf32>
     // CHECK-COUNT-2: vector.multi_reduction <add>, {{.*}}, %[[C0:.*]] [1] : vector<16x64xf32> to vector<16xf32>
     // CHECK-NOT: vector.multi_reduction
     // CHECK-COUNT-2: arith.addf {{.*}}, {{.*}} : vector<16xf32>
     // CHECK-NOT: arith.addf
-    %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>} [1]
+    %reduce = vector.multi_reduction <add>, %load, %cst [1]
       : vector<256x64xf32> to vector<256xf32>
     %anchor = xegpu.convert_layout %reduce
       <{
@@ -137,13 +137,13 @@ gpu.module @test_distribution {
   // CHECK-LABEL: vector_transpose
   gpu.func @vector_transpose(%src: memref<256x128xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-        -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
+        -> !xegpu.tensor_desc<256x128xf32>
     %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>}
-        : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
+        : !xegpu.tensor_desc<256x128xf32>
         -> vector<256x128xf32>
     // CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] : vector<32x16xf32> to vector<16x32xf32>
     // CHECK-NOT: vector.transpose
-    %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
+    %trans = vector.transpose %load, [1, 0]
     : vector<256x128xf32> to vector<128x256xf32>
     %anchor = xegpu.convert_layout %trans
       <{
@@ -157,7 +157,7 @@ gpu.module @test_distribution {
   gpu.func @vector_mask_2D() {
     // CHECK-COUNT-4: vector.create_mask {{.*}}, {{.*}} : vector<16x16xi1>
     // CHECK-NOT: vector.create_mask
-    %constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>} : vector<256x128xi1>
+    %constant_mask = vector.constant_mask [16, 16] : vector<256x128xi1>
     %anchor = xegpu.convert_layout %constant_mask
       <{
         input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>,
@@ -170,7 +170,7 @@ gpu.module @test_distribution {
     // CHECK-COUNT-4: vector.create_mask {{.*}}, {{.*}} : vector<16x16xi1>
     // CHECK-NOT: vector.create_mask
     %cst16 = arith.constant 16 : index
-    %constant_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>} : vector<256x128xi1>
+    %constant_mask = vector.create_mask %cst16, %cst16 : vector<256x128xi1>
     %anchor = xegpu.convert_layout %constant_mask
       <{
         input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>,
@@ -183,13 +183,13 @@ gpu.module @test_distribution {
   // CHECK: %[[CAST:.*]] = vector.shape_cast %[[REDUCE:.*]] : vector<8xf32> to vector<8x1xf32>
   // CHECK: %[[BCAST:.*]] = vector.broadcast %[[CAST]] : vector<8x1xf32> to vector<8x128xf32>
   gpu.func @distribute_shapecast_expandunitdims_broadcast(%arg0: memref<4096x128xf32>, %arg1: memref<4096x128xf32>) {
-    %cst_0 = arith.constant {layout_result_0=#xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>, dims = [1]>} dense<0xFF800000> : vector<256xf32>
+    %cst_0 = arith.constant dense<0xFF800000> : vector<256xf32>
     %block_id_x = gpu.block_id x
-    %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>>
-    %1 = xegpu.load_nd %0[%block_id_x, 0] {layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>}  : !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>> -> vector<256x128xf32>
-    %2 = vector.multi_reduction <maximumf>, %1, %cst_0 {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1], inst_data = [8, 1]>, dims = [1]>} [1] : vector<256x128xf32> to vector<256xf32>
-    %3 = vector.shape_cast %2 {layout_result_0 =  #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1], inst_data = [8, 1]>, layout_operand_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1], inst_data = [8, 1]>, dims = [1]>} : vector<256xf32> to vector<256x1xf32>
-    %4 = vector.broadcast %3 {layout_result_0 =  #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>} : vector<256x1xf32>to vector<256x128xf32>
+    %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
+    %1 = xegpu.load_nd %0[%block_id_x, 0] {layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>}  : !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -> vector<256x128xf32>
+    %2 = vector.multi_reduction <maximumf>, %1, %cst_0 [1] : vector<256x128xf32> to vector<256xf32>
+    %3 = vector.shape_cast %2 : vector<256xf32> to vector<256x1xf32>
+    %4 = vector.broadcast %3 : vector<256x1xf32>to vector<256x128xf32>
     %9 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
     xegpu.store_nd %4, %9[%block_id_x, 0] <{layout =#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>}>: vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
     gpu.return
@@ -239,11 +239,11 @@ gpu.module @test_distribution {
     // CHECK: %[[FINAL_RED1:.*]] = vector.multi_reduction <add>, %[[SLM_LOAD1]], %[[FINAL_NEUTRAL]] [1] : vector<4x16xf32> to vector<4xf32>
     // CHECK: %[[RES1:.*]] = arith.addf %[[FINAL_RED1]], %[[CST_ACC1]] : vector<4xf32>
 
-    %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>} dense<0> : vector<8x256xindex>
-    %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>, dims = [1]>} dense<0.000000e+00> : vector<8xf32>
-    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>} dense<true> : vector<8x256xi1>
+    %offset = arith.constant dense<0> : vector<8x256xindex>
+    %acc = arith.constant dense<0.000000e+00> : vector<8xf32>
+    %mask = arith.constant dense<true> : vector<8x256xi1>
     %val = xegpu.load %arg0[%offset], %mask <{layout = #xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>}> : memref<2048xf32, 1>, vector<8x256xindex>, vector<8x256xi1> -> vector<8x256xf32>
-    %reduce = vector.multi_reduction <add>, %val, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>, dims = [1]>} [1] : vector<8x256xf32> to vector<8xf32>
+    %reduce = vector.multi_reduction <add>, %val, %acc [1] : vector<8x256xf32> to vector<8xf32>
     %anchor = xegpu.convert_layout %reduce
       <{
         input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>, dims = [1]>,
@@ -278,8 +278,8 @@ gpu.module @test_distribution {
     // CHECK: %[[ADD4:.*]] = arith.addi %[[STEP]], %[[BCST4]] : vector<4xindex>
     // CHECK: %[[RES0:.*]] = vector.broadcast %[[ADD0]] : vector<4xindex> to vector<16x4xindex>
     // CHECK: %[[RES1:.*]] = vector.broadcast %[[ADD4]] : vector<4xindex> to vector<16x4xindex>
-    %2 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>, dims = [0]>} : vector<8xindex>
-    %bcast = vector.broadcast %2 {layout_result_0 = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>} : vector<8xindex> to vector<256x8xindex>
+    %2 = vector.step : vector<8xindex>
+    %bcast = vector.broadcast %2 : vector<8xindex> to vector<256x8xindex>
     %anchor = xegpu.convert_layout %bcast
       <{
         input_layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>,
@@ -301,14 +301,13 @@ gpu.module @test_distribution {
   // CHECK-SAME: %[[ARG_0:.*]]: memref<128x1xf32>
   gpu.func @broadcast(%src: memref<128x1xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src : memref<128x1xf32>
-      -> !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<128x1xf32>
     %load =  xegpu.load_nd %tdesc[0, 0] {layout =  #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
+      : !xegpu.tensor_desc<128x1xf32>
       -> vector<128x1xf32>
     // CHECK-COUNT-4: vector.broadcast {{.*}} : vector<16x1xf32> to vector<16x32xf32>
     // CHECK-NOT: vector.broadcast
     %broadcast = vector.broadcast %load
-      {layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
       : vector<128x1xf32> to vector<128x64xf32>
     %anchor = xegpu.convert_layout %broadcast
       <{
@@ -324,12 +323,12 @@ gpu.module @test_distribution {
     %c0 = arith.constant 0 : index
     %c256 = arith.constant 256 : index
     %c1024 = arith.constant 1024 : index
-    %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
-    %1 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
+    %1 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
     // CHECK-LABEL: scf.for
     scf.for %arg2 = %c0 to %c1024 step %c256 {
-      %3 = xegpu.load_nd %0[%arg2] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
-      xegpu.store_nd %3, %1[%arg2]  : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      %3 = xegpu.load_nd %0[%arg2] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
+      xegpu.store_nd %3, %1[%arg2] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : vector<256xf32>, !xegpu.tensor_desc<256xf32>
     }
     gpu.return
   }
@@ -339,9 +338,9 @@ gpu.module @test_distribution {
     %c10_i32 = arith.constant 10 : i32
     %c0_i32 = arith.constant 0 : i32
     %c256 = arith.constant 256 : index
-    %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
-    %1 = xegpu.load_nd %0[0] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
-    %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
+    %1 = xegpu.load_nd %0[0] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
+    %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
     // CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32)
     %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) {
       %4 = arith.cmpi slt, %arg3, %c10_i32 : i32
@@ -350,9 +349,9 @@ gpu.module @test_distribution {
     } do {
     // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: vector<16xf32>, [[arg4:%.+]]: i32)
     ^bb0(%arg2: vector<256xf32>, %arg3: i32):
-      xegpu.store_nd %arg2, %2[0]  : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      xegpu.store_nd %arg2, %2[0] {layout =  #xegpu.layout<sg_layout = [8], sg_data = [16]>} : vector<256xf32>, !xegpu.tensor_desc<256xf32>
       %4 = arith.addi %arg3, %c1_i32 : i32
-      %6 = xegpu.load_nd %0[%c256] {layout =  #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+      %6 = xegpu.load_nd %0[%c256] {layout =  #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
       scf.yield %6, %4 : vector<256xf32>, i32
     }
     gpu.return
@@ -361,23 +360,23 @@ gpu.module @test_distribution {
   gpu.func @scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c10 = arith.constant 10 : index
     %0 = gpu.subgroup_id : index
-    %1 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
-    %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    %1 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
+    %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
     %3 = arith.cmpi eq, %0, %c10 : index
     // CHECK-LABEL: scf.if
     // CHECK-SAME: (vector<16xf32>, vector<16xf32>)
     %4 = scf.if %3 -> (vector<256xf32>) {
-      %5 = xegpu.load_nd %1[0] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+      %5 = xegpu.load_nd %1[0] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
       // CHECK-LABEL: scf.yield
       // CHECK-SAME: vector<16xf32>, vector<16xf32>
       scf.yield %5 : vector<256xf32>
     } else {
-      %5 = xegpu.load_nd %2[0] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+      %5 = xegpu.load_nd %2[0] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
       // CHECK-LABEL: scf.yield
       // CHECK-SAME: vector<16xf32>, vector<16xf32>
       scf.yield %5 : vector<256xf32>
-    } {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [16]>}
-    xegpu.store_nd %4, %1[0]  : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    }
+    xegpu.store_nd %4, %1[0]  : vector<256xf32>, !xegpu.tensor_desc<256xf32>
     gpu.return
   }
 
@@ -407,10 +406,10 @@ gpu.module @test_distribution {
   }
 
   gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) {
-    %0 = xegpu.create_nd_tdesc %arg0 : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>>
+    %0 = xegpu.create_nd_tdesc %arg0 : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
     // CHECK-COUNT-2: xegpu.load_nd {{.*}}  : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32>
     // CHECK-COUNT-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32>
-    %1 = xegpu.load_nd %0[0, 0] {layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>} : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32>
+    %1 = xegpu.load_nd %0[0, 0] {layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>} : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
     %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>,
                                    target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32>
     gpu.return

>From 3b6aa925017c508d8bb02946398eb2b65c9eb760 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 2 May 2026 03:31:20 +0000
Subject: [PATCH 06/11] pass all tests

---
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      |  20 +-
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir   | 325 +++++++++---------
 2 files changed, 170 insertions(+), 175 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 5cd1a8e9c83ec..b295c74884447 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -175,9 +175,21 @@ static void propagateRegionResultsToYieldOperands(
       if (successor.isParent()) {
         // For parent successor, get layout from external use points of the
         // parent op's results.
-        layout = getLayoutFromUsePoints(regionBranchOp->getResult(i));
+        auto regionResult = regionBranchOp->getResult(i);
+        layout = getLayoutFromUsePoints(regionResult);
         if (layout)
-          xegpu::setTemporaryLayout(regionBranchOp->getResult(i), layout);
+          xegpu::setTemporaryLayout(regionResult, layout);
+        if (auto tensorDescTy =
+                dyn_cast<xegpu::TensorDescType>(regionResult.getType())) {
+          auto tDescLayout = tensorDescTy.getLayoutAttr();
+          if (!tDescLayout) {
+            auto typeWithLayout = xegpu::TensorDescType::get(
+                tensorDescTy.getContext(), tensorDescTy.getShape(),
+                tensorDescTy.getElementType(), tensorDescTy.getEncoding(),
+                layout);
+            regionResult.setType(typeWithLayout);
+          }
+        }
       } else {
         // For region successor, get layout from the target region's block
         // arg use points (e.g., "before/cond" region args for scf.while
@@ -186,7 +198,9 @@ static void propagateRegionResultsToYieldOperands(
       }
       if (!layout)
         continue;
-      if (isa<VectorType>(succOps[i].getType()))
+      auto operandType = succOps[i].getType();
+      if (isa<VectorType>(operandType) ||
+          dyn_cast<xegpu::TensorDescType>(operandType))
         xegpu::setTemporaryLayout(yieldOp->getOpOperand(beginIdx + i), layout);
     }
   }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index f9697d83baf58..0e79a8056418a 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -39,9 +39,9 @@ gpu.module @test_distribution {
     //CHECK-DAG: %[[OFF_X:.*]] = arith.remui %[[L_OFF_X]], %[[C128]] : index
     //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32>
     %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<256x128xf32>
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+      : !xegpu.tensor_desc<256x128xf32>
       -> vector<256x128xf32>
     %anchor = xegpu.convert_layout %load
       <{
@@ -56,24 +56,25 @@ gpu.module @test_distribution {
   gpu.func @store_nd(%src: memref<256x128xf32>) {
     //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>  : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
-      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<256x128xf32>
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+      : !xegpu.tensor_desc<256x128xf32>
       -> vector<256x128xf32>
     xegpu.store_nd %load, %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
-      : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+      : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32>
     gpu.return
 }
 
   // CHECK-LABEL: prefetch_nd
   // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
   gpu.func @prefetch_nd(%src: memref<256x128xf32>) {
-    //CHECK: xegpu.prefetch_nd %{{.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    //CHECK: xegpu.prefetch_nd %{{.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     %cst0 = arith.constant 0 : index
     %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<256x128xf32>
     xegpu.prefetch_nd %tdesc[%cst0, %cst0]
-      : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+      <{layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}>
+      : !xegpu.tensor_desc<256x128xf32>
     gpu.return
   }
 
@@ -81,14 +82,14 @@ gpu.module @test_distribution {
   gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
     // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
     %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16>
-      -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<128x128xf16>
     %load_a =  xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
+      : !xegpu.tensor_desc<128x128xf16>
       -> vector<128x128xf16>
     %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16>
-      -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+      -> !xegpu.tensor_desc<128x128xf16>
     %load_b =  xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>}
-      : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+      : !xegpu.tensor_desc<128x128xf16>
       -> vector<128x128xf16>
     %dpas = xegpu.dpas %load_a, %load_b
        {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -102,19 +103,15 @@ gpu.module @test_distribution {
   gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
     // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
     %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16>
-      -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1],
-      order = [1, 0]>>
+      -> !xegpu.tensor_desc<128x128xf16>
     %load_a =  xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1],
       order = [1, 0]>}
-      : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1],
-      order = [1, 0]>>
+      : !xegpu.tensor_desc<128x128xf16>
       -> vector<128x128xf16>
     %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16>
-      -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1],
-      order = [1, 0]>>
+      -> !xegpu.tensor_desc<128x128xf16>
     %load_b =  xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]> }
-      : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1],
-      order = [1, 0]>>
+      : !xegpu.tensor_desc<128x128xf16>
       -> vector<128x128xf16>
     %dpas = xegpu.dpas %load_a, %load_b
       {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1],
@@ -130,13 +127,12 @@ gpu.module @test_distribution {
   // CHECK-SAME: %[[ARG_0:.*]]: memref<256x1xf32>
   gpu.func @broadcast_dim1(%src: memref<256x1xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src : memref<256x1xf32>
-      -> !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<256x1xf32>
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
+      : !xegpu.tensor_desc<256x1xf32>
       -> vector<256x1xf32>
     // CHECK: vector.broadcast {{.*}}  : vector<32x1xf32> to vector<32x32xf32>
     %broadcast = vector.broadcast %load
-      {layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
       : vector<256x1xf32> to vector<256x32xf32>
     %anchor = xegpu.convert_layout %broadcast
       <{input_layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>,
@@ -149,13 +145,12 @@ gpu.module @test_distribution {
   // CHECK-SAME: %[[ARG_0:.*]]: memref<1x128xf32>
   gpu.func @broadcast_dim0(%src: memref<1x128xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src : memref<1x128xf32>
-      -> !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<1x128xf32>
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+      : !xegpu.tensor_desc<1x128xf32>
       -> vector<1x128xf32>
     // CHECK: vector.broadcast {{.*}} : vector<1x32xf32> to vector<32x32xf32>
     %broadcast = vector.broadcast %load
-      {layout_result_0 = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<1x128xf32> to vector<32x128xf32>
     %anchor = xegpu.convert_layout %broadcast
       <{
@@ -178,15 +173,15 @@ gpu.module @test_distribution {
     %block_id_y = gpu.block_id y
     %0 = arith.muli %block_id_x, %c128 : index
     %1 = arith.muli %block_id_y, %c128 : index
-    %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+    %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32>
     // CHECK: [[DESC_A:%.+]] = xegpu.create_nd_tdesc %[[ARG_0]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x128xf16>
     // CHECK: [[DESC_B:%.+]] = xegpu.create_nd_tdesc %[[ARG_1]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x16xf16>
-    %3 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
-    %4 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
+    %3 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16>
+    %4 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16>
     // load_nd with offset
-    %5 = xegpu.load_nd %2[%0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}: !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
-    %6 = xegpu.load_nd %3[%0, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
-    %7 = xegpu.load_nd %4[%c0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+    %5 = xegpu.load_nd %2[%0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}: !xegpu.tensor_desc<128x128xf32> -> vector<128x128xf32>
+    %6 = xegpu.load_nd %3[%0, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16>
+    %7 = xegpu.load_nd %4[%c0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16>
     // scf.for loop
     //      CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]]
     // CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) ->
@@ -203,14 +198,12 @@ gpu.module @test_distribution {
            layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
            layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
           : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
-      %10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
-      %11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+      %10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16>
+      %11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16>
       scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>
-    }  {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
-        layout_result_1 = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
-        layout_result_2 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+    }
     // store_nd with offset
-    xegpu.store_nd %8#2, %2[%0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>} : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+    xegpu.store_nd %8#2, %2[%0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>} : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32>
     gpu.return
   }
 
@@ -228,9 +221,9 @@ gpu.module @test_distribution {
     scf.if %cond {
         // CHECK-NOT: index.sub
         %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-          -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+          -> !xegpu.tensor_desc<256x128xf32>
         %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>}
-          : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+          : !xegpu.tensor_desc<256x128xf32>
           -> vector<256x128xf32>
     } {sg_id_range = #xegpu.range<[0, 32]>}
     %cond3 = arith.cmpi sge, %sg_id, %c2 : index
@@ -241,11 +234,11 @@ gpu.module @test_distribution {
       // CHECK: %[[C2:.*]] = arith.constant 2 : index
       // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]]
       %tdesc = xegpu.create_nd_tdesc %src2 : memref<128x64xf32>
-        -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
+        -> !xegpu.tensor_desc<128x64xf32>
       %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>}
-        : !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
+        : !xegpu.tensor_desc<128x64xf32>
         -> vector<128x64xf32>
-      %exp = math.exp %load {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
+      %exp = math.exp %load : vector<128x64xf32>
       %anchor = xegpu.convert_layout %exp
         <{
           input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>,
@@ -262,9 +255,9 @@ gpu.module @test_distribution {
     %c3 = arith.constant 3 : index
     %c32 = arith.constant 32 : index
     %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+      -> !xegpu.tensor_desc<256x128xf32>
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+      : !xegpu.tensor_desc<256x128xf32>
       -> vector<256x128xf32>
     %cond1 = arith.cmpi sge, %sg_id, %c3 : index
     %cond2 = arith.cmpi slt, %sg_id, %c32 : index
@@ -275,11 +268,11 @@ gpu.module @test_distribution {
         // CHECK: %[[C3:.*]] = arith.constant 3 : index
         // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C3]]
         %td = xegpu.create_nd_tdesc %src1 : memref<128x64xf32>
-          -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
+          -> !xegpu.tensor_desc<128x64xf32>
         %ld =  xegpu.load_nd %td[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>}
-          : !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
+          : !xegpu.tensor_desc<128x64xf32>
           -> vector<128x64xf32>
-        %exp = math.exp %ld {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
+        %exp = math.exp %ld : vector<128x64xf32>
         %anchor = xegpu.convert_layout %exp
           <{
             input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>,
@@ -297,8 +290,8 @@ gpu.module @test_distribution {
     // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<32x4xi1>
     // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}>
     // CHECK-SAME: : memref<?xf16>, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16>
-    %offset =  arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>} dense<0> : vector<256x16xindex>
-    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>} dense<1> : vector<256x16xi1>
+    %offset =  arith.constant dense<0> : vector<256x16xindex>
+    %mask = arith.constant dense<1> : vector<256x16xi1>
     %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>, l1_hint = #xegpu.cache_hint<cached>}
       : memref<?xf16>, vector<256x16xindex>, vector<256x16xi1> -> vector<256x16xf16>
     gpu.return
@@ -312,12 +305,10 @@ gpu.module @test_distribution {
     // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1>
     // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}>
      // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
-    %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<25.5> : vector<256xf16>
-    %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<0> : vector<256xindex>
-    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<1> : vector<256xi1>
+    %val = arith.constant dense<25.5> : vector<256xf16>
+    %offset = arith.constant dense<0> : vector<256xindex>
+    %mask = arith.constant dense<1> : vector<256xi1>
     xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
-                                             layout_operand_2 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
-                                             layout_operand_3 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
                                              l1_hint = #xegpu.cache_hint<cached>}
       : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1>
     gpu.return
@@ -330,8 +321,8 @@ gpu.module @test_distribution {
     // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1>
     // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 4 : i64, l1_hint = #xegpu.cache_hint<cached>}>
     // CHECK-SAME: : memref<?xf16>, vector<8xindex>, vector<8xi1> -> vector<8x4xf16>
-    %offset =  arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<0> : vector<256xindex>
-    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<1> : vector<256xi1>
+    %offset =  arith.constant dense<0> : vector<256xindex>
+    %mask = arith.constant dense<1> : vector<256xi1>
     %load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 4]>, l1_hint = #xegpu.cache_hint<cached>}
       : memref<?xf16>, vector<256xindex>, vector<256xi1> -> vector<256x4xf16>
     gpu.return
@@ -381,7 +372,7 @@ gpu.module @test_distribution {
     //CHECK: [[c128:%.+]] = arith.constant 128 : index
     //CHECK: [[off_x:%.+]] = arith.remui [[l_off_x]], [[c128]] : index
     //CHECK: xegpu.store_matrix [[cst]], [[mdesc]][[[off_y]], [[off_x]]] : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>, index, index
-    %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} dense<1.0> : vector<64x128xf32>
+    %cst = arith.constant dense<1.0> : vector<64x128xf32>
     %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
     xegpu.store_matrix %cst, %mdesc[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32>
     gpu.return
@@ -389,14 +380,14 @@ gpu.module @test_distribution {
 
   // CHECK-LABEL: @vector_reduce_dim_0
   gpu.func @vector_reduce_dim_0(%src: memref<4x128xf32>) {
-    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>} dense<1.0> : vector<128xf32>
+    %cst = arith.constant dense<1.0> : vector<128xf32>
     %tdesc = xegpu.create_nd_tdesc %src : memref<4x128xf32>
-      -> !xegpu.tensor_desc<4x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>>
+      -> !xegpu.tensor_desc<4x128xf32>
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>}
-      : !xegpu.tensor_desc<4x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>>
+      : !xegpu.tensor_desc<4x128xf32>
       -> vector<4x128xf32>
     // CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [0] : vector<4x4xf32> to vector<4xf32>
-    %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>} [0]
+    %reduce = vector.multi_reduction <add>, %load, %cst [0]
       : vector<4x128xf32> to vector<128xf32>
     %anchor = xegpu.convert_layout %reduce
       <{input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>, 
@@ -407,14 +398,14 @@ gpu.module @test_distribution {
 
   // CHECK-LABEL: @vector_reduce_dim_1
   gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) {
-    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>} dense<1.0> : vector<256xf32>
+    %cst = arith.constant dense<1.0> : vector<256xf32>
     %tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32>
-      -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>>
+      -> !xegpu.tensor_desc<256x64xf32>
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>}
-      : !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>>
+      : !xegpu.tensor_desc<256x64xf32>
       -> vector<256x64xf32>
     // CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32>
-    %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>} [1]
+    %reduce = vector.multi_reduction <add>, %load, %cst [1]
       : vector<256x64xf32> to vector<256xf32>
     %anchor = xegpu.convert_layout %reduce
       <{input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>, 
@@ -425,12 +416,12 @@ gpu.module @test_distribution {
 
   // CHECK-LABEL: @vector_reduce_4D
    gpu.func @vector_reduce_4D(%src: ui64) {
-      %cst_acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} dense<0.0> : vector<4x2x6xf16>
-      %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<0>  : vector<4x2x6x32xindex>
-      %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<true> : vector<4x2x6x32xi1>
+      %cst_acc = arith.constant dense<0.0> : vector<4x2x6xf16>
+      %offset = arith.constant dense<0>  : vector<4x2x6x32xindex>
+      %mask = arith.constant dense<true> : vector<4x2x6x32xi1>
       %load = xegpu.load %src[%offset], %mask  {layout = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16>
       // CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [3] : vector<1x1x1x32xf16> to vector<1x1x1xf16>
-      %reduce = vector.multi_reduction <add>, %load, %cst_acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} [3]
+      %reduce = vector.multi_reduction <add>, %load, %cst_acc [3]
       : vector<4x2x6x32xf16> to vector<4x2x6xf16>
       %anchor = xegpu.convert_layout %reduce
         <{
@@ -456,13 +447,13 @@ gpu.module @test_distribution {
   // CHECK-DAG: %[[FINAL:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_FINAL]] [0, 1] : vector<4x4xf32> to f32
   // CHECK-DAG: arith.addf %[[FINAL]], %[[CST]] : f32
   gpu.func @vector_reduce_scalar_cross_sg(%src: memref<32x32xf32>) {
-    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>, dims = [0, 1]>} 0.0 : f32
+    %cst = arith.constant 0.0 : f32
     %tdesc = xegpu.create_nd_tdesc %src : memref<32x32xf32>
-      -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>>
+      -> !xegpu.tensor_desc<32x32xf32>
     %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>}
-      : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>>
+      : !xegpu.tensor_desc<32x32xf32>
       -> vector<32x32xf32>
-    %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>, dims = [0, 1]>} [0, 1]
+    %reduce = vector.multi_reduction <add>, %load, %cst [0, 1]
       : vector<32x32xf32> to f32
     %anchor = xegpu.convert_layout %reduce
         <{
@@ -487,7 +478,7 @@ gpu.module @test_distribution {
     //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
     //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
     //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
-    %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex>
+    %step = vector.step : vector<128xindex>
     %anchor = xegpu.convert_layout %step
       <{
         input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>,
@@ -507,7 +498,7 @@ gpu.module @test_distribution {
     //CHECK: [[BASE:%.+]] = vector.step : vector<8xindex>
     //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex>
     //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex>
-    %step = vector.step {layout_result_0 = #xegpu.layout<sg_layout = [16], sg_data = [8]>}: vector<128xindex>
+    %step = vector.step : vector<128xindex>
     %anchor = xegpu.convert_layout %step
       <{
         input_layout = #xegpu.layout<sg_layout = [16], sg_data = [8]>,
@@ -530,11 +521,11 @@ gpu.module @test_distribution {
 
   // CHECK-LABEL: vector_shape_cast
   gpu.func @vector_shape_cast() {
-    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, dims = [0, 1, 2]>} dense<10> : vector<128xindex>
-    %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, dims = [0, 1, 2]>} : vector<128xindex>
-    %muli = arith.muli %cst, %step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, dims = [0, 1, 2]>} : vector<128xindex>
+    %cst = arith.constant dense<10> : vector<128xindex>
+    %step = vector.step : vector<128xindex>
+    %muli = arith.muli %cst, %step : vector<128xindex>
     //CHECK: vector.shape_cast {{.*}} : vector<32xindex> to vector<1x1x1x32xindex>
-    %shape_cast = vector.shape_cast %muli {layout_result_0 = #xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, layout_operand_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, dims = [0, 1, 2]>} : vector<128xindex> to vector<1x1x1x128xindex>
+    %shape_cast = vector.shape_cast %muli : vector<128xindex> to vector<1x1x1x128xindex>
     %anchor = xegpu.convert_layout %shape_cast
       <{
         input_layout = #xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>,
@@ -547,7 +538,7 @@ gpu.module @test_distribution {
   gpu.func @vector_broadcast(%arg0: index, %arg1: index) {
     %muli = arith.muli %arg0, %arg1 : index
     // CHECK: vector.broadcast {{.*}} : index to vector<1x1x1x32xindex>
-    %broadcast = vector.broadcast %muli {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} : index to vector<4x2x6x32xindex>
+    %broadcast = vector.broadcast %muli : index to vector<4x2x6x32xindex>
     %anchor = xegpu.convert_layout %broadcast
       <{
         input_layout = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>,
@@ -559,12 +550,12 @@ gpu.module @test_distribution {
   // CHECK-LABEL: vector_transpose
   gpu.func @vector_transpose(%src: memref<256x32xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src : memref<256x32xf32>
-        -> !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
+        -> !xegpu.tensor_desc<256x32xf32>
     %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>}
-        : !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
+        : !xegpu.tensor_desc<256x32xf32>
         -> vector<256x32xf32>
     //CHECK: vector.transpose {{.*}}, [1, 0] : vector<64x32xf32> to vector<32x64xf32>
-    %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1]>}
+    %trans = vector.transpose %load, [1, 0]
       : vector<256x32xf32> to vector<32x256xf32>
     %anchor = xegpu.convert_layout %trans
       <{
@@ -586,7 +577,7 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[T6:.*]] = arith.addi %[[T4]], %[[T5]] : index
     // CHECK-DAG: %[[T7:.*]] = vector.broadcast %[[T6]] : index to vector<1x1xindex>
     // CHECK-DAG: %[[T8:.*]] = arith.addi %[[CST]], %[[T7]] : vector<1x1xindex>
-    %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 1]>} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex>
+    %cst = arith.constant dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex>
     %anchor = xegpu.convert_layout %cst
       <{
         input_layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 1]>,
@@ -659,7 +650,7 @@ gpu.module @test_distribution {
   // CHECK-LABEL: scalar_broadcast
   gpu.func @scalar_broadcast(%arg0: index) {
     // CHECK: vector.broadcast {{.*}} : index to vector<1x1x1xindex>
-    %broadcast = vector.broadcast %arg0 {layout_result_0 = #xegpu.layout<sg_layout = [4, 8, 1], sg_data = [1, 1, 1]>} : index to vector<4x1x1xindex>
+    %broadcast = vector.broadcast %arg0 : index to vector<4x1x1xindex>
     %anchor = xegpu.convert_layout %broadcast
       <{
         input_layout = #xegpu.layout<sg_layout = [4, 8, 1], sg_data = [1, 1, 1]>,
@@ -678,7 +669,7 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[MAX:.*]] = arith.maxsi %[[SUB]], %[[C0:.*]] : index
     // CHECK-DAG: %[[MIN:.*]] = arith.minsi %[[MAX]], %[[C16:.*]] : index
     // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MIN]] : vector<16xi1>
-    %constant_mask = vector.constant_mask [8] {layout_result_0 = #xegpu.layout<sg_layout = [2], sg_data = [16]>} : vector<32xi1>
+    %constant_mask = vector.constant_mask [8] : vector<32xi1>
     %anchor = xegpu.convert_layout %constant_mask
       <{
         input_layout = #xegpu.layout<sg_layout = [2], sg_data = [16]>,
@@ -704,7 +695,7 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[MAXCOL:.*]] = arith.maxsi %[[SUBCOL]], %[[C7:.*]] : index
     // CHECK-DAG: %[[MINCOL:.*]] = arith.minsi %[[MAXCOL]], %[[C32:.*]] : index
     // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MINROW]], %[[MINCOL]] : vector<32x32xi1>
-    %constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xi1>
+    %constant_mask = vector.constant_mask [16, 16] : vector<256x128xi1>
     %anchor = xegpu.convert_layout %constant_mask
       <{
         input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>,
@@ -724,7 +715,7 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[MIN:.*]] = arith.minsi %[[MAX]], %[[C16:.*]] : index
     // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MIN]] : vector<16xi1>
     %cst8 = arith.constant 8 : index
-    %constant_mask = vector.create_mask %cst8 {layout_result_0 = #xegpu.layout<sg_layout = [2], sg_data = [16]>} : vector<32xi1>
+    %constant_mask = vector.create_mask %cst8 : vector<32xi1>
     %anchor = xegpu.convert_layout %constant_mask
       <{
         input_layout = #xegpu.layout<sg_layout = [2], sg_data = [16]>,
@@ -751,7 +742,7 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[MINCOL:.*]] = arith.minsi %[[MAXCOL]], %[[C32:.*]] : index
     // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MINROW]], %[[MINCOL]] : vector<32x32xi1>
     %cst16 = arith.constant 16 : index
-    %constant_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xi1>
+    %constant_mask = vector.create_mask %cst16, %cst16 : vector<256x128xi1>
     %anchor = xegpu.convert_layout %constant_mask
       <{
         input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>,
@@ -763,16 +754,15 @@ gpu.module @test_distribution {
   // CHECK-LABEL: distribute_load_slice_attr
   gpu.func @distribute_load_slice_attr() {
     %2 = memref.alloca() {alignment = 1024} : memref<4096xf32>
-    %offset =  arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<0> : vector<256xindex>
-    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<1> : vector<256xi1>
+    %offset =  arith.constant dense<0> : vector<256xindex>
+    %mask = arith.constant dense<1> : vector<256xi1>
 
     // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>}>
     // CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32>
     %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32>
 
     // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] : vector<32xf32> to vector<32x32xf32>
-    %4 = vector.broadcast %3 {layout_result_0 =
-        #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>} : vector<256xf32> to vector<256x256xf32>
+    %4 = vector.broadcast %3 : vector<256xf32> to vector<256x256xf32>
     %anchor = xegpu.convert_layout %4
       <{
         input_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>,
@@ -801,11 +791,11 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_3]] [1] : vector<1x32x32xf32> to vector<1x32xf32>
     // CHECK-DAG: %[[ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<1x32xf32>
     // CHECK-DAG: gpu.return
-    %cst_3 = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>} dense<1.0> : vector<1x32xf32>
-    %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>} dense<0> : vector<1x32x32xindex>
-    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>} dense<true> : vector<1x32x32xi1>
+    %cst_3 = arith.constant dense<1.0> : vector<1x32xf32>
+    %offset = arith.constant dense<0> : vector<1x32x32xindex>
+    %mask = arith.constant dense<true> : vector<1x32x32xi1>
     %14 = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>} : memref<?xf32>, vector<1x32x32xindex>, vector<1x32x32xi1> -> vector<1x32x32xf32>
-    %15 = vector.multi_reduction <add>, %14, %cst_3 {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>} [1] : vector<1x32x32xf32> to vector<1x32xf32>
+    %15 = vector.multi_reduction <add>, %14, %cst_3 [1] : vector<1x32x32xf32> to vector<1x32xf32>
     %anchor = xegpu.convert_layout %15
       <{
         input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>,
@@ -840,13 +830,13 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[CST_CROSS_SG_1:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32>
     // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_CROSS_SG_1]] [0] : vector<8x32xf32> to vector<32xf32>
     // CHECK-DAG: arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<32xf32>
-    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} dense<0.0> : vector<128xf32>
+    %cst = arith.constant dense<0.0> : vector<128xf32>
     %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
+      -> !xegpu.tensor_desc<256x128xf32>
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>}
-      : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
+      : !xegpu.tensor_desc<256x128xf32>
       -> vector<256x128xf32>
-    %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0]
+    %reduce = vector.multi_reduction <add>, %load, %cst [0]
       : vector<256x128xf32> to vector<128xf32>
     %anchor = xegpu.convert_layout %reduce
       <{
@@ -876,11 +866,11 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_3]] [2, 3] : vector<1x1x4x4xf32> to vector<1x1xf32>
     // CHECK-DAG: %[[FINAL_ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<1x1xf32>
     // CHECK-DAG: gpu.return
-    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>} dense<0.0> : vector<2x2xf32>
-    %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>} dense<0> : vector<2x2x128x128xindex>
-    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>} dense<true> : vector<2x2x128x128xi1>
+    %cst = arith.constant dense<0.0> : vector<2x2xf32>
+    %offset = arith.constant dense<0> : vector<2x2x128x128xindex>
+    %mask = arith.constant dense<true> : vector<2x2x128x128xi1>
     %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>} : memref<?xf32>, vector<2x2x128x128xindex>, vector<2x2x128x128xi1> -> vector<2x2x128x128xf32>
-    %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>} [2, 3] : vector<2x2x128x128xf32> to vector<2x2xf32>
+    %reduce = vector.multi_reduction <add>, %load, %cst [2, 3] : vector<2x2x128x128xf32> to vector<2x2xf32>
     %anchor = xegpu.convert_layout %reduce
       <{
         input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>,
@@ -909,11 +899,11 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_3]] [2, 3] : vector<16x16x4x4xf32> to vector<16x16xf32>
     // CHECK-DAG: %[[FINAL_ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<16x16xf32>
     // CHECK-DAG: gpu.return
-    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>} dense<0.0> : vector<32x32xf32>
-    %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>} dense<0> : vector<32x32x128x128xindex>
-    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>} dense<true> : vector<32x32x128x128xi1>
+    %cst = arith.constant dense<0.0> : vector<32x32xf32>
+    %offset = arith.constant dense<0> : vector<32x32x128x128xindex>
+    %mask = arith.constant dense<true> : vector<32x32x128x128xi1>
     %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>} : memref<?xf32>, vector<32x32x128x128xindex>, vector<32x32x128x128xi1> -> vector<32x32x128x128xf32>
-    %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>} [2, 3] : vector<32x32x128x128xf32> to vector<32x32xf32>
+    %reduce = vector.multi_reduction <add>, %load, %cst [2, 3] : vector<32x32x128x128xf32> to vector<32x32xf32>
     %anchor = xegpu.convert_layout %reduce
       <{
         input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>,
@@ -924,19 +914,13 @@ gpu.module @test_distribution {
 
   // CHECK-LABEL: load_nd_tdesc_with_anchor_layout
   gpu.func @load_nd_tdesc_with_anchor_layout(%src: memref<256x128xf32>) {
-    //CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    //CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
     %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-    // CHECK: xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>}>
-    // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32>
+      -> !xegpu.tensor_desc<256x128xf32>
+    // CHECK: xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32>
     %load =  xegpu.load_nd %tdesc[0, 0] <{layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [32, 16],lane_layout = [1, 16], lane_data = [1, 1]>}>
-      : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+      : !xegpu.tensor_desc<256x128xf32>
       -> vector<256x128xf32>
-    %anchor = xegpu.convert_layout %load
-      <{
-        input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
-        target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>
-      }> : vector<256x128xf32>
     gpu.return
   }
 
@@ -963,7 +947,7 @@ gpu.module @test_distribution {
       %10 = xegpu.convert_layout %8 <{input_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [32, 16]>, target_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [16, 16]>}> : vector<32x256xf16>
       %11 = xegpu.dpas %9, %10, %arg4 {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [16, 16]>, layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>} : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf32> -> vector<256x256xf32>
       scf.yield %11 : vector<256x256xf32>
-    } {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>}
+    }
     xegpu.store_nd %6, %2[%0, %1] <{layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>}> : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>>
     gpu.return
   }
@@ -1001,8 +985,8 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[LOAD_OFF_Y:.*]] = arith.remui %[[LOAD_MUL_Y]], %[[C128:.*]] : index
     // CHECK-DAG: %[[LOAD_OFF_X:.*]] = arith.remui %[[LOAD_MUL_X]], %[[C256:.*]] : index
     // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MDESC]][%[[LOAD_OFF_Y]], %[[LOAD_OFF_X]]] <{layout = #xegpu.layout<inst_data = [16, 16]>}>: !xegpu.mem_desc<128x256xf32>, index, index -> vector<16x32xf32>
-    %0 = xegpu.create_nd_tdesc %arg0 : memref<128x256xf32> -> !xegpu.tensor_desc<128x256xf32, #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>>
-    %1 = xegpu.load_nd %0[0, 0] {layout = #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>} : !xegpu.tensor_desc<128x256xf32, #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>> -> vector<128x256xf32>
+    %0 = xegpu.create_nd_tdesc %arg0 : memref<128x256xf32> -> !xegpu.tensor_desc<128x256xf32>
+    %1 = xegpu.load_nd %0[0, 0] {layout = #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>} : !xegpu.tensor_desc<128x256xf32> -> vector<128x256xf32>
     %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>,
                                    target_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 32], inst_data = [16, 16]>}> : vector<128x256xf32>
     %anchor = xegpu.convert_layout %2
@@ -1043,8 +1027,8 @@ gpu.module @test_distribution {
     // CHECK-DAG: %[[LOAD_OFF_Y:.*]] = arith.remui %[[LOAD_MUL_Y]], %[[C128:.*]] : index
     // CHECK-DAG: %[[LOAD_OFF_X:.*]] = arith.remui %[[LOAD_MUL_X]], %[[C256:.*]] : index
     // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MDESC]][%[[LOAD_OFF_Z]], %[[LOAD_OFF_Y]], %[[LOAD_OFF_X]]] <{layout = #xegpu.layout<inst_data = [1, 16, 16]>}>: !xegpu.mem_desc<8x128x256xf32>, index, index, index -> vector<1x16x32xf32>
-    %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4, 16], sg_data = [1, 32, 16], inst_data = [1, 16, 16]>} dense<0> : vector<8x128x256xindex>
-    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4, 16], sg_data = [1, 32, 16], inst_data = [1, 16, 16]>} dense<true> : vector<8x128x256xi1>
+    %offset = arith.constant dense<0> : vector<8x128x256xindex>
+    %mask = arith.constant dense<true> : vector<8x128x256xi1>
     %1 = xegpu.load %arg0[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [8, 4, 16], sg_data = [1, 32, 16], inst_data = [1, 16, 16]>} : memref<?xf32>, vector<8x128x256xindex>, vector<8x128x256xi1> -> vector<8x128x256xf32>
     %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [8, 4, 16], sg_data = [1, 32, 16], inst_data = [1, 16, 16]>,
                                    target_layout = #xegpu.layout<sg_layout = [8, 8, 8], sg_data = [1, 16, 32], inst_data = [1, 16, 16]>}> : vector<8x128x256xf32>
@@ -1058,13 +1042,13 @@ gpu.module @test_distribution {
 
   // CHECK-LABEL: convert_layout_reduce_to_scalar
   gpu.func @convert_layout_reduce_to_scalar(%arg0: memref<32x32xf32>) {
-    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>} dense<true> : vector<32x32xi1>
-    %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>} dense<0> : vector<32x32xindex>
+    %mask = arith.constant dense<true> : vector<32x32xi1>
+    %offset = arith.constant dense<0> : vector<32x32xindex>
     %cst_0 = arith.constant 0.000000e+00 : f32
     %intptr = memref.extract_aligned_pointer_as_index %arg0 : memref<32x32xf32> -> index
     %10 = arith.index_cast %intptr : index to i64
-    %11 = xegpu.load %10[%offset], %mask <{layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>}> {layout_operand_1 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, layout_operand_2 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>} : i64, vector<32x32xindex>, vector<32x32xi1> -> vector<32x32xf32>
-    %12 = vector.multi_reduction <add>, %11, %cst_0 {layout_operand_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, dims = [0, 1]>} [0, 1] : vector<32x32xf32> to f32
+    %11 = xegpu.load %10[%offset], %mask <{layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>}> : i64, vector<32x32xindex>, vector<32x32xi1> -> vector<32x32xf32>
+    %12 = vector.multi_reduction <add>, %11, %cst_0 [0, 1] : vector<32x32xf32> to f32
     // CHECK-NOT: xegpu.convert_layout
     %13 = xegpu.convert_layout %12 <{input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, dims = [0, 1]>, target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, dims = [0, 1]>}> : f32
     gpu.return
@@ -1080,28 +1064,25 @@ gpu.module @test_distribution {
   gpu.func @distribute_nested_slice(%src: memref<256x256xf32>) {
 
     %tdesc = xegpu.create_nd_tdesc %src : memref<256x256xf32>
-      -> !xegpu.tensor_desc<256x256xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32]>>
+      -> !xegpu.tensor_desc<256x256xf32>
 
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32]>}
-      : !xegpu.tensor_desc<256x256xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32]>>
+      : !xegpu.tensor_desc<256x256xf32>
       -> vector<256x256xf32>
 
     %load2 = xegpu.convert_layout %load <{input_layout = #xegpu.layout<sg_layout = [8, 8],  sg_data = [32, 32]>, target_layout = #xegpu.slice<#xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 1, 16, 32, 1, 1]>, dims=[2]>, dims=[4]>, dims=[1, 3]>}> : vector<256x256xf32>
 
-    %scast = vector.shape_cast %load2 {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 1, 16, 32, 1, 1]>, dims=[2]>, dims=[4]>, layout_operand_0 = #xegpu.slice<#xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 1, 16, 32, 1, 1]>, dims=[2]>, dims=[4]>, dims=[1, 3]>} : vector<256x256xf32> to vector<256x1x256x1xf32>
+    %scast = vector.shape_cast %load2 : vector<256x256xf32> to vector<256x1x256x1xf32>
 
-    %bcast = vector.broadcast %scast {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 1]>, dims=[2]>, dims=[4]>, layout_operand_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 1, 16, 32, 1, 1]>, dims=[2]>, dims=[4]>} : vector<256x1x256x1xf32> to vector<256x16x256x16xf32>
+    %bcast = vector.broadcast %scast : vector<256x1x256x1xf32> to vector<256x16x256x16xf32>
 
-    %scast1 = vector.shape_cast %bcast {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 1]>, dims=[2]>, layout_operand_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 1]>, dims=[2]>, dims=[4]>} : vector<256x16x256x16xf32> to vector<256x16x256x16x1xf32>
+    %scast1 = vector.shape_cast %bcast : vector<256x16x256x16xf32> to vector<256x16x256x16x1xf32>
 
-    %bcast1 = vector.broadcast %scast1 {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 16]>, dims=[2]>, layout_operand_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 1]>, dims=[2]>}  : vector<256x16x256x16x1xf32> to vector<256x16x256x16x16xf32>
+    %bcast1 = vector.broadcast %scast1 : vector<256x16x256x16x1xf32> to vector<256x16x256x16x16xf32>
 
-    %scast2 = vector.shape_cast %bcast1 {layout_result_0 =
-        #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 1, 32, 16, 16]>, layout_operand_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 1, 32, 16, 16]>, dims=[2]>} : vector<256x16x256x16x16xf32> to vector<256x16x1x256x16x16xf32>
+    %scast2 = vector.shape_cast %bcast1 : vector<256x16x256x16x16xf32> to vector<256x16x1x256x16x16xf32>
 
-    %bcast2 = vector.broadcast %scast2 {layout_result_0 =
-        #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 16]>, layout_operand_0 =
-        #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 1, 32, 16, 16]>} : vector<256x16x1x256x16x16xf32> to vector<256x16x16x256x16x16xf32>
+    %bcast2 = vector.broadcast %scast2 : vector<256x16x1x256x16x16xf32> to vector<256x16x16x256x16x16xf32>
     %anchor = xegpu.convert_layout %bcast2
       <{
         input_layout = #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 16]>,
@@ -1114,11 +1095,11 @@ gpu.module @test_distribution {
   // CHECK: arith.constant dense<1.000000e+00> : vector<16x128xf32>
   // CHECK: xegpu.store_nd %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] <{layout = #xegpu.layout<inst_data = [8, 16]>}>
   gpu.func @preserve_anchor_layout(%dst: memref<256x128xf32>) {
-    %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128]>} dense<1.0> : vector<256x128xf32>
+    %val = arith.constant dense<1.0> : vector<256x128xf32>
     %tdesc = xegpu.create_nd_tdesc %dst : memref<256x128xf32>
-      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128], inst_data = [8, 16]>>
+      -> !xegpu.tensor_desc<256x128xf32>
     xegpu.store_nd %val, %tdesc[0, 0] <{layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128], inst_data = [8, 16]>}>
-      : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128], inst_data = [8, 16]>>
+      : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32>
     gpu.return
   }
 
@@ -1130,7 +1111,7 @@ gpu.module @test_distribution {
       -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
     gpu.return
   }
-
+  
   gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) {
     // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
     // CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index
@@ -1142,10 +1123,10 @@ gpu.module @test_distribution {
     %block_id_y = gpu.block_id y
     %0 = arith.muli %block_id_x, %c128 : index
     %1 = arith.muli %block_id_y, %c128 : index
-    %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
-    %3 = xegpu.load_nd %2[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>} : !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
-    %4 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
-    %5 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
+    %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32>
+    %3 = xegpu.load_nd %2[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>} : !xegpu.tensor_desc<128x128xf32> -> vector<128x128xf32>
+    %4 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16>
+    %5 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16>
 
     // CHECK: %[[SCF:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]]
     // CHECK-SAME: iter_args(%[[ARG6:.*]] = {{.*}}) ->
@@ -1156,18 +1137,18 @@ gpu.module @test_distribution {
     // CHECK: scf.yield %[[C]] : vector<16x16xf32>
     %6 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg6 = %3)
         -> (vector<128x128xf32>) {
-      %8 = xegpu.load_nd %4[0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
-      %9 = xegpu.load_nd %5[%arg3, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+      %8 = xegpu.load_nd %4[0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>} : !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16>
+      %9 = xegpu.load_nd %5[%arg3, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>} : !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16>
       %10 = xegpu.dpas %8, %9, %arg6
         {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
          layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
          layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
         : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
       scf.yield %10 : vector<128x128xf32>
-    } {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+    }
     %7 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32>
-            -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
-    xegpu.store_nd %6, %7[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]> } : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+            -> !xegpu.tensor_desc<128x128xf32>
+    xegpu.store_nd %6, %7[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]> } : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32>
     gpu.return
   }
 
@@ -1176,9 +1157,9 @@ gpu.module @test_distribution {
     %c10_i32 = arith.constant 10 : i32
     %c0_i32 = arith.constant 0 : i32
     %c256 = arith.constant 256 : index
-    %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
-    %1 = xegpu.load_nd %0[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
-    %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
+    %1 = xegpu.load_nd %0[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
+    %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
 
     // CHECK: scf.while {{.*}} : (vector<16xf32>, i32) -> (vector<16xf32>, i32)
     %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) {
@@ -1188,9 +1169,9 @@ gpu.module @test_distribution {
     } do {
     // CHECK: (%[[ARG2:.*]]: vector<16xf32>, %[[ARG3:.*]]: i32)
     ^bb0(%arg2: vector<256xf32>, %arg3: i32):
-      xegpu.store_nd %arg2, %2[0]  : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+      xegpu.store_nd %arg2, %2[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : vector<256xf32>, !xegpu.tensor_desc<256xf32>
       %4 = arith.addi %arg3, %c1_i32 : i32
-      %6 = xegpu.load_nd %0[%c256] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+      %6 = xegpu.load_nd %0[%c256] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
       scf.yield %6, %4 : vector<256xf32>, i32
     }
     gpu.return
@@ -1200,8 +1181,8 @@ gpu.module @test_distribution {
     %c10 = arith.constant 10 : index
     %id = gpu.subgroup_id : index
 
-    %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
-    %1 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
+    %1 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
 
     %4 = arith.cmpi eq, %id, %c10 : index
     // CHECK-LABEL: scf.if
@@ -1209,19 +1190,19 @@ gpu.module @test_distribution {
     %5 = scf.if %4 -> (vector<256xf32>) {
       // CHECK-LABEL: xegpu.load_nd
       //  CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
-      %2 = xegpu.load_nd %0[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+      %2 = xegpu.load_nd %0[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
       // CHECK-LABEL: scf.yield
       //  CHECK-SAME: vector<16xf32>
       scf.yield %2 : vector<256xf32>
     } else {
       // CHECK-LABEL: xegpu.load_nd
       //  CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
-      %3 = xegpu.load_nd %1[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+      %3 = xegpu.load_nd %1[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
       // CHECK-LABEL: scf.yield
       //  CHECK-SAME: vector<16xf32>
       scf.yield %3 : vector<256xf32>
-    } {layout_result_0 = #xegpu.layout<sg_layout = [16], sg_data = [16]>}
-    xegpu.store_nd %5, %0[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    }
+    xegpu.store_nd %5, %0[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32>
     gpu.return
   }
 
@@ -1229,28 +1210,28 @@ gpu.module @test_distribution {
     %c10 = arith.constant 10 : index
     %id = gpu.subgroup_id : index
 
-    %t = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
-    %d = xegpu.load_nd %t[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+    %t = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
+    %d = xegpu.load_nd %t[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
 
     %0 = arith.cmpi eq, %id, %c10 : index
     // CHECK-LABEL: scf.if
     //  CHECK-SAME: (!xegpu.tensor_desc<16xf32>)
-    %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>) {
+    %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32>) {
       // CHECK-LABEL: xegpu.create_nd_tdesc
       //  CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
-      %2 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+      %2 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
       // CHECK-LABEL: scf.yield
       //  CHECK-SAME: !xegpu.tensor_desc<16xf32>
-      scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+      scf.yield %2 : !xegpu.tensor_desc<256xf32>
     } else {
       // CHECK-LABEL: xegpu.create_nd_tdesc
       //  CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
-      %3 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+      %3 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
       // CHECK-LABEL: scf.yield
       //  CHECK-SAME: !xegpu.tensor_desc<16xf32>
-      scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+      scf.yield %3 : !xegpu.tensor_desc<256xf32>
     }
-    xegpu.store_nd %d, %1[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    xegpu.store_nd %d, %1[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : vector<256xf32>, !xegpu.tensor_desc<256xf32>
     gpu.return
   }
 

>From fc237d44bc8608702ae3733880016d422afd670c Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 2 May 2026 04:07:11 +0000
Subject: [PATCH 07/11] polish

---
 mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp | 4 +++-
 mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir    | 7 +------
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir        | 5 -----
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir           | 7 +------
 4 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index b295c74884447..08a9f92448b1c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -134,7 +134,9 @@ static void propagateResultsToRegularOperands(Operation *op) {
       result.setType(typeWithLayout);
     }
   }
-  if (resLayout)
+  // Multi-reduction op may reduce to scalar which needs layout.
+  if (isa<VectorType>(resultType) && resLayout ||
+      isa<vector::MultiDimReductionOp>(op))
     xegpu::setTemporaryLayout(result, resLayout);
 
   for (OpOperand &opr : op->getOpOperands()) {
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
index ec553aa33f49b..c8a9530641951 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -244,12 +244,7 @@ gpu.module @xevm_module{
       : vector<8x16xf32>
     %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> ->
       !xegpu.tensor_desc<8x16xf32>
-    %anchor = xegpu.convert_layout %5
-      <{
-        input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-        target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-      }> : vector<8x16xf32>
-    xegpu.store_nd %anchor, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
+    xegpu.store_nd %5, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
       !xegpu.tensor_desc<8x16xf32>
     gpu.return
   }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index b1a6d81bc1140..b015943a54897 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -21,11 +21,6 @@ gpu.module @test_distribution {
     %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
       : !xegpu.tensor_desc<256x128xf32>
       -> vector<256x128xf32>
-    %anchor = xegpu.convert_layout %load
-      <{
-        input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
-        target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>
-      }> : vector<256x128xf32>
     gpu.return
   }
 
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index 0e79a8056418a..ff4e0db629083 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -43,11 +43,6 @@ gpu.module @test_distribution {
     %load =  xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
       : !xegpu.tensor_desc<256x128xf32>
       -> vector<256x128xf32>
-    %anchor = xegpu.convert_layout %load
-      <{
-        input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>,
-        target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>
-      }> : vector<256x128xf32>
     gpu.return
   }
 
@@ -1111,7 +1106,7 @@ gpu.module @test_distribution {
       -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
     gpu.return
   }
-  
+
   gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) {
     // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
     // CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index

>From a1a8d2f93ab5944de4a498cd9c39e3fdd7509ffe Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Mon, 4 May 2026 20:20:14 +0000
Subject: [PATCH 08/11] polish

---
 .../XeGPU/Transforms/XeGPULayoutImpl.h        |  2 +
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 77 +++++++++----------
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp |  4 -
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   |  9 +--
 4 files changed, 41 insertions(+), 51 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index cafd3f392ff72..bac8f413acd40 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -123,6 +123,8 @@ DistributeLayoutAttr inferInsertSourceLayout(DistributeLayoutAttr resLayout,
                                              ArrayRef<int64_t> resShape,
                                              ArrayRef<int64_t> srcShape);
 
+/// Infers the source layout attribute for an extract operation. Adds
+/// leading dimensions to the result layout to match the source shape size.
 DistributeLayoutAttr inferExtractSourceLayout(DistributeLayoutAttr resLayout,
                                               ArrayRef<int64_t> resShape,
                                               ArrayRef<int64_t> srcShape);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 08a9f92448b1c..b35224f032d42 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -69,6 +69,18 @@ xegpu::dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
   return out;
 }
 
+// Attaches `layout` to a TensorDesc-typed value by rebuilding its type with
+// the layout. No-op if `val` is not a TensorDesc or already carries a layout.
+static void setTensorDescLayout(Value val, xegpu::DistributeLayoutAttr layout) {
+  auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(val.getType());
+  if (!tensorDescTy || tensorDescTy.getLayoutAttr())
+    return;
+  auto typeWithLayout = xegpu::TensorDescType::get(
+      tensorDescTy.getContext(), tensorDescTy.getShape(),
+      tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
+  val.setType(typeWithLayout);
+}
+
 // the walkRegionBackward() is a recursive function
 // the input rootOp is the function operation, which is also a region op.
 // it recursively processes the region op in reverse topological order.
@@ -122,26 +134,23 @@ static void propagateResultsToRegularOperands(Operation *op) {
   xegpu::DistributeLayoutAttr resLayout = getLayoutFromUsePoints(result);
   Type resultType = result.getType();
 
-  // recover layout for tensor Descriptor type, which is a special case since
-  // its layout is not stored as an attribute but encoded in the type itself.
-  // For vector type, we attach the layout as an attribute to op.
-  if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
-    auto layout = tensorDescTy.getLayoutAttr();
-    if (!layout) {
-      auto typeWithLayout = xegpu::TensorDescType::get(
-          tensorDescTy.getContext(), tensorDescTy.getShape(),
-          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), resLayout);
-      result.setType(typeWithLayout);
-    }
-  }
-  // Multi-reduction op may reduce to scalar which needs layout.
-  if (isa<VectorType>(resultType) && resLayout ||
-      isa<vector::MultiDimReductionOp>(op))
+  if (!resLayout)
+    return;
+
+  // Recover layout for TensorDesc type results by updating the type to include
+  // the layout. Vector type results are handled separately below.
+  if (isa<xegpu::TensorDescType>(resultType))
+    setTensorDescLayout(result, resLayout);
+
+  // Recover layout for vector type results, or for multi-reduction ops which
+  // may reduce to a scalar that still needs a layout.
+  if (isa<VectorType>(resultType) || isa<vector::MultiDimReductionOp>(op))
     xegpu::setTemporaryLayout(result, resLayout);
 
   for (OpOperand &opr : op->getOpOperands()) {
     xegpu::DistributeLayoutAttr operandLayout =
         xegpu::inferSourceLayoutFromResult(opr, resLayout);
+    // Recover layout for vector operands
     if (isa<VectorType>(opr.get().getType()) && operandLayout)
       xegpu::setTemporaryLayout(opr, operandLayout);
   }
@@ -179,18 +188,11 @@ static void propagateRegionResultsToYieldOperands(
         // parent op's results.
         auto regionResult = regionBranchOp->getResult(i);
         layout = getLayoutFromUsePoints(regionResult);
-        if (layout)
+        if (layout) {
+          // set layout for the region op, like scf.for
           xegpu::setTemporaryLayout(regionResult, layout);
-        if (auto tensorDescTy =
-                dyn_cast<xegpu::TensorDescType>(regionResult.getType())) {
-          auto tDescLayout = tensorDescTy.getLayoutAttr();
-          if (!tDescLayout) {
-            auto typeWithLayout = xegpu::TensorDescType::get(
-                tensorDescTy.getContext(), tensorDescTy.getShape(),
-                tensorDescTy.getElementType(), tensorDescTy.getEncoding(),
-                layout);
-            regionResult.setType(typeWithLayout);
-          }
+          if (isa<xegpu::TensorDescType>(regionResult.getType()))
+            setTensorDescLayout(regionResult, layout);
         }
       } else {
         // For region successor, get layout from the target region's block
@@ -203,6 +205,7 @@ static void propagateRegionResultsToYieldOperands(
       auto operandType = succOps[i].getType();
       if (isa<VectorType>(operandType) ||
           dyn_cast<xegpu::TensorDescType>(operandType))
+        // recover layout for yield op operands
         xegpu::setTemporaryLayout(yieldOp->getOpOperand(beginIdx + i), layout);
     }
   }
@@ -228,17 +231,10 @@ static void propagateRegionArgsToInits(mlir::RegionBranchOpInterface regionOp) {
         continue;
 
       // Recover layout for tensor_desc block args by updating the type.
-      if (auto tensorDescTy =
-              dyn_cast<xegpu::TensorDescType>(regionArg.getType())) {
-        if (!tensorDescTy.getLayoutAttr()) {
-          auto typeWithLayout = xegpu::TensorDescType::get(
-              tensorDescTy.getContext(), tensorDescTy.getShape(),
-              tensorDescTy.getElementType(), tensorDescTy.getEncoding(),
-              layout);
-          regionArg.setType(typeWithLayout);
-        }
-      }
+      if (isa<xegpu::TensorDescType>(regionArg.getType()))
+        setTensorDescLayout(regionArg, layout);
 
+      // Recover layout for region op operands, like scf.for's init operands.
       // Find all predecessor values that flow into this block argument.
       SmallVector<Value> predValues;
       regionOp.getPredecessorValues(regionSuccessor, inputIdx, predValues);
@@ -302,11 +298,6 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
   rootOp->walk([&](gpu::GPUFuncOp func) {
     processFunc(func.getBody(), func.getName());
   });
-  // dump out the root op here for debug purpose
-
-  llvm::dbgs() << "After recovering temporary layout attributes for function: "
-               << rootOp->getName() << "\n";
-  rootOp->dump();
 
   return true;
 }
@@ -494,6 +485,7 @@ xegpu::DistributeLayoutAttr xegpu::inferInsertStridedSliceSourceLayout(
 /// Infers the source layout attribute for an insert operation
 /// given the result layout attribute, result shape, and source shape. Removes
 /// leading dimensions from the result layout to match the source shape size.
+// TODO: add propagation support for insert op
 xegpu::DistributeLayoutAttr
 xegpu::inferInsertSourceLayout(xegpu::DistributeLayoutAttr resLayout,
                                ArrayRef<int64_t> resShape,
@@ -521,6 +513,8 @@ xegpu::inferInsertSourceLayout(xegpu::DistributeLayoutAttr resLayout,
 /// Infers the source layout attribute for extract operation
 /// given the result layout attribute, result shape, and source shape. Adds
 /// leading dimensions to the source layout to match the source shape size.
+// TODO: add layout attribute interface: expandDims() and use it here.
+// TODO: add propagation support for extract op
 xegpu::DistributeLayoutAttr
 xegpu::inferExtractSourceLayout(xegpu::DistributeLayoutAttr resLayout,
                                 ArrayRef<int64_t> resShape,
@@ -567,7 +561,6 @@ xegpu::inferExtractSourceLayout(xegpu::DistributeLayoutAttr resLayout,
         resInstData.empty() ? nullptr : toAttr(instData),
         resLaneLayout.empty() ? nullptr : toAttr(laneLayout),
         resLaneData.empty() ? nullptr : toAttr(laneData), nullptr);
-    // TODO: add layout attribute interface: expandDims
     return srcLayout;
   }
   return resLayout;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 1a3bc28cec002..6613af2dfc164 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1403,9 +1403,6 @@ struct ResolveLayoutConflicts {
 } // namespace
 
 LogicalResult ResolveLayoutConflicts::run() {
-  // dump the IR before resolving layout conflicts for debugging purposes.
-  DBGS() << "IR before resolving layout conflicts:\n";
-  parentOp->dump();
   // Scan all operations in the parent op and resolve layout conflicts at
   // tensor descriptor and vector use points.
   auto r = parentOp->walk([&](Operation *op) -> WalkResult {
@@ -1448,7 +1445,6 @@ LogicalResult ResolveLayoutConflicts::run() {
     return WalkResult::advance();
   });
 
-  // dump the IR after resolving layout conflicts for debugging purposes.
   DBGS() << "IR after resolving layout conflicts:\n";
   parentOp->dump();
 
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 13288a377e69a..12eb553b3bddc 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -136,10 +136,6 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
   if (!value)
     return nullptr;
 
-  if (auto tdescTy =
-          dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
-    return tdescTy.getLayoutAttr();
-
   if (auto result = dyn_cast<OpResult>(value)) {
     Operation *defOp = result.getDefiningOp();
     assert(defOp && "result must have a defining op");
@@ -162,11 +158,14 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
     if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
       OpOperand *tiedInit = loop.getTiedLoopInit(arg);
       if (tiedInit)
-        // return getDistributeLayoutAttr(tiedInit->get());
         return getTemporaryLayout(*tiedInit);
     }
   }
 
+  if (auto tdescTy =
+          dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
+    return tdescTy.getLayoutAttr();
+
   return nullptr;
 }
 xegpu::DistributeLayoutAttr

>From a772e2d5c9fe487965dbba1f7b82b218c93c8f5d Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Mon, 4 May 2026 20:52:18 +0000
Subject: [PATCH 09/11] polish the legalization condition of wg distribution

---
 .../Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp   | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index af82effb9d379..8aa0758943cd1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1562,6 +1562,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
 
   target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp,
                                vector::TransposeOp, vector::BroadcastOp,
+                               vector::MultiDimReductionOp,
                                vector::ConstantMaskOp, vector::CreateMaskOp>(
       [=](Operation *op) -> bool {
         // Check for either a SliceAttr or LayoutAttr on the result.
@@ -1569,13 +1570,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
             xegpu::getTemporaryLayout(dyn_cast<OpResult>(op->getResult(0)));
         return isLegal(layout);
       });
-  target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
-      [=](Operation *op) -> bool {
-        // Check operand since the result maybe scalar not bearing layout..
-        auto layout =
-            xegpu::getTemporaryLayout(dyn_cast<vector::MultiDimReductionOp>(op)->getOpOperand(0));
-        return isLegal(layout);
-      });
+
   target.addDynamicallyLegalOp<xegpu::LoadGatherOp>(
       [=](xegpu::LoadGatherOp op) -> bool {
         auto layout = op.getLayoutAttr();

>From d5e6054bf4b02ef491a2be33d38e7410a49e65e8 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 5 May 2026 16:17:12 +0000
Subject: [PATCH 10/11] address feedback

---
 .../XeGPU/Transforms/XeGPULayoutImpl.h        |  6 ---
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 48 +++++++++----------
 2 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index bac8f413acd40..5c6fb1397864f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -39,12 +39,6 @@ LogicalResult propagateLayouts(OpBuilder &builder, Operation *target,
 
 LogicalResult resolveLayoutConflicts(Operation *target);
 
-/// [to-be-deprecated] Set the DistributeLayoutAttr for each OpOperand and
-/// OpResult of of the given operation. If the operation contains regions, it is
-/// also applied recursively to the contained operations operation.
-/// TODO: To be replaced by recoverTemporaryLayouts()
-void recoverTemporaryLayoutsDeprecated(Operation *op);
-
 /// Attach layout attributes to all vector-type operands of operations within
 /// the given operation's nested region. Reports an error if any vector operand
 /// lacks a layout attribute.
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index b4928fc8ed0f8..735d6d98e1251 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -526,29 +526,24 @@ xegpu::inferExtractSourceLayout(xegpu::DistributeLayoutAttr resLayout,
   auto context = resLayout.getContext();
   // construct the source layout by adding unit dimensions to the front of
   // result layout
-
-  SmallVector<int64_t> sgLayout(srcShapeSize, 1);
-  SmallVector<int64_t> sgData(srcShapeSize, 1);
-  SmallVector<int64_t> instData(srcShapeSize, 1);
-  SmallVector<int64_t> laneLayout(srcShapeSize, 1);
-  SmallVector<int64_t> laneData(srcShapeSize, 1);
-
   if (dimDiff > 0) {
-    auto resSgLayout = resLayout.getEffectiveSgLayoutAsInt();
-    auto resSgData = resLayout.getEffectiveSgDataAsInt();
-    auto resInstData = resLayout.getEffectiveInstDataAsInt();
-    auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
-    auto resLaneData = resLayout.getEffectiveLaneDataAsInt();
-
-    for (int i = 0; i < resShapeSize; i++) {
-      sgLayout[dimDiff + i] = (resSgLayout.size() == 0) ? 1 : resSgLayout[i];
-      sgData[dimDiff + i] = (resSgData.size() == 0) ? 1 : resSgData[i];
-      instData[dimDiff + i] = (resInstData.size() == 0) ? 1 : resInstData[i];
-      laneLayout[dimDiff + i] =
-          (resLaneLayout.size() == 0) ? 1 : resLaneLayout[i];
-      laneData[dimDiff + i] = (resLaneData.size() == 0) ? 1 : resLaneData[i];
+    auto sgLayout = resLayout.getEffectiveSgLayoutAsInt();
+    auto sgData = resLayout.getEffectiveSgDataAsInt();
+    auto instData = resLayout.getEffectiveInstDataAsInt();
+    auto laneLayout = resLayout.getEffectiveLaneLayoutAsInt();
+    auto laneData = resLayout.getEffectiveLaneDataAsInt();
+    auto order = resLayout.getEffectiveOrderAsInt();
+
+    for (int i = resShapeSize; i < dimDiff; i++) {
+      sgLayout.insert(sgLayout.begin(), 1);
+      sgData.insert(sgData.begin(), 1);
+      instData.insert(instData.begin(), 1);
+      laneLayout.insert(laneLayout.begin(), 1);
+      laneData.insert(laneData.begin(), 1);
+      order.insert(order.begin(), i);
     }
 
+    DenseI32ArrayAttr orderAttr = resLayout ? resLayout.getOrder() : nullptr;
     auto toAttr = [&](ArrayRef<int64_t> v) -> DenseI32ArrayAttr {
       if (v.empty())
         return DenseI32ArrayAttr();
@@ -556,11 +551,12 @@ xegpu::inferExtractSourceLayout(xegpu::DistributeLayoutAttr resLayout,
       return DenseI32ArrayAttr::get(context, v32);
     };
     auto srcLayout = xegpu::LayoutAttr::get(
-        context, resSgLayout.empty() ? nullptr : toAttr(sgLayout),
-        resSgData.empty() ? nullptr : toAttr(sgData),
-        resInstData.empty() ? nullptr : toAttr(instData),
-        resLaneLayout.empty() ? nullptr : toAttr(laneLayout),
-        resLaneData.empty() ? nullptr : toAttr(laneData), nullptr);
+        context, sgLayout.empty() ? nullptr : toAttr(sgLayout),
+        sgData.empty() ? nullptr : toAttr(sgData),
+        instData.empty() ? nullptr : toAttr(instData),
+        laneLayout.empty() ? nullptr : toAttr(laneLayout),
+        laneData.empty() ? nullptr : toAttr(laneData),
+        (orderAttr && !orderAttr.empty()) ? nullptr : toAttr(order));
     return srcLayout;
   }
   return resLayout;
@@ -1687,7 +1683,7 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
     VectorType valueToStoreTy =
         dyn_cast<VectorType>(insert.getValueToStore().getType());
 
-    if (idx == 0) {
+    if ((idx == 0) && valueToStoreTy) {
       return xegpu::inferInsertSourceLayout(resLayout, resVecTy.getShape(),
                                             valueToStoreTy.getShape());
     }

>From e67c4d736d31d20812751fe2e465dbeebb5c0ea8 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 5 May 2026 16:24:36 +0000
Subject: [PATCH 11/11] address more feedback

---
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 6613af2dfc164..308e5c98d444b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -404,18 +404,18 @@ class LayoutInfoPropagation
   visitOperation(Operation *op, ArrayRef<LayoutInfoLattice *> operands,
                  ArrayRef<const LayoutInfoLattice *> results) override;
 
-  void visitBranchOperand(OpOperand &operand) override {};
+  void visitBranchOperand(OpOperand &operand) override{};
 
-  void visitCallOperand(OpOperand &operand) override {};
+  void visitCallOperand(OpOperand &operand) override{};
 
   void
   visitNonControlFlowArguments(RegionSuccessor &successor,
-                               ArrayRef<BlockArgument> arguments) override {};
+                               ArrayRef<BlockArgument> arguments) override{};
 
-  void visitExternalCall(CallOpInterface call,
-                         ArrayRef<LayoutInfoLattice *> operands,
-                         ArrayRef<const LayoutInfoLattice *> results) override {
-  };
+  void
+  visitExternalCall(CallOpInterface call,
+                    ArrayRef<LayoutInfoLattice *> operands,
+                    ArrayRef<const LayoutInfoLattice *> results) override{};
 
   void setToExitState(LayoutInfoLattice *lattice) override {
     (void)lattice->meet(LayoutInfo());
@@ -1445,8 +1445,10 @@ LogicalResult ResolveLayoutConflicts::run() {
     return WalkResult::advance();
   });
 
-  DBGS() << "IR after resolving layout conflicts:\n";
-  parentOp->dump();
+  LLVM_DEBUG({
+    DBGS() << "IR after resolving layout conflicts:\n";
+    parentOp->dump();
+  });
 
   return r.wasInterrupted() ? failure() : success();
 }