[clang-tools-extra] [mlir] [MLIR][XeGPU] Refactor Layout access interface (PR #172125)
Jianhui Li via cfe-commits
cfe-commits at lists.llvm.org
Sat Dec 13 12:04:03 PST 2025
https://github.com/Jianhui-Li updated https://github.com/llvm/llvm-project/pull/172125
>From b3f2a4ab3d57ff906e03dd03b6365ba99d2169bf Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 22 Nov 2025 00:26:44 +0000
Subject: [PATCH 01/23] adding anchor layout for load/store/prefetch_nd and
dpas
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 29 ++++++++++++-------
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 2 ++
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 28 ++++++++++--------
.../Transforms/XeGPUSubgroupDistribute.cpp | 4 +--
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 4 +--
.../Transforms/XeGPUWgToSgDistribute.cpp | 20 +++++++++----
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 9 +++---
mlir/test/Dialect/XeGPU/invalid.mlir | 6 ++--
.../Dialect/XeGPU/subgroup-distribute.mlir | 12 ++++----
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 4 +--
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 6 ++--
11 files changed, 72 insertions(+), 52 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 4c67856b559b1..9ddc408a17f7f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -268,7 +268,8 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
OptionalAttr<DenseI64ArrayAttr>: $const_offsets,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let extraClassDeclaration = extraBaseClassDeclaration # [{
xegpu::TensorDescType getTensorDescType() {
@@ -360,7 +361,8 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
OptionalAttr<DenseI64ArrayAttr>: $transpose,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let results = (outs XeGPU_ValueType: $value);
@@ -454,7 +456,8 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
OptionalAttr<DenseI64ArrayAttr>: $const_offsets,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let extraClassDeclaration = extraBaseClassDeclaration # [{
VectorType getValueType() {
@@ -1046,7 +1049,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
"xegpu::CachePolicyAttr": $l1_hint,
"xegpu::CachePolicyAttr": $l2_hint,
"xegpu::CachePolicyAttr": $l3_hint,
- "xegpu::DistributeLayoutAttr": $layout)>
+ "xegpu::DistributeLayoutAttr": $anchor_layout)>
];
let hasVerifier = 1;
@@ -1133,7 +1136,11 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
let arguments = (ins
XeGPU_DpasOprType : $lhs,
XeGPU_DpasOprType : $rhs,
- Optional<XeGPU_DpasResType>: $acc);
+ Optional<XeGPU_DpasResType>: $acc,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout_a,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout_b,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout_cd
+ );
let results = (outs XeGPU_DpasResType: $result);
let extraClassDeclaration = [{
@@ -1319,7 +1326,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
Variadic<Index>: $offsets,
DenseI64ArrayAttr: $const_offsets,
OptionalAttr<UnitAttr>:$subgroup_block_io,
- OptionalAttr<DistributeLayoutAttr>:$layout
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout
);
let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$res);
let assemblyFormat = [{
@@ -1338,7 +1345,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
- `subgroup_block_io`: [optional] An attribute indicating that the operation can be
lowered to a subgroup block load. When this attribute is present,
the offsets are subgroup-uniform across all lanes.
- - `layout`: [optional] An attribute for guiding distributions among
+ - `anchor_layout`: [optional] An attribute for guiding distributions among
subgroups and/or work-items. It currently can accept either
LayoutAttr or SliceAttr.
Results:
@@ -1347,7 +1354,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
let builders = [
OpBuilder<(ins "Type":$res, "TypedValue<MemDescType>": $mem_desc,
- "llvm::ArrayRef<OpFoldResult>": $offsets, "DistributeLayoutAttr": $layout)>,
+ "llvm::ArrayRef<OpFoldResult>": $offsets, "DistributeLayoutAttr": $anchor_layout)>,
];
let extraClassDeclaration = [{
SmallVector<OpFoldResult> getMixedOffsets() {
@@ -1373,7 +1380,7 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
Variadic<Index>: $offsets,
DenseI64ArrayAttr: $const_offsets,
OptionalAttr<UnitAttr>:$subgroup_block_io,
- OptionalAttr<DistributeLayoutAttr>:$layout
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout
);
let assemblyFormat = [{ $data `,` $mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
prop-dict attr-dict `` `:` type(operands)}];
@@ -1389,13 +1396,13 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
- `subgroup_block_io`: [optional] An attribute indicating that the operation can be
lowered to a subgroup block store. When this attribute is present,
the offsets are subgroup-uniform across all lanes.
- - `layout`: [optional] An attribute for guiding distributions among
+ - `anchor_layout`: [optional] An attribute for guiding distributions among
subgroups and/or work-items. It currently can accept either
LayoutAttr or SliceAttr.
}];
let builders = [
OpBuilder<(ins "Value" : $data, "TypedValue<MemDescType>": $mem_desc,
- "llvm::ArrayRef<OpFoldResult>": $offsets, "DistributeLayoutAttr": $layout)>,
+ "llvm::ArrayRef<OpFoldResult>": $offsets, "DistributeLayoutAttr": $anchor_layout)>,
];
let extraClassDeclaration = [{
SmallVector<OpFoldResult> getMixedOffsets() {
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index fb5d1e758dbd1..b3d2c40712c96 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -22,6 +22,8 @@ using std::optional;
namespace mlir {
namespace xegpu {
+//#include "mlir/Dialect/XeGPU/IR/XeGPUOpInterface.cpp.inc"
+
void XeGPUDialect::initialize() {
addTypes<
#define GET_TYPEDEF_LIST
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 85c9a966f0fe8..3240c0f40ce58 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -465,7 +465,7 @@ void PrefetchNdOp::build(OpBuilder &builder, OperationState &state,
xegpu::CachePolicyAttr l3_hint) {
return build(builder, state, tensorDesc, ValueRange(), DenseI64ArrayAttr(),
- l1_hint, l2_hint, l3_hint);
+ l1_hint, l2_hint, l3_hint, /*anchor_layout=*/nullptr);
}
void PrefetchNdOp::build(OpBuilder &builder, OperationState &state,
@@ -480,7 +480,7 @@ void PrefetchNdOp::build(OpBuilder &builder, OperationState &state,
auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
build(builder, state, tensorDesc, dynamicOffsets, staticOffsetsAttr, l1_hint,
- l2_hint, l3_hint);
+ l2_hint, l3_hint, /*anchor_layout=*/nullptr);
}
LogicalResult PrefetchNdOp::verify() {
@@ -519,7 +519,7 @@ void LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType,
return build(builder, state, retType, tensorDesc, ValueRange(),
DenseI64ArrayAttr(), packed, transpose, l1_hint, l2_hint,
- l3_hint);
+ l3_hint, /*anchor_layout=*/nullptr);
}
void LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType,
@@ -535,7 +535,8 @@ void LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType,
auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
build(builder, state, retType, tensorDesc, dynamicOffsets, staticOffsetsAttr,
- packed, transpose, l1_hint, l2_hint, l3_hint);
+ packed, transpose, l1_hint, l2_hint, l3_hint,
+ /*anchor_layout=*/nullptr);
}
LogicalResult LoadNdOp::verify() {
@@ -638,7 +639,8 @@ void StoreNdOp::build(OpBuilder &builder, OperationState &state, Value value,
xegpu::CachePolicyAttr l3_hint) {
return build(builder, state, value, tensorDesc, ValueRange(),
- DenseI64ArrayAttr(), l1_hint, l2_hint, l3_hint);
+ DenseI64ArrayAttr(), l1_hint, l2_hint, l3_hint,
+ /*anchor_layout=*/nullptr);
}
void StoreNdOp::build(OpBuilder &builder, OperationState &state, Value value,
@@ -653,7 +655,7 @@ void StoreNdOp::build(OpBuilder &builder, OperationState &state, Value value,
auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
build(builder, state, value, tensorDesc, dynamicOffsets, staticOffsetsAttr,
- l1_hint, l2_hint, l3_hint);
+ l1_hint, l2_hint, l3_hint, /*anchor_layout=*/nullptr);
}
LogicalResult StoreNdOp::verify() {
@@ -876,7 +878,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
xegpu::CachePolicyAttr l2_hint,
xegpu::CachePolicyAttr l3_hint) {
build(builder, state, valueType, source, Value(), mask, IntegerAttr(),
- l1_hint, l2_hint, l3_hint, /*layout=*/nullptr);
+ l1_hint, l2_hint, l3_hint, /*anchor_layout=*/nullptr);
}
void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
@@ -892,7 +894,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
auto offset = vector::FromElementsOp::create(builder, loc, type, values);
build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint,
- l2_hint, l3_hint, /*layout=*/nullptr);
+ l2_hint, l3_hint, /*anchor_layout=*/nullptr);
}
void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
@@ -960,7 +962,7 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
xegpu::CachePolicyAttr l2_hint,
xegpu::CachePolicyAttr l3_hint) {
build(builder, state, value, dest, Value(), mask, IntegerAttr(), l1_hint,
- l2_hint, l3_hint, /*layout=*/nullptr);
+ l2_hint, l3_hint, /*anchor_layout=*/nullptr);
}
void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
@@ -978,7 +980,7 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
// Call the correct builder overload that does not expect result types.
build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint,
- l3_hint, /*layout=*/nullptr);
+ l3_hint, /*anchor_layout=*/nullptr);
}
void StoreScatterOp::build(
@@ -1155,7 +1157,8 @@ LogicalResult LoadMatrixOp::verify() {
MemDescType mdescTy = getMemDesc().getType();
return IsValidMatrixOpParams(resTy, mdescTy, subgroup_block_io,
- getLayoutAttr(), [&]() { return emitError(); });
+ getAnchorLayoutAttr(),
+ [&]() { return emitError(); });
}
//===----------------------------------------------------------------------===//
@@ -1179,7 +1182,8 @@ LogicalResult StoreMatrixOp::verify() {
UnitAttr subgroup_block_io = getSubgroupBlockIoAttr();
MemDescType mdescTy = getMemDesc().getType();
return IsValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io,
- getLayoutAttr(), [&]() { return emitError(); });
+ getAnchorLayoutAttr(),
+ [&]() { return emitError(); });
}
namespace mlir {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 4455811a2e681..ac65babfcb4cb 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -965,7 +965,7 @@ struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern {
SmallVector<Value> offsetsAsValues =
vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
- auto layout = matrixOp.getLayoutAttr();
+ auto layout = matrixOp.getAnchorLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
matrixOp, "the matrix operation lacks layout attribute");
@@ -1041,7 +1041,7 @@ struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
SmallVector<Value> offsetsAsValues =
vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
- auto layout = matrixOp.getLayoutAttr();
+ auto layout = matrixOp.getAnchorLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
matrixOp, "the matrix operation lacks layout attribute");
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 330553564f81a..b0b748c3409c3 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -954,7 +954,7 @@ struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
Type elemTy = valueTy.getElementType();
ArrayRef<int64_t> shape = valueTy.getShape();
- auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
+ auto layout = dyn_cast<xegpu::LayoutAttr>(op.getAnchorLayoutAttr());
VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy);
@@ -993,7 +993,7 @@ struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
VectorType valueTy = llvm::dyn_cast<VectorType>(op.getData().getType());
assert(valueTy && "the value type must be vector type!");
ArrayRef<int64_t> shape = valueTy.getShape();
- auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
+ auto layout = dyn_cast<xegpu::LayoutAttr>(op.getAnchorLayoutAttr());
SmallVector<Type> convertedValTypes =
getUnrolledTypes(valueTy, *targetShape);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 33d4b0457e5d3..2562c46adfa8d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -86,8 +86,16 @@ genOffsetsList(ConversionPatternRewriter &rewriter, OpType op,
if (origOffsets.empty())
return failure();
+ // if op is xegpu::CreateNdDescOp, call op.getLayoutAttr()
+ xegpu::DistributeLayoutAttr layout;
+ if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp> ||
+ std::is_same_v<OpType, xegpu::StoreMatrixOp>) {
+ layout = op.getAnchorLayoutAttr();
+ } else {
+ layout = op.getLayoutAttr();
+ }
+
// not applicable to ops without workgroup layout attributes
- xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -190,7 +198,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
xegpu::TensorDescType tdescTy = op.getType();
ArrayRef<int64_t> wgShape = tdescTy.getShape();
Type elemTy = tdescTy.getElementType();
- xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
+ xegpu::DistributeLayoutAttr layout = tdescTy.getLayoutAttr();
SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
auto newTdescTy =
xegpu::TensorDescType::get(ctx, sgShape, elemTy, tdescTy.getEncoding(),
@@ -999,7 +1007,7 @@ struct WgToSgLoadMatrixOp : public OpConversionPattern<xegpu::LoadMatrixOp> {
assert(valueTy && "the value type must be vector type!");
Type elemTy = valueTy.getElementType();
- xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
+ xegpu::DistributeLayoutAttr layout = op.getAnchorLayoutAttr();
SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
VectorType newResTy = VectorType::get(sgShape, elemTy);
SmallVector<Value> newOps;
@@ -1025,7 +1033,7 @@ struct WgToSgStoreMatrixOp : public OpConversionPattern<xegpu::StoreMatrixOp> {
if (failed(genOffsetsList(rewriter, op, offsetsList)))
return failure();
- xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
+ xegpu::DistributeLayoutAttr layout = op.getAnchorLayoutAttr();
for (auto [v, offsets] : llvm::zip(adaptor.getData(), offsetsList))
xegpu::StoreMatrixOp::create(rewriter, op.getLoc(), v, op.getMemDesc(),
offsets, layout.dropSgLayoutAndData());
@@ -1409,12 +1417,12 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
target.addDynamicallyLegalOp<xegpu::LoadMatrixOp>(
[=](xegpu::LoadMatrixOp op) -> bool {
- return isLegal(op.getLayoutAttr());
+ return isLegal(op.getAnchorLayoutAttr());
});
target.addDynamicallyLegalOp<xegpu::StoreMatrixOp>(
[=](xegpu::StoreMatrixOp op) -> bool {
- return isLegal(op.getLayoutAttr());
+ return isLegal(op.getAnchorLayoutAttr());
});
target.addDynamicallyLegalOp<arith::ConstantOp>(
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index b0905c4e9203b..4fe35a16b3994 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -135,12 +135,11 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
// for LoadMatrixOp, the layout is attached to the property of the op
if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
- return loadOp.getLayoutAttr();
+ return loadOp.getAnchorLayoutAttr();
// for StoreMatrixOp, the layout is attached to the property of the op
if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
- return storeOp.getLayoutAttr();
-
+ return storeOp.getAnchorLayoutAttr();
std::string layoutName = getLayoutName(result);
if (defOp->hasAttr(layoutName))
return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
@@ -168,10 +167,10 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
- return loadOp.getLayoutAttr();
+ return loadOp.getAnchorLayoutAttr();
if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
- return storeOp.getLayoutAttr();
+ return storeOp.getAnchorLayoutAttr();
std::string layoutName = xegpu::getLayoutName(opr);
if (op->hasAttr(layoutName))
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 92f353717ac59..62ac880030cda 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -894,7 +894,7 @@ func.func @store_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>, %arg1: ve
// -----
func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>>, %arg1: vector<2x16xf32>) {
// expected-error at +1 {{With subgroup_block_io, accessed data must be contiguous and coalesced}}
- xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+ xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
vector<2x16xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>>
return
}
@@ -902,7 +902,7 @@ func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32,
// -----
func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>>, %arg1: vector<16x2xf32>) {
// expected-error at +1 {{With subgroup_block_io, the distributed dimensions must be contiguous}}
- xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} :
+ xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} :
vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>>
return
}
@@ -910,7 +910,7 @@ func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf3
// -----
func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>>, %arg1: vector<16x2xf32>) {
// expected-error at +1 {{With subgroup_block_io, the block shape must match the lane layout}}
- xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+ xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>>
return
}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 8fd3cca5594cb..a7ce2c05b9d44 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -281,8 +281,8 @@ gpu.module @xevm_module{
gpu.module @xevm_module{
gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) {
%c0 = arith.constant 0 : index
- %1 = xegpu.load_matrix %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32>
- xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+ %1 = xegpu.load_matrix %arg0[%c0, %c0] <{anchor_layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32>
+ xegpu.store_matrix %1, %arg0[%c0, %c0] <{anchor_layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index
gpu.return
}
}
@@ -307,8 +307,8 @@ gpu.module @xevm_module{
gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
- %1 = xegpu.load_matrix %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32>
- xegpu.store_matrix %1, %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+ %1 = xegpu.load_matrix %arg0[%c0, %c1] <{anchor_layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32>
+ xegpu.store_matrix %1, %arg0[%c0, %c1] <{anchor_layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index
gpu.return
}
}
@@ -323,9 +323,9 @@ gpu.module @xevm_module{
gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
- %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
+ %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
!xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index -> vector<16x2xf32>
- xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
+ xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index d61908b422194..456d8e8a03cfc 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -569,7 +569,7 @@ gpu.module @test_kernel {
%0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32>
//CHECK-COUNT-8: xegpu.load_matrix {{.*}} : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x16xf32>
//CHECK-COUNT-8: vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32>
- %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32>
+ %1 = xegpu.load_matrix %0[0, 0] <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32>
gpu.return %1: vector<32x32xf32>
}
}
@@ -580,7 +580,7 @@ gpu.module @test_kernel {
gpu.func @unroll_store_matrix(%value: vector<32x32xf32>, %arg0 : memref<32768xi8, 3>) {
%mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
// CHECK-COUNT-8: xegpu.store_matrix {{.*}} : vector<8x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index
- xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout<inst_data = [8, 16]>} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>
+ xegpu.store_matrix %value, %mdesc[0, 0] {anchor_layout = #xegpu.layout<inst_data = [8, 16]>} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>
gpu.return
}
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 5dde84e8e0bc2..3760737cf51f5 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -333,9 +333,9 @@ gpu.module @test_distribution {
//CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]]
//CHECK: [[c128:%.+]] = arith.constant 128 : index
//CHECK: [[off_x:%.+]] = index.remu [[l_off_x]], [[c128]]
- //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32>
+ //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{anchor_layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32>
%0 = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
- %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32], lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32>
+ %1 = xegpu.load_matrix %0[0, 0] <{anchor_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32], lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32>
gpu.return
}
@@ -361,7 +361,7 @@ gpu.module @test_distribution {
//CHECK: xegpu.store_matrix [[cst]], [[mdesc]][[[off_y]], [[off_x]]] : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>, index, index
%cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} dense<1.0> : vector<64x128xf32>
%mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
- xegpu.store_matrix %cst, %mdesc[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32>
+ xegpu.store_matrix %cst, %mdesc[0, 0] {anchor_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32>
gpu.return
}
>From bfae01fa3f6453ee1d0f67e98c3d6c2b1fcee8f2 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 22 Nov 2025 07:46:04 +0000
Subject: [PATCH 02/23] propagation honors pre-defined layout at anchor op
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 6 +-
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 448 +++++++++++-------
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 4 +-
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 8 +-
.../XeGPU/propagate-layout-inst-data.mlir | 16 +-
mlir/test/Dialect/XeGPU/propagate-layout.mlir | 79 +--
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 4 +-
7 files changed, 328 insertions(+), 237 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 9ddc408a17f7f..70c61a445e8ae 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -847,7 +847,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
- OptionalAttr<DistributeLayoutAttr>:$layout);
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value);
let extraClassDeclaration = extraBaseClassDeclaration # [{
@@ -906,7 +906,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
"xegpu::CachePolicyAttr": $l1_hint,
"xegpu::CachePolicyAttr": $l2_hint,
"xegpu::CachePolicyAttr": $l3_hint,
- "xegpu::DistributeLayoutAttr": $layout)>
+ "xegpu::DistributeLayoutAttr": $anchor_layout)>
];
let hasVerifier = 1;
@@ -991,7 +991,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
- OptionalAttr<DistributeLayoutAttr>:$layout);
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let extraClassDeclaration = extraBaseClassDeclaration#[{
Type getDestType() {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index b3a780abd3f12..6d45a51ab0267 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -387,6 +387,8 @@ class LayoutInfoPropagation
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
+ bool hasAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout);
+
public:
LayoutInfoPropagation(DataFlowSolver &solver,
SymbolTableCollection &symbolTable,
@@ -475,49 +477,72 @@ LogicalResult LayoutInfoPropagation::visitOperation(
return success();
}
+bool LayoutInfoPropagation::hasAnchorLayout(
+ xegpu::DistributeLayoutAttr anchorLayout) {
+ if (anchorLayout == nullptr) {
+ return false;
+ }
+ if (layoutKind == LayoutKind::InstData) {
+ return !(anchorLayout.getEffectiveInstDataAsInt().empty());
+ } else if (layoutKind == LayoutKind::Lane) {
+ return !(anchorLayout.getEffectiveLaneLayoutAsInt().empty() ||
+ anchorLayout.getEffectiveLaneDataAsInt().empty());
+ }
+ return false;
+}
+
void LayoutInfoPropagation::visitPrefetchNdOp(
xegpu::PrefetchNdOp prefetch, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- // Here we assign the default layout to the tensor descriptor operand of
- // prefetch.
- auto tdescTy = prefetch.getTensorDescType();
-
- auto uArch = getUArch(getChipStr(prefetch).value_or(""));
- const auto *uArchInstruction =
- dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>(
- uArch->getInstruction(
- xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch));
-
- auto blockWHC =
- uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType());
- if (!blockWHC)
- prefetch.emitWarning("No known block params found for the element type.");
- auto [bWidth, bHeight, bCount] = blockWHC.value();
- SmallVector<int> instData;
- int instWidth = xegpu::getLargestDivisor(
- static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth,
- bCount);
- if (instWidth == -1)
- prefetch.emitWarning(
- "No suitable instruction multiple found for the given shape.");
- if (tdescTy.getRank() == 1)
- instData = {instWidth};
- else {
- int instHeight = xegpu::getLargestDivisor(
- static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight);
- if (instHeight == -1)
+
+ LayoutInfo prefetchLayout;
+ xegpu::DistributeLayoutAttr anchorLayout = prefetch.getAnchorLayoutAttr();
+ if (hasAnchorLayout(anchorLayout)) {
+ prefetchLayout = LayoutInfo(anchorLayout);
+ } else {
+ // Here we assign the default layout to the tensor descriptor operand of
+ // prefetch.
+ auto tdescTy = prefetch.getTensorDescType();
+
+ auto uArch = getUArch(getChipStr(prefetch).value_or(""));
+ const auto *uArchInstruction =
+ dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>(
+ uArch->getInstruction(
+ xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch));
+
+ auto blockWHC =
+ uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType());
+ if (!blockWHC)
+ prefetch.emitWarning("No known block params found for the element type.");
+ auto [bWidth, bHeight, bCount] = blockWHC.value();
+ SmallVector<int> instData;
+ int instWidth = xegpu::getLargestDivisor(
+ static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth,
+ bCount);
+ if (instWidth == -1)
prefetch.emitWarning(
"No suitable instruction multiple found for the given shape.");
- instData = {instHeight, instWidth};
- }
- LayoutInfo prefetchLayout;
- if (layoutKind == LayoutKind::InstData)
- prefetchLayout =
- LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), instData));
- else
- prefetchLayout = getDefaultSIMTLayoutInfo(
- tdescTy, uArch, uArchInstruction->getPackedFormatBitSize());
+ if (tdescTy.getRank() == 1)
+ instData = {instWidth};
+ else {
+ int instHeight = xegpu::getLargestDivisor(
+ static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight);
+ if (instHeight == -1)
+ prefetch.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ instData = {instHeight, instWidth};
+ }
+
+ if (layoutKind == LayoutKind::InstData)
+ prefetchLayout =
+ LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), instData));
+ else
+ prefetchLayout = getDefaultSIMTLayoutInfo(
+ tdescTy, uArch, uArchInstruction->getPackedFormatBitSize());
+ prefetch.setAnchorLayoutAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(prefetchLayout.get()));
+ }
// Propagate the layout to the source tensor descriptor.
propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
}
@@ -617,69 +642,96 @@ void LayoutInfoPropagation::visitUpdateNdOffsetOp(
void LayoutInfoPropagation::visitDpasOp(
xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- VectorType aTy = dpas.getLhsType();
- VectorType bTy = dpas.getRhsType();
-
- auto uArch = getUArch(getChipStr(dpas).value_or(""));
- const int subgroupSize = uArch->getSubgroupSize();
- const auto *uArchInstruction =
- dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
- xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc));
-
- const unsigned dataALen = aTy.getShape().front();
- auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType());
- const int maxALen =
- xegpu::getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen));
- if (maxALen == -1)
- dpas.emitWarning(
- "No suitable instruction multiple found for the given shape.");
-
- const unsigned dataBLen = bTy.getShape().back();
- auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType());
- const int maxBLen =
- xegpu::getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen));
- if (maxBLen == -1)
- dpas.emitWarning(
- "No suitable instruction multiple found for the given shape.");
- SmallVector<int> instDataA = {maxALen, subgroupSize};
- SmallVector<int> instDataB = {subgroupSize, maxBLen};
LayoutInfo dpasALayout;
LayoutInfo dpasBLayout;
LayoutInfo dpasCLayout;
- if (layoutKind == LayoutKind::InstData) {
- dpasALayout =
- LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataA));
- dpasBLayout =
- LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataB));
+ xegpu::DistributeLayoutAttr anchorLayoutC = dpas.getAnchorLayoutCdAttr();
+ if (hasAnchorLayout(anchorLayoutC)) {
+ xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getAnchorLayoutAAttr();
+ xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getAnchorLayoutBAttr();
+ assert(hasAnchorLayout(anchorLayoutA) &&
+ "Expected anchor layout for DPAS A operand.");
+ assert(hasAnchorLayout(anchorLayoutB) &&
+ "Expected anchor layout for DPAS B operand.");
+ dpasALayout = LayoutInfo(anchorLayoutA);
+ dpasBLayout = LayoutInfo(anchorLayoutB);
+ dpasCLayout = LayoutInfo(anchorLayoutC);
+
} else {
- dpasALayout = getSIMTLayoutInfoForDPASOperand(
- aTy, 0, uArch, uArchInstruction->getPackedFormatBitSizeA());
- dpasBLayout = getSIMTLayoutInfoForDPASOperand(
- bTy, 1, uArch, uArchInstruction->getPackedFormatBitSizeB());
- }
- propagateIfChanged(operands[0], operands[0]->meet(dpasALayout));
- propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout));
- if (operands.size() > 2) {
- VectorType cTy = dpas.getAccType();
- const unsigned dataCLen = bTy.getShape().back();
- auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType());
- const int maxCLen =
- xegpu::getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen));
- if (maxCLen == -1)
+ VectorType aTy = dpas.getLhsType();
+ VectorType bTy = dpas.getRhsType();
+
+ auto uArch = getUArch(getChipStr(dpas).value_or(""));
+ const int subgroupSize = uArch->getSubgroupSize();
+ const auto *uArchInstruction =
+ dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
+ xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc));
+
+ const unsigned dataALen = aTy.getShape().front();
+ auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType());
+ const int maxALen =
+ xegpu::getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen));
+ if (maxALen == -1)
dpas.emitWarning(
"No suitable instruction multiple found for the given shape.");
- SmallVector<int> instDataC = {maxALen, maxCLen};
- if (layoutKind == LayoutKind::InstData)
- dpasCLayout =
- LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataC));
- else
- dpasCLayout = getSIMTLayoutInfoForDPASOperand(
- cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB());
+ const unsigned dataBLen = bTy.getShape().back();
+ auto supportedBLen = uArchInstruction->getSupportedN(bTy.getElementType());
+
+ const int maxBLen =
+ xegpu::getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen));
+
+ if (maxBLen == -1)
+ dpas.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ SmallVector<int> instDataA = {maxALen, subgroupSize};
+ SmallVector<int> instDataB = {subgroupSize, maxBLen};
+
+ if (layoutKind == LayoutKind::InstData) {
+ dpasALayout =
+ LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataA));
+ dpasBLayout =
+ LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataB));
+ } else {
+ dpasALayout = getSIMTLayoutInfoForDPASOperand(
+ aTy, 0, uArch, uArchInstruction->getPackedFormatBitSizeA());
+ dpasBLayout = getSIMTLayoutInfoForDPASOperand(
+ bTy, 1, uArch, uArchInstruction->getPackedFormatBitSizeB());
+ }
+ if (operands.size() > 2) {
+ VectorType cTy = dpas.getAccType();
+ if (layoutKind == LayoutKind::InstData) {
+ const unsigned dataCLen = bTy.getShape().back();
+ auto supportedCLen =
+ uArchInstruction->getSupportedN(bTy.getElementType());
+ const int maxCLen = xegpu::getLargestDivisor(
+ dataCLen, ArrayRef<unsigned>(supportedCLen));
+ if (maxCLen == -1)
+ dpas.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ SmallVector<int> instDataC = {maxALen, maxCLen};
+ dpasCLayout =
+ LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataC));
+ } else
+ dpasCLayout = getSIMTLayoutInfoForDPASOperand(
+ cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB());
+
+ dpas.setAnchorLayoutCdAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(dpasCLayout.get()));
+ }
+ dpas.setAnchorLayoutAAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(dpasALayout.get()));
+ dpas.setAnchorLayoutBAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(dpasBLayout.get()));
+ }
+
+ propagateIfChanged(operands[0], operands[0]->meet(dpasALayout));
+ propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout));
+ if (operands.size() > 2) {
propagateIfChanged(operands[2], operands[2]->meet(dpasCLayout));
}
}
@@ -689,43 +741,51 @@ void LayoutInfoPropagation::visitStoreNdOp(
xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- auto uArch = getUArch(getChipStr(store).value_or(""));
- const auto *uArchInstruction =
- dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>(
- uArch->getInstruction(
- xegpu::uArch::InstructionKind::Subgroup2DBlockStore));
- VectorType dataTy = store.getValueType();
- auto blockWHC = uArchInstruction->getBlockWidthHeightCount(
- store.getValueType().getElementType());
- if (!blockWHC)
- store.emitWarning("No known block params found for the element type.");
- auto [bWidth, bHeight, bCount] = blockWHC.value();
- SmallVector<int> instData;
- int instWidth = xegpu::getLargestDivisor(
- static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth,
- bCount);
- if (instWidth == -1)
- store.emitWarning(
- "No suitable instruction multiple found for the given shape.");
- if (dataTy.getRank() == 1)
- instData = {instWidth};
- else {
- int instHeight = xegpu::getLargestDivisor(
- static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight);
- if (instHeight == -1)
+ LayoutInfo storeLayout;
+ xegpu::DistributeLayoutAttr anchorLayout = store.getAnchorLayoutAttr();
+ if (hasAnchorLayout(anchorLayout)) {
+ storeLayout = LayoutInfo(anchorLayout);
+ } else {
+ auto uArch = getUArch(getChipStr(store).value_or(""));
+ const auto *uArchInstruction =
+ dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>(
+ uArch->getInstruction(
+ xegpu::uArch::InstructionKind::Subgroup2DBlockStore));
+ VectorType dataTy = store.getValueType();
+ auto blockWHC = uArchInstruction->getBlockWidthHeightCount(
+ store.getValueType().getElementType());
+ if (!blockWHC)
+ store.emitWarning("No known block params found for the element type.");
+ auto [bWidth, bHeight, bCount] = blockWHC.value();
+ SmallVector<int> instData;
+ int instWidth = xegpu::getLargestDivisor(
+ static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth,
+ bCount);
+ if (instWidth == -1)
store.emitWarning(
"No suitable instruction multiple found for the given shape.");
- instData = {instHeight, instWidth};
- }
+ if (dataTy.getRank() == 1)
+ instData = {instWidth};
+ else {
+ int instHeight = xegpu::getLargestDivisor(
+ static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight);
+ if (instHeight == -1)
+ store.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ instData = {instHeight, instWidth};
+ }
- LayoutInfo storeLayout;
- if (layoutKind == LayoutKind::InstData)
- storeLayout =
- LayoutInfo(xegpu::LayoutAttr::get(dataTy.getContext(), instData));
- else
- storeLayout =
- getDefaultSIMTLayoutInfo(store.getValueType(), uArch,
- uArchInstruction->getPackedFormatBitSize());
+ if (layoutKind == LayoutKind::InstData)
+ storeLayout =
+ LayoutInfo(xegpu::LayoutAttr::get(dataTy.getContext(), instData));
+ else
+ storeLayout =
+ getDefaultSIMTLayoutInfo(store.getValueType(), uArch,
+ uArchInstruction->getPackedFormatBitSize());
+ store.setAnchorLayoutAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(storeLayout.get()));
+ }
+ // Propagate the layout to the value operand.
// Both operands should have the same layout
for (LayoutInfoLattice *operand : operands)
propagateIfChanged(operand, operand->meet(storeLayout));
@@ -736,21 +796,31 @@ void LayoutInfoPropagation::visitStoreNdOp(
void LayoutInfoPropagation::visitLoadNdOp(
xegpu::LoadNdOp load, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- LayoutInfo valueLayout = results[0]->getValue();
- // Need the layout of the value to propagate to the tensor descriptor.
- if (!valueLayout.isAssigned())
- return;
- LayoutInfo tensorDescLayout = valueLayout;
- // LoadNdOp has the transpose effect. However, at the stage of this analysis
- // this effect is not expected and should be abstracted away. Emit a
- // warning.
- if (auto transpose = load.getTranspose()) {
- load.emitWarning("Transpose effect is not expected for LoadNdOp at "
- "LayoutInfoPropagation stage.");
- tensorDescLayout = valueLayout.transpose(transpose.value());
+
+ LayoutInfo loadLayout;
+ xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr();
+ if (hasAnchorLayout(anchorLayout)) {
+ loadLayout = LayoutInfo(anchorLayout);
+ } else {
+
+ LayoutInfo valueLayout = results[0]->getValue();
+ // Need the layout of the value to propagate to the tensor descriptor.
+ if (!valueLayout.isAssigned())
+ return;
+ loadLayout = valueLayout;
+ // LoadNdOp has the transpose effect. However, at the stage of this analysis
+ // this effect is not expected and should be abstracted away. Emit a
+ // warning.
+ if (auto transpose = load.getTranspose()) {
+ load.emitWarning("Transpose effect is not expected for LoadNdOp at "
+ "LayoutInfoPropagation stage.");
+ loadLayout = valueLayout.transpose(transpose.value());
+ }
+ load.setAnchorLayoutAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(loadLayout.get()));
}
// Propagate the new layout to the tensor descriptor operand.
- propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
+ propagateIfChanged(operands[0], operands[0]->meet(loadLayout));
}
/// For vector::TransposeOp, the layout of the result is transposed and
@@ -840,37 +910,49 @@ void LayoutInfoPropagation::visitVectorBitcastOp(
void LayoutInfoPropagation::visitLoadGatherOp(
xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- // The layout is strictly determined by the payload type.
- auto payloadTy = dyn_cast<VectorType>(load.getValueType());
- if (!payloadTy) {
- load.emitWarning("Not propagating, non-vector payload supplied.");
- return;
- }
- auto uArch = getUArch(getChipStr(load).value_or(""));
- const int subgroupSize = uArch->getSubgroupSize();
- SmallVector<int> instData{subgroupSize};
- if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1)
- instData.push_back(chunkSize);
- else if (auto srcTdescTy =
- dyn_cast<xegpu::TensorDescType>(load.getSourceType())) {
- if (srcTdescTy.getChunkSizeAsInt() > 1)
+
+ LayoutInfo loadLayout;
+ LayoutInfo maskLayout;
+ xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr();
+ if (hasAnchorLayout(anchorLayout)) {
+ loadLayout = LayoutInfo(anchorLayout);
+ maskLayout = loadLayout;
+ } else {
+
+ // The layout is strictly determined by the payload type.
+ auto payloadTy = dyn_cast<VectorType>(load.getValueType());
+ if (!payloadTy) {
+ load.emitWarning("Not propagating, non-vector payload supplied.");
+ return;
+ }
+ auto uArch = getUArch(getChipStr(load).value_or(""));
+ const int subgroupSize = uArch->getSubgroupSize();
+ SmallVector<int> instData{subgroupSize};
+ if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1)
instData.push_back(chunkSize);
- }
- LayoutInfo layout;
- if (layoutKind == LayoutKind::InstData)
- layout = LayoutInfo(xegpu::LayoutAttr::get(load.getContext(), instData));
- else
- layout = getDefaultSIMTLayoutInfo(payloadTy, uArch,
- uArch->getGeneralPackedFormatBitSize(),
- /*scattered*/ true);
-
- // Mask operand should have 1D default layout.
- LayoutInfo maskLayout =
- getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize);
+ else if (auto srcTdescTy =
+ dyn_cast<xegpu::TensorDescType>(load.getSourceType())) {
+ if (srcTdescTy.getChunkSizeAsInt() > 1)
+        instData.push_back(srcTdescTy.getChunkSizeAsInt());
+ }
+
+ if (layoutKind == LayoutKind::InstData)
+ loadLayout =
+ LayoutInfo(xegpu::LayoutAttr::get(load.getContext(), instData));
+ else
+ loadLayout = getDefaultSIMTLayoutInfo(
+ payloadTy, uArch, uArch->getGeneralPackedFormatBitSize(),
+        /*scattered=*/true);
+
+ // Mask operand should have 1D default layout.
+ maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize);
+ load.setAnchorLayoutAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(loadLayout.get()));
+ }
// Propagate the new layout to the tensor descriptor operand.
if (isa<xegpu::TensorDescType>(load.getSourceType()))
- propagateIfChanged(operands[0], operands[0]->meet(layout));
+ propagateIfChanged(operands[0], operands[0]->meet(loadLayout));
// Propagate the new layout to the mask and optional offset operand.
propagateIfChanged(operands[1], operands[1]->meet(maskLayout));
if (load.getOffsets())
@@ -898,21 +980,26 @@ void LayoutInfoPropagation::visitCreateDescOp(
void LayoutInfoPropagation::visitStoreScatterOp(
xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- // Currently, for 2D StoreScatterOp we expect that the height dimension of
- // the tensor descriptor is equal to the subgroup size. This is ensured by
- // the op verifier.
- auto payloadTy = dyn_cast<VectorType>(storeScatter.getValueType());
- if (!payloadTy) {
- storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
- return;
- }
- LayoutInfo payloadLayout;
- auto uArch = getUArch(getChipStr(storeScatter).value_or(""));
- const int subgroupSize = uArch->getSubgroupSize();
- if (auto layout = storeScatter.getLayoutAttr()) {
- payloadLayout = LayoutInfo(layout);
+ LayoutInfo payloadLayout;
+ LayoutInfo maskLayout;
+ xegpu::DistributeLayoutAttr anchorLayout = storeScatter.getAnchorLayoutAttr();
+ if (hasAnchorLayout(anchorLayout)) {
+ payloadLayout = LayoutInfo(anchorLayout);
+ maskLayout = payloadLayout;
} else {
+ // Currently, for 2D StoreScatterOp we expect that the height dimension of
+ // the tensor descriptor is equal to the subgroup size. This is ensured by
+ // the op verifier.
+ auto payloadTy = dyn_cast<VectorType>(storeScatter.getValueType());
+ if (!payloadTy) {
+ storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
+ return;
+ }
+
+ auto uArch = getUArch(getChipStr(storeScatter).value_or(""));
+ const int subgroupSize = uArch->getSubgroupSize();
+
if (layoutKind == LayoutKind::InstData) {
SmallVector<int> instData{subgroupSize};
if (auto chunkSize = storeScatter.getChunkSize().value_or(0);
@@ -936,10 +1023,13 @@ void LayoutInfoPropagation::visitStoreScatterOp(
payloadTy, uArch, uArch->getGeneralPackedFormatBitSize(),
/*scattered=*/true);
}
- }
- LayoutInfo maskLayout =
- getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize);
+ maskLayout =
+ getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize);
+
+ storeScatter.setAnchorLayoutAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(payloadLayout.get()));
+ }
// Propagate the payload operand layout
propagateIfChanged(operands[0], operands[0]->meet(payloadLayout));
// Propagate the destination (if tdesc) operand layout
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index b0b748c3409c3..c644f784606e9 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -678,7 +678,7 @@ struct UnrollLoadGatherOpWithOffset
pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter);
}
- auto layout = op.getLayoutAttr();
+ auto layout = op.getAnchorLayoutAttr();
if (layout)
layout = layout.dropInstData();
@@ -778,7 +778,7 @@ struct UnrollStoreScatterOpWithOffsets
SmallVector<Value> convertedValues =
pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter);
- auto layout = op.getLayoutAttr();
+ auto layout = op.getAnchorLayoutAttr();
if (layout)
layout = layout.dropInstData();
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 4fe35a16b3994..572e5442760bc 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -147,7 +147,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
// check for "permament" layout only after "temporary" layout name lookup
// for backward compatibility
if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp))
- return loadGatherOp.getLayoutAttr();
+ return loadGatherOp.getAnchorLayoutAttr();
}
if (auto arg = dyn_cast<BlockArgument>(value)) {
@@ -178,7 +178,7 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
// check for "permament" layout only after "temporary" layout name lookup
if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
- if (auto layout = storeScatterOp.getLayoutAttr())
+ if (auto layout = storeScatterOp.getAnchorLayoutAttr())
return layout;
return getDistributeLayoutAttr(opr.get());
@@ -193,7 +193,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
xegpu::DistributeLayoutAttr candidate = layout;
if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) {
- if (auto perm = loadOp.getLayoutAttr())
+ if (auto perm = loadOp.getAnchorLayoutAttr())
candidate = perm;
}
@@ -211,7 +211,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) {
if (idx == 0) {
- if (auto perm = storeOp.getLayoutAttr())
+ if (auto perm = storeOp.getAnchorLayoutAttr())
candidate = perm;
}
}
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index c31ef323a94d2..62a33a4797d2b 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -5,14 +5,14 @@
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>
// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>>
-// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout<inst_data = [16, 16]>} :
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{anchor_layout = #xegpu.layout<inst_data = [16, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [16, 16]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf16>
-// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {anchor_layout_a = #xegpu.layout<inst_data = [8, 16]>, anchor_layout_b = #xegpu.layout<inst_data = [16, 16]>, anchor_layout_cd = #xegpu.layout<inst_data = [8, 16]>, layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>
-// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>>
gpu.module @test {
func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
@@ -46,7 +46,7 @@ gpu.module @test_kernel {
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+ //CHECK: xegpu.load_nd {{.*}} <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
//CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<16x32xf16>
%a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
@@ -85,7 +85,7 @@ gpu.module @test_kernel {
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} :
+ //CHECK: xegpu.load_nd {{.*}} <{anchor_layout = #xegpu.layout<inst_data = [4, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} :
//CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>> -> vector<12x32xf16>
%a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
@@ -113,9 +113,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}>
+// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{anchor_layout = #xegpu.layout<inst_data = [16, 8]>, chunk_size = 8 : i64}>
// CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{anchor_layout = #xegpu.layout<inst_data = [16, 8]>, chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index eb004932af4be..d1bee47dd6d37 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -6,14 +6,14 @@ gpu.module @test {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, anchor_layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -32,7 +32,8 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me
gpu.module @test {
// CHECK-LABEL: func.func @dpas_i8(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) {
-// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16],
+// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+
func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
@@ -46,8 +47,8 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre
gpu.module @test {
// CHECK-LABEL: func.func @load_with_transpose_effect(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array<i64: 1, 0>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
-// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>> -> vector<16x16xf16>
+// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -108,7 +109,7 @@ gpu.module @test {
// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> ->
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>, vector<16xi1> -> vector<16x16xf16>
func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
@@ -135,7 +136,7 @@ gpu.module @test {
// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> ->
// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1> -> vector<16xf32>
func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
@@ -183,9 +184,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>, chunk_size = 8 : i64}>
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>, chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
@@ -204,7 +205,7 @@ gpu.module @test {
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
-// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
@@ -220,10 +221,10 @@ gpu.module @test {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
-// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16>
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : vector<16xf16>
// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK-SAME <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_custom_perm_layout(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
@@ -239,11 +240,11 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}>
+// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
-// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16>
+// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : vector<16xf16>
// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK-SAME <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
@@ -256,9 +257,9 @@ func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) {
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_f16(
-// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16>
-// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<16x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xi16>
// CHECK: %{{.*}} = vector.bitcast %[[LOAD0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: vector<8x16xi16> to vector<8x16xf16>
@@ -281,7 +282,7 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i32_to_f16(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: vector<16x8xi32> to vector<16x16xf16>
@@ -302,7 +303,7 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_i32(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
// CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>> -> vector<8x32xi16>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: vector<8x32xi16> to vector<8x16xi32>
@@ -339,9 +340,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16>
func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
@@ -362,9 +363,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -385,11 +386,11 @@ gpu.module @test {
// CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) ->
// CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>) {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, anchor_layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
@@ -397,7 +398,7 @@ gpu.module @test {
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>
// CHECK-NEXT: } {layout_result_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
@@ -425,11 +426,11 @@ gpu.module @test {
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16>
// CHECK-NEXT: } else {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16>
// CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
@@ -455,11 +456,11 @@ gpu.module @test {
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16>
// CHECK-NEXT: } else {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16>
// CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
@@ -539,7 +540,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @prefetch_2d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @prefetch_2d(%arg0: memref<256x256xf16>){
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
@@ -552,7 +553,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @prefetch_1d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
func.func @prefetch_1d(%arg0: memref<256xf16>){
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
@@ -599,7 +600,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
@@ -621,7 +622,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1]
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 3760737cf51f5..171cadeeaeaf9 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -286,7 +286,7 @@ gpu.module @test_distribution {
// CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16>
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex>
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1>
- // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}>
+ // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{anchor_layout = #xegpu.layout<inst_data = [8]>, chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}>
// CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>,
// CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>}
// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
@@ -554,7 +554,7 @@ gpu.module @test_distribution {
%offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<0> : vector<256xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<1> : vector<256xi1>
- // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>}>
+ // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{anchor_layout = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>, chunk_size = 1 : i64}>
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>} :
// CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32>
%3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32>
>From 0482234e56256ac0824a4fb85bac492b50080fdc Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Mon, 24 Nov 2025 01:58:41 +0000
Subject: [PATCH 03/23] adding documentation
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 192 ++++++++++++++----
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +-
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 20 +-
3 files changed, 165 insertions(+), 49 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 70c61a445e8ae..344fb23ba7b8d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -253,6 +253,22 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
It issues an instruction to prefetch a block of data from continuous
memory regions to each level of the cache based on their cache policy.
+ Arguments:
+ - `TensorDesc`: A tensor descriptor specifying the base nd-region of
+ memory and tensor tile to be prefetched.
+
+ - `offsets`: index values representing per-dimension offsets from the
+ base position encoded in `TensorDesc`. It is encoded via "offsets"
+ and "const_offsets".
+
+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes
+ indicating the desired behavior at the L1, L2, and L3 cache levels.
+
+ - `anchor_layout`: [optional] An attribute that identifies the operation
+ as an anchor, enabling users to assign a layout that governs distribution
+ at the subgroup and/or work-item level. Only valid at workgroup and subgroup
+ levels.
+
Example:
```mlir
xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
@@ -326,16 +342,37 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
a block of data from memory to register. It takes a set of optional cache
hints for each level of cache, L1, L2 and L3. If hardware does not have a
correspoding cache, Corresponding cache hint attribute will be masked.
- VNNI transformation is an hardware feature for Intel GPU, which is used to
- do data packing during the load for B operand of matrix operation, if
- the bit width of the data type is less then 32 bits, e.g., fp16. And
- transpose is another Intel hardware feature, which will do transpose
- operation when loading the data if the bit width of the data type is
- fp32 or fp64. It implies that vnni and transpose cannot exit at the
- same time. It is only available to 1D or 2D blocked tensor_desc.
+
+ On Intel GPUs, hardware-supported packing rearranges data elements during
+ the load of the B operand when the element bit-width is less than 32 bits
+ (for example, fp16). The transpose feature reorders data during the load
+ when the element type is fp32 or fp64. These two features are mutually
+ exclusive and shall not be enabled simultaneously. Both features support only
+ 2D blocked tensor_desc.
In SIMT mode, result vector represents the data to be loaded by each work-item.
+ Arguments:
+
+ - `TensorDesc`: A tensor descriptor specifying the base nd-region of memory
+ and the tensor tile to be loaded.
+
+ - `offsets`: Index values representing per-dimension offsets from the base position
+ encoded in `TensorDesc`. They are encoded via `offsets` and `const_offsets`.
+
+ - `packed`: [optional] A unit attribute indicating that packing is applied
+ during the load when supported by the hardware. Only valid at lane level.
+
+ - `transpose`: [optional] An attribute describing a hardware-supported transpose
+ to be applied during the load. Only valid at lane level.
+
+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the
+ desired behavior at the L1, L2, and L3 cache levels.
+
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
+ enabling users to assign a layout that governs distribution at the subgroup and/or
+ work-item level. Only valid at workgroup and subgroup levels.
+
Example 1:
```mlir
xegpu.load_nd %1 {transpose = [1, 0],
@@ -391,7 +428,6 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
return getTensorDescType().getShape();
}
-
}];
let assemblyFormat = [{
@@ -432,6 +468,23 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
In SIMT mode, the input vector represents the data to be stored by each work-item.
+ Arguments:
+
+ - `value`: A vector value representing the tensor tile to be stored.
+
+ - `TensorDesc`: A tensor descriptor specifying the base nd-region of memory and
+ the tensor tile to be stored.
+
+ - `offsets`: Index values representing per-dimension offsets from the base position
+ encoded in `TensorDesc`. They are encoded via `offsets` and `const_offsets`.
+
+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the
+ desired behavior at the L1, L2, and L3 cache levels.
+
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
+ enabling users to assign a layout that governs distribution at the subgroup and/or
+ work-item level. Only valid at workgroup and subgroup levels.
+
Example 1:
```mlir
xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
@@ -568,8 +621,10 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
It accepts the following parameters:
Arguments:
+
- `source`: a 1D memref or pointer (i64, i32, ui64, ui32) represents the flattened
memory object.
+
- `offsets`: a vector containing offsets of each access point. Its size
is fixed to the hardware supportted subgroup size, e.g., 16 on PVC,
implying each element in the vector corresponds to a work-item (SIMT lane)
@@ -668,17 +723,25 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
it works on scattered TensorDesc instead.
Arguments:
+
- `source`: represents the memory region to be loaded from, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
In case of tensor_desc, offsets come from the producer create_tdesc op.
tensor_desc cannot be used in SIMT mode.
+
- `offsets`: represents offsets from source. required if `source` in not a TensorDescType.
offsets is a vector of `index` type and vector length is either the subgroup size
or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
- - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
- - `offset_align_byte`: required if `source` is a pointer. If `source` is not a pointer,
+
+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
+
+ - `offset_align_byte`: [optional] Required if `source` is a pointer. If `source` is not a pointer,
it is not allowed. Represents the alignment in bytes of each offset in offsets.
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
+ enabling users to assign a layout that governs distribution at the subgroup and/or
+ work-item level. Only valid at workgroup and subgroup levels.
+
Example 1:
```mlir
xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint<cached>,
@@ -727,7 +790,8 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
- OptionalAttr<I64Attr>:$offset_align_byte);
+ OptionalAttr<I64Attr>:$offset_align_byte,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let extraClassDeclaration = extraBaseClassDeclaration # [{
Type getSourceType() {
@@ -779,18 +843,27 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
each work-item. If size is not 1, size should be equal to the chunk size,
Arguments:
+
- `source`: represents the memory region to be loaded from, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
In case of tensor_desc, offsets come from the producer create_tdesc op.
tensor_desc cannot be used in SIMT mode.
+
 - `offsets`: represents offsets from source. required if `source` is not a TensorDescType.
offsets is a vector of `index` type and vector length is either the subgroup size
or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
+
- `mask`: is a vector of `i1` type, which is used to mask out the memory access.
mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
scalar mask is also valid for SIMT mode.
- - `chunk_size`: (optional) represents contiguous number of elements to load from per work item.
- - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
+
+ - `chunk_size`: [optional] represents contiguous number of elements to load from per work item.
+
+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
+
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
+ enabling users to assign a layout that governs distribution at the subgroup and/or
+ work-item level. Only valid at workgroup and subgroup levels.
Results:
- `res`: represents loaded data
@@ -926,19 +999,30 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
each work-item. If size is not 1, size should be equal to the chunk size.
Arguments:
+
- `value`: represents the data to be stored.
+
- `dest`: represents the memory region to be stored to, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
In case of tensor_desc, offsets come from the producer create_tdesc op.
tensor_desc cannot be used in SIMT mode.
+
 - `offsets`: represents offsets from dest. required if `dest` is not a TensorDescType.
offsets is a vector of `index` type and vector length is either the subgroup size
or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
+
- `mask`: is a vector of `i1` type, which is used to mask out the memory access.
mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
scalar mask is also valid for SIMT mode.
- - `chunk_size`: (optional) represents contiguous number of elements to store to per work item.
- - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
+
+ - `chunk_size`: [optional] represents contiguous number of elements to store to per work item.
+
+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
+
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
+ enabling users to assign a layout that governs distribution at the subgroup and/or
+ work-item level. Only valid at workgroup and subgroup levels.
+
Example 1:
```mlir
@@ -1115,22 +1199,28 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
size, B of `kxn` size, and accumulate on matrix C of `mxn` to the same size
matrix , `m=8`, `n=16` and `k=8 * 32/bit_width_of_elem_type`. So for fp16
data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
- and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
- also requires A and B to be loaded with the required data layout. Specially,
- VNNI layout is required for B operand. It is achieved via adding `packed`
- attribute to the `load_nd` operator. Due to the VNNI transformation, B operands
- can be represented as a 3D vector, with the last dimension representing the VNNI
- factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
- can be represented as `B: vector<8x16x2xf16>`.
+ and `C/D: vector<8x16xf32>`.
In SIMT code, each work-item from a subgroup holds a data fragment for A, B, C and the result,
which are represented as 1D vectors. Please refer to [OpenCL Intel extentions]
(https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html)
for more details about the fragment distribution.
- Note: on PVC, the hardware can perform load with VNNI transformation when data
- element type is 16-bit or lower precision, taking 2 or 4 elements from
- the first dimension and inserted into the newly added innermost dimension.
+ Arguments:
+
+ - `lhs`: A vector value representing the left-hand-side matrix tile (A) participating in the
+ matrix multiply.
+
+ - `rhs`: A vector value representing the right-hand-side matrix tile (B).
+
+ - `acc`: [optional] A vector value representing the accumulator matrix tile (C). When present, the
+ result is computed as `lhs * rhs + acc`; otherwise, the accumulator is implicitly assumed to be zero.
+
+ - `anchor_layout_a`, `anchor_layout_b`, `anchor_layout_cd`: [optional] Attributes that identify this
+ operation as anchors for operands A, B, and the accumulator/result, enabling users to assign layouts
+ that govern distribution at the subgroup and/or work-item level. Only valid at workgroup and subgroup
+ level.
+
}];
let arguments = (ins
@@ -1187,13 +1277,31 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
has the same shape with `TensorDesc`, and is used to enable or disable specific
data points of the `TensorDesc`. The `value` operand represents the new value to
be applied during the modification.
+ Arguments:
+ - `kind`: An attribute that specifies the atomic operation to be performed
+ (e.g., add, min, max, exchange, etc.).
+
+ - `tensorDesc`: A `TensorDesc` describing the memory region on which the atomic
+ read-modify-write is performed.
+
+ - `mask`: A predicate mask with the same shape as `tensorDesc`. Only elements
+ with a true (non-zero) mask value participate in the atomic operation;
+ masked-out elements are not modified.
+
+ - `value`: The input values used by the atomic operation. It must have the same
+ shape and element type as `tensorDesc` and `result`.
+
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
+ enabling users to assign a layout that governs distribution at the subgroup
+ and/or work-item level. Only valid at workgroup and subgroup levels.
}];
let arguments = (ins
AtomicRMWKindAttr:$kind,
XeGPU_TensorDesc:$tensorDesc,
XeGPU_MaskType:$mask,
- XeGPU_ValueType:$value);
+ XeGPU_ValueType:$value,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let results = (outs XeGPU_ValueType:$result);
@@ -1275,6 +1383,13 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
the `target_layout`. Both `input_layout` and `target_layout` must correspond to the same programming
scope, such as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once
the IR is lowered to WI level because that is the end result of all distributions.
+ Arguments:
+ - `source`: The input vector whose data is to be redistributed. The source and
+ result types must match.
+ - `input_layout`: The layout attribute describing the current distribution of `source`
+ across subgroups and/or work-items.
+ - `target_layout`: The layout attribute describing the desired distribution of the result
+ across subgroups and/or work-items.
}];
let arguments = (ins XeGPU_VectorType: $source,
DistributeLayoutAttr: $input_layout,
@@ -1342,12 +1457,13 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
Arguments:
- `mem_desc`: the memory descriptor identifying the SLM region.
- `offsets`: the coordinates within the matrix to read from.
- - `subgroup_block_io`: [optional] An attribute indicating that the operation can be
- lowered to a subgroup block load. When this attribute is present,
- the offsets are subgroup-uniform across all lanes.
- - `anchor_layout`: [optional] An attribute for guiding distributions among
- subgroups and/or work-items. It currently can accept either
- LayoutAttr or SliceAttr.
+ - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered
+ to a subgroup block load. When this attribute is present, the offsets are subgroup-uniform
+ across all lanes. Only used on subgroup and lane level.
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, enabling
+ users to assign a layout that governs distribution at the subgroup and/or work-item level.
+ Only valid at workgroup and subgroup levels.
+
Results:
- `res`: the matrix elements loaded from SLM.
}];
@@ -1393,12 +1509,12 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
- `mem_desc`: the memory descriptor specifying the SLM region.
- `offsets`: the coordinates within the matrix where the data will be written.
- `data`: the values to be stored in the matrix.
- - `subgroup_block_io`: [optional] An attribute indicating that the operation can be
- lowered to a subgroup block store. When this attribute is present,
- the offsets are subgroup-uniform across all lanes.
- - `anchor_layout`: [optional] An attribute for guiding distributions among
- subgroups and/or work-items. It currently can accept either
- LayoutAttr or SliceAttr.
+ - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered
+ to a subgroup block load. When this attribute is present, the offsets are subgroup-uniform
+ across all lanes. Only used on subgroup and lane level.
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, enabling
+ users to assign a layout that governs distribution at the subgroup and/or work-item level.
+ Only valid at workgroup and subgroup levels.
}];
let builders = [
OpBuilder<(ins "Value" : $data, "TypedValue<MemDescType>": $mem_desc,
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 3240c0f40ce58..29daab384bf7f 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -828,7 +828,7 @@ void PrefetchOp::build(OpBuilder &builder, OperationState &state, Value source,
xegpu::CachePolicyAttr l2_hint,
xegpu::CachePolicyAttr l3_hint) {
build(builder, state, source, Value(), l1_hint, l2_hint, l3_hint,
- IntegerAttr{});
+ IntegerAttr{}, /*anchor_layout=*/nullptr);
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 6d45a51ab0267..3b5207dd92285 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -387,7 +387,7 @@ class LayoutInfoPropagation
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
- bool hasAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout);
+ bool hasParamsOfLayoutKind(xegpu::DistributeLayoutAttr anchorLayout);
public:
LayoutInfoPropagation(DataFlowSolver &solver,
@@ -477,7 +477,7 @@ LogicalResult LayoutInfoPropagation::visitOperation(
return success();
}
-bool LayoutInfoPropagation::hasAnchorLayout(
+bool LayoutInfoPropagation::hasParamsOfLayoutKind(
xegpu::DistributeLayoutAttr anchorLayout) {
if (anchorLayout == nullptr) {
return false;
@@ -497,7 +497,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
LayoutInfo prefetchLayout;
xegpu::DistributeLayoutAttr anchorLayout = prefetch.getAnchorLayoutAttr();
- if (hasAnchorLayout(anchorLayout)) {
+ if (hasParamsOfLayoutKind(anchorLayout)) {
prefetchLayout = LayoutInfo(anchorLayout);
} else {
// Here we assign the default layout to the tensor descriptor operand of
@@ -648,12 +648,12 @@ void LayoutInfoPropagation::visitDpasOp(
LayoutInfo dpasCLayout;
xegpu::DistributeLayoutAttr anchorLayoutC = dpas.getAnchorLayoutCdAttr();
- if (hasAnchorLayout(anchorLayoutC)) {
+ if (hasParamsOfLayoutKind(anchorLayoutC)) {
xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getAnchorLayoutAAttr();
xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getAnchorLayoutBAttr();
- assert(hasAnchorLayout(anchorLayoutA) &&
+ assert(hasParamsOfLayoutKind(anchorLayoutA) &&
"Expected anchor layout for DPAS A operand.");
- assert(hasAnchorLayout(anchorLayoutB) &&
+ assert(hasParamsOfLayoutKind(anchorLayoutB) &&
"Expected anchor layout for DPAS B operand.");
dpasALayout = LayoutInfo(anchorLayoutA);
dpasBLayout = LayoutInfo(anchorLayoutB);
@@ -743,7 +743,7 @@ void LayoutInfoPropagation::visitStoreNdOp(
LayoutInfo storeLayout;
xegpu::DistributeLayoutAttr anchorLayout = store.getAnchorLayoutAttr();
- if (hasAnchorLayout(anchorLayout)) {
+ if (hasParamsOfLayoutKind(anchorLayout)) {
storeLayout = LayoutInfo(anchorLayout);
} else {
auto uArch = getUArch(getChipStr(store).value_or(""));
@@ -799,7 +799,7 @@ void LayoutInfoPropagation::visitLoadNdOp(
LayoutInfo loadLayout;
xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr();
- if (hasAnchorLayout(anchorLayout)) {
+ if (hasParamsOfLayoutKind(anchorLayout)) {
loadLayout = LayoutInfo(anchorLayout);
} else {
@@ -914,7 +914,7 @@ void LayoutInfoPropagation::visitLoadGatherOp(
LayoutInfo loadLayout;
LayoutInfo maskLayout;
xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr();
- if (hasAnchorLayout(anchorLayout)) {
+ if (hasParamsOfLayoutKind(anchorLayout)) {
loadLayout = LayoutInfo(anchorLayout);
maskLayout = loadLayout;
} else {
@@ -984,7 +984,7 @@ void LayoutInfoPropagation::visitStoreScatterOp(
LayoutInfo payloadLayout;
LayoutInfo maskLayout;
xegpu::DistributeLayoutAttr anchorLayout = storeScatter.getAnchorLayoutAttr();
- if (hasAnchorLayout(anchorLayout)) {
+ if (hasParamsOfLayoutKind(anchorLayout)) {
payloadLayout = LayoutInfo(anchorLayout);
maskLayout = payloadLayout;
} else {
>From d1652af58eb344251976bc7a7379dff7937495a3 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 25 Nov 2025 23:44:57 +0000
Subject: [PATCH 04/23] address feedback and add more documentation
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 296 ++++++++++++------
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 2 -
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 16 +-
3 files changed, 200 insertions(+), 114 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 344fb23ba7b8d..f6b7dc0384e52 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -76,10 +76,10 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
For the case of dynamic memrefs or pointer, the shape and layout information of the
memory region should be explicitly passed via `shape` and `strides` parameters.
- - `offsets`: index values represents offsets from the "source" at the each dimension
+ - `offsets`: [optional] index values representing offsets from the "source" at each dimension
at which the subview of the target memory will be created. It is encoded via
"offsets" and "const_offsets", such that it can accept various forms, such as,
- operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]).
+ operands (e.g., [%c0, %c1]) and attributes (e.g., [2, 4]). Offsets are optional and may be set at load_nd, store_nd, and prefetch_nd.
- `shape`: the shape information of the memory region pointed by the "source". It is
typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
@@ -253,28 +253,32 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
It issues an instruction to prefetch a block of data from continuous
memory regions to each level of the cache based on their cache policy.
+ This operation serves as an anchor through which users assign a layout attribute
+ to govern computation distribution.
+
Arguments:
- `TensorDesc`: A tensor descriptor specifying the base nd-region of
memory and tensor tile to be prefetched.
- - `offsets`: index values representing per-dimension offsets from the
+ - `offsets`: [optional] index values representing per-dimension offsets from the
base position encoded in `TensorDesc`. It is encoded via "offsets"
and "const_offsets".
 - `l1_hint`, `l2_hint`, `l3_hint`: [optional] A cache-hint attribute
indicating the desired behavior at the L1, L2, and L3 cache levels.
- - `anchor_layout`: [optional] An attribute that identifies the operation
- as an anchor, enabling users to assign a layout that governs distribution
- at the subgroup and/or work-item level. Only valid at workgroup and subgroup
- level.
+ - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand.
+ Only valid at the workgroup and subgroup levels.
- Example:
+ Example (Workgroup level):
```mlir
- xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ xegpu.prefetch_nd %tdesc[%c0, %c1] {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<cached>,
- l3_hint = #xegpu.cache_hint<cached>}
- : !xegpu.tensor_desc<8x16xf16>
+ l3_hint = #xegpu.cache_hint<cached>,
+ anchor_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [8, 32]> }
+ : !xegpu.tensor_desc<32x256xf16>
```
}];
@@ -350,7 +354,10 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
exclusive and shall not be enabled simultaneously. Both features support only
2D blocked tensor_desc.
- In SIMT mode, result vector represents the data to be loaded by each work-item.
+ At lane level, result vector represents the data to be loaded by each lane.
+
+ This operation serves as an anchor through which users assign a layout attribute
+ to govern computation distribution.
Arguments:
@@ -369,19 +376,18 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
- `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the
desired behavior at the L1, L2, and L3 cache levels.
- - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
- enabling users to assign a layout that governs distribution at the subgroup and/or
- work-item level. Only valid at workgroup and subgroup levels.
+ - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as the result of the load (they are identical). Only valid at workgroup and subgroup levels.
- Example 1:
+ Example 1 (Workgroup level):
```mlir
xegpu.load_nd %1 {transpose = [1, 0],
l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<streaming>}
- : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+ l3_hint = #xegpu.cache_hint<streaming>,
+ anchor_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [8, 32]>}
+ : !xegpu.tensor_desc<32x256xf32> -> vector<32x256xf32>
```
- Example 2 (SIMT mode):
+ Example 2 (lane level):
```mlir
xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
@@ -466,7 +472,10 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
Corresponding cache hint attribute will be masked.
It is only available to 1D or 2D blocked tensor_desc.
- In SIMT mode, the input vector represents the data to be stored by each work-item.
+ At lane level, the input vector represents the data to be stored by each lane.
+
+ This operation serves as an anchor through which users assign a layout attribute
+ to govern computation distribution.
Arguments:
@@ -481,18 +490,18 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
- `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the
desired behavior at the L1, L2, and L3 cache levels.
- - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
- enabling users to assign a layout that governs distribution at the subgroup and/or
- work-item level. Only valid at workgroup and subgroup levels.
+ - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as
+ the value to be stored (they are identical). Only valid at workgroup and subgroup levels.
- Example 1:
+ Example 1 (Workgroup level):
```mlir
xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>}
- : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+ l3_hint = #xegpu.cache_hint<write_through>,
+ anchor_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [8, 32]>}
+ : vector<32x256xf16>, !xegpu.tensor_desc<32x256xf16>
```
- Example 2 (SIMT mode):
+ Example 2 (lane level):
```mlir
xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
l2_hint = #xegpu.cache_hint<write_back>,
@@ -617,7 +626,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
"create_tdesc" is similar to "create_nd_tdesc" in terms that it creates
a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc"
is for creating continuous subviews, "create_tdesc" is for creating non-continuous
- (scattered) subviews, allowing each work-item in a subgroup specifying their own offset.
+ (scattered) subviews, allowing each lane in a subgroup to specify its own offset.
It accepts the following parameters:
Arguments:
@@ -627,13 +636,12 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
- `offsets`: a vector containing offsets of each access point. Its size
 is fixed to the hardware supported subgroup size, e.g., 16 on PVC,
- implying each element in the vector corresponds to a work-item (SIMT lane)
- in the subgroup.
+ implying each element in the vector corresponds to a SIMT lane in the subgroup.
Results:
- `res`: scattered tensor descriptor
- The first dimension of the result TensorDesc corresponds to work-items, so it should
+ The first dimension of the result TensorDesc corresponds to lanes, so it should
 match the dimension of offsets. It may also have a second dimension corresponding to
the chunk_size if the chunk size is larger than 1.
@@ -722,35 +730,39 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
As compared to prefetch_nd, which works on non-scattered TensorDesc,
it works on scattered TensorDesc instead.
+ This operation serves as an anchor through which users assign a layout attribute
+ to govern computation distribution.
+
Arguments:
- `source`: represents the memory region to be loaded from, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
In case of tensor_desc, offsets come from the producer create_tdesc op.
- tensor_desc cannot be used in SIMT mode.
+ tensor_desc cannot be used at lane level.
- `offsets`: represents offsets from source. required if `source` in not a TensorDescType.
offsets is a vector of `index` type and vector length is either the subgroup size
- or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
+ or 1 at lane level. scalar offset is also valid for lane level.
- `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
- `offset_align_byte`: [optional] required if `source` is a pointer. If `source` is not a pointer,
it is not allowed. Represents the alignment in bytes of each offset in offsets.
- - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
- enabling users to assign a layout that governs distribution at the subgroup and/or
- work-item level. Only valid at workgroup and subgroup levels.
+ - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` or `offsets`
+ operand. Only valid at workgroup and subgroup levels.
- Example 1:
+ Example 1 (Workgroup level):
```mlir
xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<cached>,
- l3_hint = #xegpu.cache_hint<cached>}
- : !xegpu.tensor_desc<16xf16>
+ l3_hint = #xegpu.cache_hint<cached>,
+ anchor_layout = #xegpu.layout<sg_layout = [8], sg_data = [32]>
+ }
+ : !xegpu.tensor_desc<256xf16>
```
- Example 2:
+ Example 2 (lane level):
A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
It combines "create scattered TensorTdesc" and "prefetch with scattered TensorTdesc".
The source operand could be a raw pointer (ui64, ui32, i64, i32).
@@ -764,8 +776,8 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
: memref<1024xf32>, vector<4xindex>
```
- Example 3 (SIMT mode):
- SIMT mode only accepts the offsets variant.
+ Example 3 (lane level):
+ lane level only accepts the offsets variant.
```mlir
xegpu.prefetch %0[%1] {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<cached>,
@@ -773,8 +785,8 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
: memref<256xf32>, vector<1xindex>
```
- Example 4 (SIMT mode):
- SIMT mode only accepts the offsets variant.
+ Example 4 (lane level):
+ lane level only accepts the offsets variant.
```mlir
xegpu.prefetch %0[%1] {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<cached>,
@@ -831,63 +843,67 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
let summary = "load a set of scattered data points from memory.";
- let description = [{ It (aka. load) load data per each work-item. The output
+ let description = [{ It (aka. load) load data per each lane. The output
describes the data being loaded at the subgroup level, so its size is
- consistent with the number of work-items in a subgroup. When the chunk size
+ consistent with the number of lanes in a subgroup. When the chunk size
 is larger than 2, the output vector is a 2D vector, with dim-0 corresponding
- to work-items, and dim-1 corresponding to the chunk size loaded by each work-item.
+ to lanes, and dim-1 corresponding to the chunk size loaded by each lane.
The mask operand masks out memory access so that it is safe to pass out-of-boundary
- addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
+ addresses/offsets as long as they are masked. Each mask element applies to one lane.
+
+ At lane level, the result is a 1D vector that represents the data to be loaded by
+ each lane. If size is not 1, size should be equal to the chunk size.
- In SIMT mode, the result is a 1D vector that represents the data to be loaded by
- each work-item. If size is not 1, size should be equal to the chunk size,
+ This operation serves as an anchor through which users assign a layout attribute
+ to govern computation distribution.
Arguments:
- `source`: represents the memory region to be loaded from, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
In case of tensor_desc, offsets come from the producer create_tdesc op.
- tensor_desc cannot be used in SIMT mode.
+ tensor_desc cannot be used at lane level.
 - `offsets`: represents offsets from source. required if `source` is not a TensorDescType.
offsets is a vector of `index` type and vector length is either the subgroup size
- or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
+ or 1 at lane level. scalar offset is also valid for lane level.
- `mask`: is a vector of `i1` type, which is used to mask out the memory access.
- mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
- scalar mask is also valid for SIMT mode.
+ mask is a vector of size equal to the subgroup size, or 1 at lane level.
+ scalar mask is also valid for lane level.
- `chunk_size`: [optional] represents contiguous number of elements to load from per work item.
- `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
- - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
- enabling users to assign a layout that governs distribution at the subgroup and/or
- work-item level. Only valid at workgroup and subgroup levels.
+ - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand or the result
+ of load. Only valid at workgroup and subgroup levels.
Results:
- `res`: represents loaded data
- Example 1:
+ Example 1 (Workgroup level):
```mlir
%2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
- vector<16xi1> -> vector<16xf32>
+ l3_hint = #xegpu.cache_hint<uncached>},
+ anchor_layout = #xegpu.layout<sg_layout = [8], sg_data = [32]>>
+ : !xegpu.tensor_desc<256xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
+ vector<256xi1> -> vector<256xf32>
```
- Example 2:
+ Example 2 (Subgroup level):
```mlir
%2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<uncached>}>
+ l3_hint = #xegpu.cache_hint<uncached>},
+ anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>>
: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
vector<16xi1> -> vector<16x8xf32>
```
- Example 3:
+ Example 3 (Subgroup level):
A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
It combines "create scattered TensorTdesc" and "load with scattered TensorTdesc".
The source operand could be a raw pointer (ui64, ui32, i64, i32). Please refer to create_tdesc
@@ -898,12 +914,13 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
%mask = vector.constant_mask [16]: vector<16xi1>
%val = xegpu.load %a[%offsets], %mask {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<cached>,
- l3_hint = #xegpu.cache_hint<cached>}
+ l3_hint = #xegpu.cache_hint<cached>,
+ anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32>
```
- Example 4 (SIMT mode):
- SIMT mode only accepts the offsets variant. chunk_size can be inferred from result
+ Example 4 (lane level):
+ lane level only accepts the offsets variant. chunk_size can be inferred from result
type. In this example, chunk_size is 8.
```mlir
%2 = xegpu.load %1[%2], %0 <{l1_hint = #xegpu.cache_hint<cached>,
@@ -979,7 +996,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
"xegpu::CachePolicyAttr": $l1_hint,
"xegpu::CachePolicyAttr": $l2_hint,
"xegpu::CachePolicyAttr": $l3_hint,
- "xegpu::DistributeLayoutAttr": $anchor_layout)>
+ "xegpu::DistributeLayoutAttr": $layout)>
];
let hasVerifier = 1;
@@ -995,8 +1012,11 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
introduced on purpose, making sure users are aware of this implicit transformation.
- In SIMT mode, the result is a 1D vector that represents the data to be stored by
- each work-item. If size is not 1, size should be equal to the chunk size.
+ At lane level, the input is a 1D vector that represents the data to be stored by
+ each lane. If size is not 1, size should be equal to the chunk size.
+
+ This operation serves as an anchor through which users assign a layout attribute
+ to govern computation distribution.
Arguments:
@@ -1005,42 +1025,43 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
- `dest`: represents the memory region to be stored to, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
In case of tensor_desc, offsets come from the producer create_tdesc op.
- tensor_desc cannot be used in SIMT mode.
+ tensor_desc cannot be used at lane level.
- `offsets`: represents offsets from dest. required if `source` in not a TensorDescType.
offsets is a vector of `index` type and vector length is either the subgroup size
- or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
+ or 1 at lane level. A scalar offset is also valid at lane level.
- `mask`: is a vector of `i1` type, which is used to mask out the memory access.
- mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
- scalar mask is also valid for SIMT mode.
+ mask is a vector of size equal to the subgroup size, or 1 at lane level.
+ A scalar mask is also valid at lane level.
- `chunk_size`: [optional] represents contiguous number of elements to store to per work item.
- `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
- - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
- enabling users to assign a layout that governs distribution at the subgroup and/or
- work-item level. Only valid at workgroup and subgroup levels.
+ - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand or the value
+ to be stored. Only valid at workgroup and subgroup levels.
- Example 1:
+ Example 1 (Workgroup level):
```mlir
xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint<uncached>,
l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>}>
- : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
+ l3_hint = #xegpu.cache_hint<write_through>,
+ anchor_layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>}>
+ : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.scattered_tdesc_attr<>>, vector<256xi1>
```
- Example 2:
+ Example 2 (Subgroup level):
```mlir
xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint<uncached>,
l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>}>
+ l3_hint = #xegpu.cache_hint<write_through>,
+ anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>}>
: vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
```
- Example 3:
+ Example 3 (Subgroup level):
A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
It combines "create scattered TensorTdesc" and "store with scattered TensorTdesc".
The dest operand could be a raw pointer (uint64_t).
@@ -1052,12 +1073,13 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
%mask = vector.constant_mask [16]: vector<16xi1>
xegpu.store %val, %a[%offsets], %mask {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<cached>,
- l3_hint = #xegpu.cache_hint<cached>}
+ l3_hint = #xegpu.cache_hint<cached>,
+ anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32>
```
- Example 4 (SIMT mode):
- SIMT mode only accepts the offsets variant. chunk_size can be inferred from value
+ Example 4 (Lane level):
+ Lane level IR only accepts the offsets variant. chunk_size can be inferred from value
type. In this example, chunk_size is 8.
```mlir
xegpu.store %0, %1[%2], %3 <{l1_hint = #xegpu.cache_hint<uncached>,
@@ -1133,7 +1155,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
"xegpu::CachePolicyAttr": $l1_hint,
"xegpu::CachePolicyAttr": $l2_hint,
"xegpu::CachePolicyAttr": $l3_hint,
- "xegpu::DistributeLayoutAttr": $anchor_layout)>
+ "xegpu::DistributeLayoutAttr": $layout)>
];
let hasVerifier = 1;
@@ -1148,8 +1170,8 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
the current position in the number of elements. However, `update_nd_offset`
is to update the start point of a 2D block, so its offset constains two
elements representing the shift in each dimension. `update_offset` is to
- update the offset per work-item, so its offsets contains values representing
- shifts for each work-item.
+ update the offset per lane, so its offsets contains values representing
+ shifts for each lane.
Example:
```mlir
@@ -1201,11 +1223,14 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
and `C/D: vector<8x16xf32>`.
- In SIMT code, each work-item from a subgroup holds a data fragment for A, B, C and the result,
+ In lane level code, each lane from a subgroup holds a data fragment for A, B, C and the result,
which are represented as 1D vectors. Please refer to [OpenCL Intel extentions]
(https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html)
for more details about the fragment distribution.
+ This operation serves as an anchor through which users assign a layout attribute
+ to govern computation distribution.
+
Arguments:
- `lhs`: A vector value representing the left-hand-side matrix tile (A) participating in the
@@ -1217,10 +1242,26 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
result is computed as `lhs * rhs + acc`; otherwise, the accumulator is implicitly assumed to be zero.
- `anchor_layout_a`, `anchor_layout_b`, `anchor_layout_cd`: [optional] Attributes that identify this
- operation as anchors for operands A, B, and the accumulator/result, enabling users to assign layouts
- that govern distribution at the subgroup and/or work-item level. Only valid at workgroup and subgroup
+ operation as an anchor for operands A, B, and the accumulator/result, enabling users to assign layouts
+ that govern distribution at the subgroup and/or lane level. Only valid at workgroup and subgroup
level.
+ Example 1 (Workgroup level):
+
+ ```mlir
+ %d = xegpu.dpas %a, %b, %c <{
+ anchor_layout_a = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 128]>,
+ anchor_layout_b = #xegpu.layout<sg_layout = [4, 8], sg_data = [128, 16]>,
+ anchor_layout_cd = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16]>}
+ : vector<64x128xf16>, vector<128x128xf16>, vector<64x128xf32> -> vector<64x128xf32>
+ ```
+
+ Example 2 (Lane level):
+
+ ```mlir
+ %d = xegpu.dpas %a, %b, %c
+ : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
+ ```
}];
let arguments = (ins
@@ -1277,6 +1318,10 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
has the same shape with `TensorDesc`, and is used to enable or disable specific
data points of the `TensorDesc`. The `value` operand represents the new value to
be applied during the modification.
+
+ This operation serves as an anchor through which users assign a layout attribute
+ to govern computation distribution.
+
Arguments:
- `kind`: An attribute that specifies the atomic operation to be performed
(e.g., add, min, max, exchange, etc.).
@@ -1293,7 +1338,7 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
- `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
enabling users to assign a layout that governs distribution at the subgroup
- and/or work-item level. Only valid at workgroup and subgroup levels.
+ and/or lane level. Only valid at workgroup and subgroup levels.
}];
let arguments = (ins
@@ -1379,17 +1424,29 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
let summary = "Convert the layout of the input operand";
let description = [{
- `convert_layout` redistribute data across subgroups and/or work-items from the `input_layout` to
+ `convert_layout` redistributes data across subgroups and/or lanes from the `input_layout` to
the `target_layout`. Both `input_layout` and `target_layout` must correspond to the same programming
- scope, such as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once
+ scope, such as workgroup level (wg) or subgroup level (sg) code. This operation is not valid once
the IR is lowered to WI level because that is the end result of all distributions.
+
+ This operation serves as an anchor through which users assign a layout attribute
+ to govern computation distribution.
+
Arguments:
- `source`: The input vector whose data is to be redistributed. The source and
result types must match.
- `input_layout`: The layout attribute describing the current distribution of `source`
- across subgroups and/or work-items.
+ across subgroups and/or lanes.
- `target_layout`: The layout attribute describing the desired distribution of the result
- across subgroups and/or work-items.
+ across subgroups and/or lanes.
+
+ Example (Subgroup level):
+ ```mlir
+ %coop_a = xegpu.convert_layout %a <{
+ input_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}>
+ : vector<128x128xf16>
+ ```
}];
let arguments = (ins XeGPU_VectorType: $source,
DistributeLayoutAttr: $input_layout,
@@ -1427,8 +1484,17 @@ def XeGPU_CreateMemDescOp: XeGPU_Op<"create_mem_desc", [Pure,
Arguments:
- `source` : a 1D statically shaped memref with element type i8, representing the raw SLM buffer.
+
Results:
- `mem_desc` : the memory descriptor.
+
+ Example:
+ ```mlir
+ %mdesc = xegpu.create_mem_desc %mref
+ : memref<4096xi8, 3>
+ -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<stride = [1, 32], block = [16, 16]>>
+ ```
+
}];
let arguments = (ins StaticShared1DMemRefOf<[I8]>:$source);
let results = (outs XeGPU_MemDesc:$mem_desc);
@@ -1454,23 +1520,35 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
by the provided 2D `mem_desc`. Only 2D memory descriptors are supported; use the
subview operation to obtain a compatible 2D `mem_desc` from a higher-rank descriptor if needed.
+ This operation serves as an anchor through which users assign a layout attribute
+ to govern computation distribution.
+
Arguments:
- `mem_desc`: the memory descriptor identifying the SLM region.
- `offsets`: the coordinates within the matrix to read from.
- `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered
to a subgroup block load. When this attribute is present, the offsets are subgroup-uniform
across all lanes. Only used on subgroup and lane level.
- - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, enabling
- users to assign a layout that governs distribution at the subgroup and/or work-item level.
+ - `anchor_layout`: [optional] Describes the expected layout of the `mem_desc` operand as well as
+ the result of load (they are identical).
Only valid at workgroup and subgroup levels.
Results:
- `res`: the matrix elements loaded from SLM.
+
+ Example (Workgroup level):
+ ```mlir
+ %c0 = arith.constant 0 : index
+ %1 = xegpu.load_matrix %0[%c0, %c0] <{
+ anchor_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 16]> }>
+ : !xegpu.mem_desc<128x128xf16, #xegpu.mem_layout<stride = [1, 128], block = [16, 16]>>
+ , index, index -> vector<128x128xf16>
+ ```
}];
let builders = [
OpBuilder<(ins "Type":$res, "TypedValue<MemDescType>": $mem_desc,
- "llvm::ArrayRef<OpFoldResult>": $offsets, "DistributeLayoutAttr": $anchor_layout)>,
+ "llvm::ArrayRef<OpFoldResult>": $offsets, "DistributeLayoutAttr": $layout)>,
];
let extraClassDeclaration = [{
SmallVector<OpFoldResult> getMixedOffsets() {
@@ -1505,6 +1583,9 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
specified by a 2D `mem_desc`. Only 2D memory descriptors are supported; use the
subview operation to obtain a 2D `mem_desc` from a higher-rank descriptor if needed.
+ This operation serves as an anchor through which users assign a layout attribute
+ to govern computation distribution.
+
Arguments:
- `mem_desc`: the memory descriptor specifying the SLM region.
- `offsets`: the coordinates within the matrix where the data will be written.
@@ -1512,13 +1593,20 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
- `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered
to a subgroup block load. When this attribute is present, the offsets are subgroup-uniform
across all lanes. Only used on subgroup and lane level.
- - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, enabling
- users to assign a layout that governs distribution at the subgroup and/or work-item level.
- Only valid at workgroup and subgroup levels.
+ - `anchor_layout`: [optional] Describes the expected layout of the `mem_desc` operand as well as
+ the value to be stored (they are identical). Only valid at workgroup and subgroup levels.
+
+ Example (Workgroup level):
+ ```mlir
+ %c0 = arith.constant 0 : index
+ xegpu.store_matrix %1, %0[%c0, %c0] <{
+ anchor_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 16]> }>
+ : vector<128x128xf16>, !xegpu.mem_desc<128x128xf16>, index, index
+ ```
}];
let builders = [
OpBuilder<(ins "Value" : $data, "TypedValue<MemDescType>": $mem_desc,
- "llvm::ArrayRef<OpFoldResult>": $offsets, "DistributeLayoutAttr": $anchor_layout)>,
+ "llvm::ArrayRef<OpFoldResult>": $offsets, "DistributeLayoutAttr": $layout)>,
];
let extraClassDeclaration = [{
SmallVector<OpFoldResult> getMixedOffsets() {
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index b3d2c40712c96..fb5d1e758dbd1 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -22,8 +22,6 @@ using std::optional;
namespace mlir {
namespace xegpu {
-//#include "mlir/Dialect/XeGPU/IR/XeGPUOpInterface.cpp.inc"
-
void XeGPUDialect::initialize() {
addTypes<
#define GET_TYPEDEF_LIST
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 3b5207dd92285..8fb63da8cb0a0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -645,10 +645,10 @@ void LayoutInfoPropagation::visitDpasOp(
LayoutInfo dpasALayout;
LayoutInfo dpasBLayout;
- LayoutInfo dpasCLayout;
+ LayoutInfo dpasCDLayout;
- xegpu::DistributeLayoutAttr anchorLayoutC = dpas.getAnchorLayoutCdAttr();
- if (hasParamsOfLayoutKind(anchorLayoutC)) {
+ xegpu::DistributeLayoutAttr anchorLayoutCD = dpas.getAnchorLayoutCdAttr();
+ if (hasParamsOfLayoutKind(anchorLayoutCD)) {
xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getAnchorLayoutAAttr();
xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getAnchorLayoutBAttr();
assert(hasParamsOfLayoutKind(anchorLayoutA) &&
@@ -657,7 +657,7 @@ void LayoutInfoPropagation::visitDpasOp(
"Expected anchor layout for DPAS B operand.");
dpasALayout = LayoutInfo(anchorLayoutA);
dpasBLayout = LayoutInfo(anchorLayoutB);
- dpasCLayout = LayoutInfo(anchorLayoutC);
+ dpasCDLayout = LayoutInfo(anchorLayoutCD);
} else {
@@ -714,14 +714,14 @@ void LayoutInfoPropagation::visitDpasOp(
dpas.emitWarning(
"No suitable instruction multiple found for the given shape.");
SmallVector<int> instDataC = {maxALen, maxCLen};
- dpasCLayout =
+ dpasCDLayout =
LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataC));
} else
- dpasCLayout = getSIMTLayoutInfoForDPASOperand(
+ dpasCDLayout = getSIMTLayoutInfoForDPASOperand(
cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB());
dpas.setAnchorLayoutCdAttr(
- dyn_cast<xegpu::DistributeLayoutAttr>(dpasCLayout.get()));
+ dyn_cast<xegpu::DistributeLayoutAttr>(dpasCDLayout.get()));
}
dpas.setAnchorLayoutAAttr(
dyn_cast<xegpu::DistributeLayoutAttr>(dpasALayout.get()));
@@ -732,7 +732,7 @@ void LayoutInfoPropagation::visitDpasOp(
propagateIfChanged(operands[0], operands[0]->meet(dpasALayout));
propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout));
if (operands.size() > 2) {
- propagateIfChanged(operands[2], operands[2]->meet(dpasCLayout));
+ propagateIfChanged(operands[2], operands[2]->meet(dpasCDLayout));
}
}
>From b186bc2c20b0f1703170491f35466fd48950dabb Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 26 Nov 2025 01:41:52 +0000
Subject: [PATCH 05/23] rename anchor_layout to layout
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 82 +++++++++----------
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 6 +-
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 34 ++++----
.../Transforms/XeGPUSubgroupDistribute.cpp | 4 +-
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 8 +-
.../Transforms/XeGPUWgToSgDistribute.cpp | 14 ++--
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 16 ++--
mlir/test/Dialect/XeGPU/invalid.mlir | 6 +-
.../XeGPU/propagate-layout-inst-data.mlir | 16 ++--
mlir/test/Dialect/XeGPU/propagate-layout.mlir | 80 +++++++++---------
.../Dialect/XeGPU/subgroup-distribute.mlir | 12 +--
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 4 +-
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 10 +--
13 files changed, 144 insertions(+), 148 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index f6b7dc0384e52..abcaa1da82e67 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -236,7 +236,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
return static_cast<unsigned>(MemorySpace::Global);
}
- xegpu::DistributeLayoutAttr getLayoutAttr() {
+ xegpu::DistributeLayoutAttr getDescLayoutAttr() {
return dyn_cast_if_present<xegpu::DistributeLayoutAttr>(getType().getLayout());
}
@@ -267,7 +267,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
- `l1_hint`, `l2_hint`, `l3_hint`: [optional] An cache-hint attribute
indicating the desired behavior at the L1, L2, and L3 cache levels.
- - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand.
+ - `layout`: [optional] Describes the expected layout of the `tensor_desc` operand.
Only valid at the workgroup and subgroup levels.
Example (Workgroup level):
@@ -277,7 +277,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
xegpu.prefetch_nd %tdesc[%c0, %c1] {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<cached>,
l3_hint = #xegpu.cache_hint<cached>,
- anchor_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [8, 32]> }
+ layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [8, 32]> }
: !xegpu.tensor_desc<32x256xf16>
```
@@ -289,7 +289,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint,
- OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
+ OptionalAttr<DistributeLayoutAttr>:$layout);
let extraClassDeclaration = extraBaseClassDeclaration # [{
xegpu::TensorDescType getTensorDescType() {
@@ -304,7 +304,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
return getMixedValues(statics, dynamics, getContext());
}
- xegpu::DistributeLayoutAttr getLayoutAttr() {
+ xegpu::DistributeLayoutAttr getDescLayoutAttr() {
return dyn_cast_if_present<xegpu::DistributeLayoutAttr>(getTensorDescType().getLayout());
}
@@ -376,7 +376,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
- `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the
desired behavior at the L1, L2, and L3 cache levels.
- - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as the result of the load (they are identical). Only valid at workgroup and subgroup levels.
+ - `layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as the result of the load (they are identical). Only valid at workgroup and subgroup levels.
Example 1 (Workgroup level):
```mlir
@@ -384,7 +384,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>,
l3_hint = #xegpu.cache_hint<streaming>,
- anchor_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [8, 32]>}
+ layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [8, 32]>}
: !xegpu.tensor_desc<32x256xf32> -> vector<32x256xf32>
```
Example 2 (lane level):
@@ -405,7 +405,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint,
- OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
+ OptionalAttr<DistributeLayoutAttr>:$layout);
let results = (outs XeGPU_ValueType: $value);
@@ -426,7 +426,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
return getMixedValues(statics, dynamics, getContext());
}
- xegpu::DistributeLayoutAttr getLayoutAttr() {
+ xegpu::DistributeLayoutAttr getDescLayoutAttr() {
return dyn_cast_if_present<xegpu::DistributeLayoutAttr>(getTensorDescType().getLayout());
}
@@ -490,7 +490,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
- `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the
desired behavior at the L1, L2, and L3 cache levels.
- - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as
+ - `layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as
the value to be stored (they are identical). Only valid at workgroup and subgroup levels.
Example 1 (Workgroup level):
@@ -498,7 +498,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>,
- anchor_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [8, 32]>}
+ layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [8, 32]>}
: vector<32x256xf16>, !xegpu.tensor_desc<32x256xf16>
```
Example 2 (lane level):
@@ -519,7 +519,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint,
- OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
+ OptionalAttr<DistributeLayoutAttr>:$layout);
let extraClassDeclaration = extraBaseClassDeclaration # [{
VectorType getValueType() {
@@ -538,7 +538,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
return getMixedValues(statics, dynamics, getContext());
}
- xegpu::DistributeLayoutAttr getLayoutAttr() {
+ xegpu::DistributeLayoutAttr getDescLayoutAttr() {
return dyn_cast_if_present<xegpu::DistributeLayoutAttr>(getTensorDescType().getLayout());
}
@@ -749,7 +749,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
- `offset_align_byte`: [optional] required if `source` is a pointer. If `source` is not a pointer,
it is not allowed. Represents the alignment in bytes of each offset in offsets.
- - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` or `offsets`
+ - `layout`: [optional] Describes the expected layout of the `tensor_desc` or `offsets`
operand. Only valid at workgroup and subgroup levels.
Example 1 (Workgroup level):
@@ -757,7 +757,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<cached>,
l3_hint = #xegpu.cache_hint<cached>,
- anchor_layout = #xegpu.layout<sg_layout = [8], sg_data = [32]>
+ layout = #xegpu.layout<sg_layout = [8], sg_data = [32]>
}
: !xegpu.tensor_desc<256xf16>
```
@@ -803,7 +803,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
OptionalAttr<I64Attr>:$offset_align_byte,
- OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
+ OptionalAttr<DistributeLayoutAttr>:$layout);
let extraClassDeclaration = extraBaseClassDeclaration # [{
Type getSourceType() {
@@ -876,7 +876,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
- `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
- - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand or the result
+ - `layout`: [optional] Describes the expected layout of the `tensor_desc` operand or the result
of load. Only valid at workgroup and subgroup levels.
Results:
@@ -888,7 +888,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
%2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>,
l3_hint = #xegpu.cache_hint<uncached>},
- anchor_layout = #xegpu.layout<sg_layout = [8], sg_data = [32]>>
+ layout = #xegpu.layout<sg_layout = [8], sg_data = [32]>>
: !xegpu.tensor_desc<256xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
vector<256xi1> -> vector<256xf32>
```
@@ -898,7 +898,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
%2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>,
l3_hint = #xegpu.cache_hint<uncached>},
- anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>>
+ layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>>
: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
vector<16xi1> -> vector<16x8xf32>
```
@@ -915,7 +915,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
%val = xegpu.load %a[%offsets], %mask {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<cached>,
l3_hint = #xegpu.cache_hint<cached>,
- anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32>
```
@@ -937,7 +937,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
- OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
+ OptionalAttr<DistributeLayoutAttr>:$layout);
let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value);
let extraClassDeclaration = extraBaseClassDeclaration # [{
@@ -1039,7 +1039,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
- `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
- - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand or the value
+ - `layout`: [optional] Describes the expected layout of the `tensor_desc` operand or the value
to be stored. Only valid at workgroup and subgroup levels.
@@ -1048,7 +1048,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint<uncached>,
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>,
- anchor_layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>}>
+ layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>}>
: vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.scattered_tdesc_attr<>>, vector<256xi1>
```
@@ -1057,7 +1057,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint<uncached>,
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>,
- anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>}>
+ layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>}>
: vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
```
@@ -1074,7 +1074,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
xegpu.store %val, %a[%offsets], %mask {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<cached>,
l3_hint = #xegpu.cache_hint<cached>,
- anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32>
```
@@ -1097,7 +1097,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
- OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
+ OptionalAttr<DistributeLayoutAttr>:$layout);
let extraClassDeclaration = extraBaseClassDeclaration#[{
Type getDestType() {
@@ -1241,7 +1241,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
- `acc`: [optional] A vector value representing the accumulator matrix tile (C). When present, the
result is computed as `lhs * rhs + acc`; otherwise, the accumulator is implicitly assumed to be zero.
- - `anchor_layout_a`, `anchor_layout_b`, `anchor_layout_cd`: [optional] Attributes that identify this
+ - `layout_a`, `layout_b`, `layout_cd`: [optional] Attributes that identify this
operation as anchor for operands A, B, and the accumulator/result, enabling users to assign layouts
that govern distribution at the subgroup and/or lane level. Only valid at workgroup and subgroup
level.
@@ -1250,9 +1250,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
```mlir
%d = xegpu.dpas %a, %b, %c <{
- anchor_layout_a = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 128]>,
- anchor_layout_b = #xegpu.layout<sg_layout = [4, 8], sg_data = [128, 16]>,
- anchor_layout_cd = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16]>}
+ layout_a = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 128]>,
+ layout_b = #xegpu.layout<sg_layout = [4, 8], sg_data = [128, 16]>,
+ layout_cd = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16]>}
: vector<64x128xf16>, vector<128x128xf16>, vector<64x128xf32> -> vector<64x128xf32>
```
@@ -1268,9 +1268,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
XeGPU_DpasOprType : $lhs,
XeGPU_DpasOprType : $rhs,
Optional<XeGPU_DpasResType>: $acc,
- OptionalAttr<DistributeLayoutAttr>:$anchor_layout_a,
- OptionalAttr<DistributeLayoutAttr>:$anchor_layout_b,
- OptionalAttr<DistributeLayoutAttr>:$anchor_layout_cd
+ OptionalAttr<DistributeLayoutAttr>:$layout_a,
+ OptionalAttr<DistributeLayoutAttr>:$layout_b,
+ OptionalAttr<DistributeLayoutAttr>:$layout_cd
);
let results = (outs XeGPU_DpasResType: $result);
@@ -1336,7 +1336,7 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
- `value`: The input values used by the atomic operation. It must have the same
shape and element type as `tensorDesc` and `result`.
- - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
+ - `layout`: [optional] An attribute that identifies the operation as an anchor,
enabling users to assign a layout that governs distribution at the subgroup
and/or lane level. Only valid at workgroup and subgroup levels.
}];
@@ -1346,7 +1346,7 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
XeGPU_TensorDesc:$tensorDesc,
XeGPU_MaskType:$mask,
XeGPU_ValueType:$value,
- OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
+ OptionalAttr<DistributeLayoutAttr>:$layout);
let results = (outs XeGPU_ValueType:$result);
@@ -1507,7 +1507,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
Variadic<Index>: $offsets,
DenseI64ArrayAttr: $const_offsets,
OptionalAttr<UnitAttr>:$subgroup_block_io,
- OptionalAttr<DistributeLayoutAttr>:$anchor_layout
+ OptionalAttr<DistributeLayoutAttr>:$layout
);
let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$res);
let assemblyFormat = [{
@@ -1529,7 +1529,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
- `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered
to a subgroup block load. When this attribute is present, the offsets are subgroup-uniform
across all lanes. Only used on subgroup and lane level.
- - `anchor_layout`: [optional] Describes the expected layout of the `mem_desc` operand as well as
+ - `layout`: [optional] Describes the expected layout of the `mem_desc` operand as well as
the result of load (they are identical).
Only valid at workgroup and subgroup levels.
@@ -1540,7 +1540,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
```mlir
%c0 = arith.constant 0 : index
%1 = xegpu.load_matrix %0[%c0, %c0] <{
- anchor_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 16]> }>
+ layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 16]> }>
: !xegpu.mem_desc<128x128xf16, #xegpu.mem_layout<stride = [1, 128], block = [16, 16]>>
, index, index -> vector<128x128xf16>
```
@@ -1574,7 +1574,7 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
Variadic<Index>: $offsets,
DenseI64ArrayAttr: $const_offsets,
OptionalAttr<UnitAttr>:$subgroup_block_io,
- OptionalAttr<DistributeLayoutAttr>:$anchor_layout
+ OptionalAttr<DistributeLayoutAttr>:$layout
);
let assemblyFormat = [{ $data `,` $mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
prop-dict attr-dict `` `:` type(operands)}];
@@ -1593,14 +1593,14 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
- `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered
to a subgroup block store. When this attribute is present, the offsets are subgroup-uniform
across all lanes. Only used on subgroup and lane level.
- - `anchor_layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as
+ - `layout`: [optional] Describes the expected layout of the `tensor_desc` operand as well as
the value to be stored (they are identical). Only valid at workgroup and subgroup levels.
Example (Workgroup level):
```mlir
%c0 = arith.constant 0 : index
xegpu.store_matrix %1, %0[%c0, %c0] <{
- anchor_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 16]> }>
+ layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 16]> }>
: vector<128x128xf16>, !xegpu.mem_desc<128x128xf16>, index, index
```
}];
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 29daab384bf7f..8cb666298c959 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -1157,8 +1157,7 @@ LogicalResult LoadMatrixOp::verify() {
MemDescType mdescTy = getMemDesc().getType();
return IsValidMatrixOpParams(resTy, mdescTy, subgroup_block_io,
- getAnchorLayoutAttr(),
- [&]() { return emitError(); });
+ getLayoutAttr(), [&]() { return emitError(); });
}
//===----------------------------------------------------------------------===//
@@ -1182,8 +1181,7 @@ LogicalResult StoreMatrixOp::verify() {
UnitAttr subgroup_block_io = getSubgroupBlockIoAttr();
MemDescType mdescTy = getMemDesc().getType();
return IsValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io,
- getAnchorLayoutAttr(),
- [&]() { return emitError(); });
+ getLayoutAttr(), [&]() { return emitError(); });
}
namespace mlir {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 8fb63da8cb0a0..f2b0e71c9397f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -496,7 +496,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
ArrayRef<const LayoutInfoLattice *> results) {
LayoutInfo prefetchLayout;
- xegpu::DistributeLayoutAttr anchorLayout = prefetch.getAnchorLayoutAttr();
+ xegpu::DistributeLayoutAttr anchorLayout = prefetch.getLayoutAttr();
if (hasParamsOfLayoutKind(anchorLayout)) {
prefetchLayout = LayoutInfo(anchorLayout);
} else {
@@ -540,7 +540,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
prefetchLayout = getDefaultSIMTLayoutInfo(
tdescTy, uArch, uArchInstruction->getPackedFormatBitSize());
- prefetch.setAnchorLayoutAttr(
+ prefetch.setLayoutAttr(
dyn_cast<xegpu::DistributeLayoutAttr>(prefetchLayout.get()));
}
// Propagate the layout to the source tensor descriptor.
@@ -647,10 +647,10 @@ void LayoutInfoPropagation::visitDpasOp(
LayoutInfo dpasBLayout;
LayoutInfo dpasCDLayout;
- xegpu::DistributeLayoutAttr anchorLayoutCD = dpas.getAnchorLayoutCdAttr();
+ xegpu::DistributeLayoutAttr anchorLayoutCD = dpas.getLayoutCdAttr();
if (hasParamsOfLayoutKind(anchorLayoutCD)) {
- xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getAnchorLayoutAAttr();
- xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getAnchorLayoutBAttr();
+ xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getLayoutAAttr();
+ xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getLayoutBAttr();
assert(hasParamsOfLayoutKind(anchorLayoutA) &&
"Expected anchor layout for DPAS A operand.");
assert(hasParamsOfLayoutKind(anchorLayoutB) &&
@@ -720,12 +720,12 @@ void LayoutInfoPropagation::visitDpasOp(
dpasCDLayout = getSIMTLayoutInfoForDPASOperand(
cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB());
- dpas.setAnchorLayoutCdAttr(
+ dpas.setLayoutCdAttr(
dyn_cast<xegpu::DistributeLayoutAttr>(dpasCDLayout.get()));
}
- dpas.setAnchorLayoutAAttr(
+ dpas.setLayoutAAttr(
dyn_cast<xegpu::DistributeLayoutAttr>(dpasALayout.get()));
- dpas.setAnchorLayoutBAttr(
+ dpas.setLayoutBAttr(
dyn_cast<xegpu::DistributeLayoutAttr>(dpasBLayout.get()));
}
@@ -742,7 +742,7 @@ void LayoutInfoPropagation::visitStoreNdOp(
ArrayRef<const LayoutInfoLattice *> results) {
LayoutInfo storeLayout;
- xegpu::DistributeLayoutAttr anchorLayout = store.getAnchorLayoutAttr();
+ xegpu::DistributeLayoutAttr anchorLayout = store.getLayoutAttr();
if (hasParamsOfLayoutKind(anchorLayout)) {
storeLayout = LayoutInfo(anchorLayout);
} else {
@@ -782,7 +782,7 @@ void LayoutInfoPropagation::visitStoreNdOp(
storeLayout =
getDefaultSIMTLayoutInfo(store.getValueType(), uArch,
uArchInstruction->getPackedFormatBitSize());
- store.setAnchorLayoutAttr(
+ store.setLayoutAttr(
dyn_cast<xegpu::DistributeLayoutAttr>(storeLayout.get()));
}
// Propagate the layout to the value operand.
@@ -798,7 +798,7 @@ void LayoutInfoPropagation::visitLoadNdOp(
ArrayRef<const LayoutInfoLattice *> results) {
LayoutInfo loadLayout;
- xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr();
+ xegpu::DistributeLayoutAttr anchorLayout = load.getLayoutAttr();
if (hasParamsOfLayoutKind(anchorLayout)) {
loadLayout = LayoutInfo(anchorLayout);
} else {
@@ -816,8 +816,7 @@ void LayoutInfoPropagation::visitLoadNdOp(
"LayoutInfoPropagation stage.");
loadLayout = valueLayout.transpose(transpose.value());
}
- load.setAnchorLayoutAttr(
- dyn_cast<xegpu::DistributeLayoutAttr>(loadLayout.get()));
+ load.setLayoutAttr(dyn_cast<xegpu::DistributeLayoutAttr>(loadLayout.get()));
}
// Propagate the new layout to the tensor descriptor operand.
propagateIfChanged(operands[0], operands[0]->meet(loadLayout));
@@ -913,7 +912,7 @@ void LayoutInfoPropagation::visitLoadGatherOp(
LayoutInfo loadLayout;
LayoutInfo maskLayout;
- xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr();
+ xegpu::DistributeLayoutAttr anchorLayout = load.getLayoutAttr();
if (hasParamsOfLayoutKind(anchorLayout)) {
loadLayout = LayoutInfo(anchorLayout);
maskLayout = loadLayout;
@@ -947,8 +946,7 @@ void LayoutInfoPropagation::visitLoadGatherOp(
// Mask operand should have 1D default layout.
maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize);
- load.setAnchorLayoutAttr(
- dyn_cast<xegpu::DistributeLayoutAttr>(loadLayout.get()));
+ load.setLayoutAttr(dyn_cast<xegpu::DistributeLayoutAttr>(loadLayout.get()));
}
// Propagate the new layout to the tensor descriptor operand.
if (isa<xegpu::TensorDescType>(load.getSourceType()))
@@ -983,7 +981,7 @@ void LayoutInfoPropagation::visitStoreScatterOp(
LayoutInfo payloadLayout;
LayoutInfo maskLayout;
- xegpu::DistributeLayoutAttr anchorLayout = storeScatter.getAnchorLayoutAttr();
+ xegpu::DistributeLayoutAttr anchorLayout = storeScatter.getLayoutAttr();
if (hasParamsOfLayoutKind(anchorLayout)) {
payloadLayout = LayoutInfo(anchorLayout);
maskLayout = payloadLayout;
@@ -1027,7 +1025,7 @@ void LayoutInfoPropagation::visitStoreScatterOp(
maskLayout =
getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize);
- storeScatter.setAnchorLayoutAttr(
+ storeScatter.setLayoutAttr(
dyn_cast<xegpu::DistributeLayoutAttr>(payloadLayout.get()));
}
// Propagate the payload operand layout
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index ac65babfcb4cb..4455811a2e681 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -965,7 +965,7 @@ struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern {
SmallVector<Value> offsetsAsValues =
vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
- auto layout = matrixOp.getAnchorLayoutAttr();
+ auto layout = matrixOp.getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
matrixOp, "the matrix operation lacks layout attribute");
@@ -1041,7 +1041,7 @@ struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
SmallVector<Value> offsetsAsValues =
vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
- auto layout = matrixOp.getAnchorLayoutAttr();
+ auto layout = matrixOp.getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
matrixOp, "the matrix operation lacks layout attribute");
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index c644f784606e9..330553564f81a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -678,7 +678,7 @@ struct UnrollLoadGatherOpWithOffset
pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter);
}
- auto layout = op.getAnchorLayoutAttr();
+ auto layout = op.getLayoutAttr();
if (layout)
layout = layout.dropInstData();
@@ -778,7 +778,7 @@ struct UnrollStoreScatterOpWithOffsets
SmallVector<Value> convertedValues =
pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter);
- auto layout = op.getAnchorLayoutAttr();
+ auto layout = op.getLayoutAttr();
if (layout)
layout = layout.dropInstData();
@@ -954,7 +954,7 @@ struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
Type elemTy = valueTy.getElementType();
ArrayRef<int64_t> shape = valueTy.getShape();
- auto layout = dyn_cast<xegpu::LayoutAttr>(op.getAnchorLayoutAttr());
+ auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy);
@@ -993,7 +993,7 @@ struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
VectorType valueTy = llvm::dyn_cast<VectorType>(op.getData().getType());
assert(valueTy && "the value type must be vector type!");
ArrayRef<int64_t> shape = valueTy.getShape();
- auto layout = dyn_cast<xegpu::LayoutAttr>(op.getAnchorLayoutAttr());
+ auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
SmallVector<Type> convertedValTypes =
getUnrolledTypes(valueTy, *targetShape);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 2562c46adfa8d..73876ce3b1639 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -86,13 +86,13 @@ genOffsetsList(ConversionPatternRewriter &rewriter, OpType op,
if (origOffsets.empty())
return failure();
- // if op is xegpu::CreateNdDescOp, call op.getLayoutAttr()
+ // if op is xegpu::CreateNdDescOp, call op.getDescLayoutAttr()
xegpu::DistributeLayoutAttr layout;
if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp> ||
std::is_same_v<OpType, xegpu::StoreMatrixOp>) {
- layout = op.getAnchorLayoutAttr();
- } else {
layout = op.getLayoutAttr();
+ } else {
+ layout = op.getDescLayoutAttr();
}
// not applicable to ops without workgroup layout attributes
@@ -1007,7 +1007,7 @@ struct WgToSgLoadMatrixOp : public OpConversionPattern<xegpu::LoadMatrixOp> {
assert(valueTy && "the value type must be vector type!");
Type elemTy = valueTy.getElementType();
- xegpu::DistributeLayoutAttr layout = op.getAnchorLayoutAttr();
+ xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
VectorType newResTy = VectorType::get(sgShape, elemTy);
SmallVector<Value> newOps;
@@ -1033,7 +1033,7 @@ struct WgToSgStoreMatrixOp : public OpConversionPattern<xegpu::StoreMatrixOp> {
if (failed(genOffsetsList(rewriter, op, offsetsList)))
return failure();
- xegpu::DistributeLayoutAttr layout = op.getAnchorLayoutAttr();
+ xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
for (auto [v, offsets] : llvm::zip(adaptor.getData(), offsetsList))
xegpu::StoreMatrixOp::create(rewriter, op.getLoc(), v, op.getMemDesc(),
offsets, layout.dropSgLayoutAndData());
@@ -1417,12 +1417,12 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
target.addDynamicallyLegalOp<xegpu::LoadMatrixOp>(
[=](xegpu::LoadMatrixOp op) -> bool {
- return isLegal(op.getAnchorLayoutAttr());
+ return isLegal(op.getLayoutAttr());
});
target.addDynamicallyLegalOp<xegpu::StoreMatrixOp>(
[=](xegpu::StoreMatrixOp op) -> bool {
- return isLegal(op.getAnchorLayoutAttr());
+ return isLegal(op.getLayoutAttr());
});
target.addDynamicallyLegalOp<arith::ConstantOp>(
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 572e5442760bc..91432b1c11304 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -135,11 +135,11 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
// for LoadMatrixOp, the layout is attached to the property of the op
if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
- return loadOp.getAnchorLayoutAttr();
+ return loadOp.getLayoutAttr();
// for StoreMatrixOp, the layout is attached to the property of the op
if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
- return storeOp.getAnchorLayoutAttr();
+ return storeOp.getLayoutAttr();
std::string layoutName = getLayoutName(result);
if (defOp->hasAttr(layoutName))
return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
@@ -147,7 +147,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
// check for "permanent" layout only after "temporary" layout name lookup
// for backward compatibility
if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp))
- return loadGatherOp.getAnchorLayoutAttr();
+ return loadGatherOp.getLayoutAttr();
}
if (auto arg = dyn_cast<BlockArgument>(value)) {
@@ -167,10 +167,10 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
- return loadOp.getAnchorLayoutAttr();
+ return loadOp.getLayoutAttr();
if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
- return storeOp.getAnchorLayoutAttr();
+ return storeOp.getLayoutAttr();
std::string layoutName = xegpu::getLayoutName(opr);
if (op->hasAttr(layoutName))
@@ -178,7 +178,7 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
// check for "permanent" layout only after "temporary" layout name lookup
if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
- if (auto layout = storeScatterOp.getAnchorLayoutAttr())
+ if (auto layout = storeScatterOp.getLayoutAttr())
return layout;
return getDistributeLayoutAttr(opr.get());
@@ -193,7 +193,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
xegpu::DistributeLayoutAttr candidate = layout;
if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) {
- if (auto perm = loadOp.getAnchorLayoutAttr())
+ if (auto perm = loadOp.getLayoutAttr())
candidate = perm;
}
@@ -211,7 +211,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) {
if (idx == 0) {
- if (auto perm = storeOp.getAnchorLayoutAttr())
+ if (auto perm = storeOp.getLayoutAttr())
candidate = perm;
}
}
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 62ac880030cda..92f353717ac59 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -894,7 +894,7 @@ func.func @store_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>, %arg1: ve
// -----
func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>>, %arg1: vector<2x16xf32>) {
// expected-error at +1 {{With subgroup_block_io, accessed data must be contiguous and coalesced}}
- xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+ xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
vector<2x16xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>>
return
}
@@ -902,7 +902,7 @@ func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32,
// -----
func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>>, %arg1: vector<16x2xf32>) {
// expected-error at +1 {{With subgroup_block_io, the distributed dimensions must be contiguous}}
- xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} :
+ xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} :
vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>>
return
}
@@ -910,7 +910,7 @@ func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf3
// -----
func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>>, %arg1: vector<16x2xf32>) {
// expected-error at +1 {{With subgroup_block_io, the block shape must match the lane layout}}
- xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+ xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>>
return
}
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 62a33a4797d2b..1d86a2a4939e5 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -5,14 +5,14 @@
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>
// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>>
-// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{anchor_layout = #xegpu.layout<inst_data = [16, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [16, 16]>} :
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout<inst_data = [16, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [16, 16]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf16>
-// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {anchor_layout_a = #xegpu.layout<inst_data = [8, 16]>, anchor_layout_b = #xegpu.layout<inst_data = [16, 16]>, anchor_layout_cd = #xegpu.layout<inst_data = [8, 16]>, layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout<inst_data = [8, 16]>, layout_b = #xegpu.layout<inst_data = [16, 16]>, layout_cd = #xegpu.layout<inst_data = [8, 16]>, layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>
-// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>>
gpu.module @test {
func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
@@ -46,7 +46,7 @@ gpu.module @test_kernel {
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}} <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+ //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
//CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<16x32xf16>
%a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
@@ -85,7 +85,7 @@ gpu.module @test_kernel {
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}} <{anchor_layout = #xegpu.layout<inst_data = [4, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} :
+ //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout<inst_data = [4, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} :
//CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>> -> vector<12x32xf16>
%a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
@@ -113,9 +113,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{anchor_layout = #xegpu.layout<inst_data = [16, 8]>, chunk_size = 8 : i64}>
+// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64, layout = #xegpu.layout<inst_data = [16, 8]>}>
// CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{anchor_layout = #xegpu.layout<inst_data = [16, 8]>, chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64, layout = #xegpu.layout<inst_data = [16, 8]>}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index d1bee47dd6d37..f8b59b87a122b 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -6,14 +6,14 @@ gpu.module @test {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, anchor_layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -32,7 +32,7 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me
gpu.module @test {
// CHECK-LABEL: func.func @dpas_i8(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) {
-// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
%c0 = arith.constant 0 : index
@@ -47,7 +47,7 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre
gpu.module @test {
// CHECK-LABEL: func.func @load_with_transpose_effect(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
@@ -109,7 +109,7 @@ gpu.module @test {
// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> ->
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>, vector<16xi1> -> vector<16x16xf16>
func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
@@ -136,7 +136,7 @@ gpu.module @test {
// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> ->
// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1> -> vector<16xf32>
func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
@@ -184,9 +184,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>, chunk_size = 8 : i64}>
+// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>, chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
@@ -205,7 +205,7 @@ gpu.module @test {
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
-// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
@@ -218,13 +218,13 @@ func.func @scatter_ops(%src: memref<256xf16>) {
gpu.module @test {
// CHECK-LABEL: func.func @scatter_ops_custom_perm_layout(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
-// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
-// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} dense<true> : vector<16xi1>
+// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
-// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : vector<16xf16>
+// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16>
// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_custom_perm_layout(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
@@ -238,13 +238,13 @@ func.func @scatter_ops_custom_perm_layout(%src: memref<256xf16>) {
gpu.module @test {
// CHECK-LABEL: func.func @scatter_ops_preserve_load_perm_layout(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
-// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
-// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} dense<true> : vector<16xi1>
+// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
-// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : vector<16xf16>
+// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16>
// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
@@ -257,9 +257,9 @@ func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) {
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_f16(
-// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16>
-// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<16x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xi16>
// CHECK: %{{.*}} = vector.bitcast %[[LOAD0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: vector<8x16xi16> to vector<8x16xf16>
@@ -282,7 +282,7 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i32_to_f16(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: vector<16x8xi32> to vector<16x16xf16>
@@ -303,7 +303,7 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_i32(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
// CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>> -> vector<8x32xi16>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: vector<8x32xi16> to vector<8x16xi32>
@@ -340,9 +340,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16>
func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
@@ -363,9 +363,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -386,11 +386,11 @@ gpu.module @test {
// CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) ->
// CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>) {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, anchor_layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
@@ -398,7 +398,7 @@ gpu.module @test {
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>
// CHECK-NEXT: } {layout_result_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
@@ -426,11 +426,11 @@ gpu.module @test {
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16>
// CHECK-NEXT: } else {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16>
// CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
@@ -456,11 +456,11 @@ gpu.module @test {
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16>
// CHECK-NEXT: } else {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16>
// CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
@@ -540,7 +540,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @prefetch_2d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @prefetch_2d(%arg0: memref<256x256xf16>){
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
@@ -553,7 +553,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @prefetch_1d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
func.func @prefetch_1d(%arg0: memref<256xf16>){
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
@@ -600,7 +600,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
@@ -622,7 +622,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1]
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index a7ce2c05b9d44..8fd3cca5594cb 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -281,8 +281,8 @@ gpu.module @xevm_module{
gpu.module @xevm_module{
gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) {
%c0 = arith.constant 0 : index
- %1 = xegpu.load_matrix %arg0[%c0, %c0] <{anchor_layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32>
- xegpu.store_matrix %1, %arg0[%c0, %c0] <{anchor_layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+ %1 = xegpu.load_matrix %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32>
+ xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index
gpu.return
}
}
@@ -307,8 +307,8 @@ gpu.module @xevm_module{
gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
- %1 = xegpu.load_matrix %arg0[%c0, %c1] <{anchor_layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32>
- xegpu.store_matrix %1, %arg0[%c0, %c1] <{anchor_layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+ %1 = xegpu.load_matrix %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32>
+ xegpu.store_matrix %1, %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index
gpu.return
}
}
@@ -323,9 +323,9 @@ gpu.module @xevm_module{
gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
- %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
+ %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
!xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index -> vector<16x2xf32>
- xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
+ xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 456d8e8a03cfc..d61908b422194 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -569,7 +569,7 @@ gpu.module @test_kernel {
%0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32>
//CHECK-COUNT-8: xegpu.load_matrix {{.*}} : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x16xf32>
//CHECK-COUNT-8: vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32>
- %1 = xegpu.load_matrix %0[0, 0] <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32>
+ %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32>
gpu.return %1: vector<32x32xf32>
}
}
@@ -580,7 +580,7 @@ gpu.module @test_kernel {
gpu.func @unroll_store_matrix(%value: vector<32x32xf32>, %arg0 : memref<32768xi8, 3>) {
%mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
// CHECK-COUNT-8: xegpu.store_matrix {{.*}} : vector<8x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index
- xegpu.store_matrix %value, %mdesc[0, 0] {anchor_layout = #xegpu.layout<inst_data = [8, 16]>} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>
+ xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout<inst_data = [8, 16]>} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>
gpu.return
}
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 171cadeeaeaf9..5dde84e8e0bc2 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -286,7 +286,7 @@ gpu.module @test_distribution {
// CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16>
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex>
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1>
- // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{anchor_layout = #xegpu.layout<inst_data = [8]>, chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}>
+ // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}>
// CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>,
// CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>}
// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
@@ -333,9 +333,9 @@ gpu.module @test_distribution {
//CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]]
//CHECK: [[c128:%.+]] = arith.constant 128 : index
//CHECK: [[off_x:%.+]] = index.remu [[l_off_x]], [[c128]]
- //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{anchor_layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32>
+ //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32>
%0 = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
- %1 = xegpu.load_matrix %0[0, 0] <{anchor_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32], lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32>
+ %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32], lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32>
gpu.return
}
@@ -361,7 +361,7 @@ gpu.module @test_distribution {
//CHECK: xegpu.store_matrix [[cst]], [[mdesc]][[[off_y]], [[off_x]]] : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>, index, index
%cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} dense<1.0> : vector<64x128xf32>
%mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
- xegpu.store_matrix %cst, %mdesc[0, 0] {anchor_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32>
+ xegpu.store_matrix %cst, %mdesc[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32>
gpu.return
}
@@ -554,7 +554,7 @@ gpu.module @test_distribution {
%offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<0> : vector<256xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<1> : vector<256xi1>
- // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{anchor_layout = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>, chunk_size = 1 : i64}>
+ // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>}>
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>} :
// CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32>
%3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32>
>From 60f53969439d312413bea128b3fadcd7560a7285 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 26 Nov 2025 02:21:08 +0000
Subject: [PATCH 06/23] fix test
---
.../XeGPU/propagate-layout-inst-data.mlir | 24 +++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 1d86a2a4939e5..d911baa49acbb 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -1,5 +1,29 @@
// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=inst" -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: func.func @load_store_no_array_len(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x32xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<8x32xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+// CHECK: %[[TDESC_SRC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: %[[TDESC_DST:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 <{layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x32xf32>
+// CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+gpu.module @test {
+// Although the uArch allows 8x32 inst data using block count (or array_len),
+// it is up to optimization passes to decide on the block count usage.
+func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf32>) {
+ %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32>
+ %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xf32> -> vector<8x32xf32>
+ xegpu.store_nd %2, %1 : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32>
+ return
+}
+}
+
+// -----
+
// CHECK-LABEL: func.func @dpas_f16(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} dense<0.000000e+00> : vector<8x16xf32>
>From 72fa240c0faa8b01e9e5f8f0554f6e6712c423d2 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 26 Nov 2025 18:21:45 +0000
Subject: [PATCH 07/23] fix clang-format
---
clang-tools-extra/clang-tidy/.clang-format | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/clang-tools-extra/clang-tidy/.clang-format b/clang-tools-extra/clang-tidy/.clang-format
index e97ba0573dd1e..b32e785264e6a 100644
--- a/clang-tools-extra/clang-tidy/.clang-format
+++ b/clang-tools-extra/clang-tidy/.clang-format
@@ -1,4 +1,8 @@
BasedOnStyle: LLVM
-QualifierAlignment: Left
-LineEnding: LF
InsertNewlineAtEOF: true
+KeepEmptyLines:
+ AtEndOfFile: false
+ AtStartOfBlock: false
+ AtStartOfFile: false
+LineEnding: LF
+QualifierAlignment: Left
\ No newline at end of file
>From 5f25c89f7bc359bd9c9fe848ce2bd0e521c49af7 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 26 Nov 2025 18:24:46 +0000
Subject: [PATCH 08/23] fix missing space in .clang-format
---
clang-tools-extra/clang-tidy/.clang-format | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang-tools-extra/clang-tidy/.clang-format b/clang-tools-extra/clang-tidy/.clang-format
index b32e785264e6a..fe94ed1fa4e81 100644
--- a/clang-tools-extra/clang-tidy/.clang-format
+++ b/clang-tools-extra/clang-tidy/.clang-format
@@ -5,4 +5,4 @@ KeepEmptyLines:
AtStartOfBlock: false
AtStartOfFile: false
LineEnding: LF
-QualifierAlignment: Left
\ No newline at end of file
+QualifierAlignment: Left
>From ae01e29d685852f29384ae72ac9a7744689c9cfc Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 26 Nov 2025 23:47:32 +0000
Subject: [PATCH 09/23] simplify: load/store/prefetch/loadmatrix/storematrix
use anchor layout for blocking
---
.../XeGPU/Transforms/XeGPUBlocking.cpp | 89 +++++++++++++------
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 74 +++++++--------
2 files changed, 95 insertions(+), 68 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index ec5feb8bc8c4a..36fc653cbabff 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -175,33 +175,68 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
+
+ auto getShapeSkipLeadingUnitDim = [](xegpu::DistributeLayoutAttr layout)
+ -> std::optional<SmallVector<int64_t>> {
+ SmallVector<int64_t> instData = layout.getEffectiveInstDataAsInt();
+ if (!instData.empty()) {
+ // Remove leading unit dimensions from inst_data
+ // For example, if the inst_data is [1, 1, 32]
+ // it will pass [32] as the unroll/blocking size.
+ auto it = llvm::find_if(instData, [](auto val) { return val != 1; });
+ instData.erase(instData.begin(), it);
+ return instData;
+ }
+ return std::nullopt;
+ };
+
if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
- xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
+ xegpu::UpdateOffsetOp>(op))
return getTileShape(op->getOpResult(0));
- if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
- xegpu::StoreMatrixOp>(op))
+
+ xegpu::DistributeLayoutAttr layout = nullptr;
+
+ if (isa<xegpu::LoadMatrixOp>(op))
+ layout = dyn_cast<xegpu::LoadMatrixOp>(op).getLayoutAttr();
+ if (isa<xegpu::StoreMatrixOp>(op))
+ layout = dyn_cast<xegpu::StoreMatrixOp>(op).getLayoutAttr();
+
+ if (layout != nullptr) {
+ assert(layout.isForSubgroup() &&
+ "Matrix load/store should have subgroup level layout");
+ return layout.getEffectiveInstDataAsInt();
+ }
+
+ if (isa<xegpu::LoadGatherOp>(op))
+ layout = dyn_cast<xegpu::LoadGatherOp>(op).getLayoutAttr();
+ if (isa<xegpu::StoreScatterOp>(op))
+ layout = dyn_cast<xegpu::StoreScatterOp>(op).getLayoutAttr();
+ if (isa<xegpu::PrefetchOp>(op))
+ layout = dyn_cast<xegpu::PrefetchOp>(op).getLayoutAttr();
+ if (layout != nullptr) {
+ assert(
+ layout.isForSubgroup() &&
+ "LoadGather/StoreScatter/Prefetch should have subgroup level layout");
+ return getShapeSkipLeadingUnitDim(layout);
+ }
+
+ if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp>(op))
return getTileShape(op->getOpOperand(0));
if (isa<xegpu::StoreNdOp>(op))
return getTileShape(op->getOpOperand(1));
- // Handle LoadGatherOp and StoreScatterOp (with and without offset)
- if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(op)) {
- if (loadGatherOp.getOffsets())
- return getTileShape(loadGatherOp->getOpResult(0));
- else
- return getTileShape(loadGatherOp->getOpOperand(0));
- }
+ if (isa<xegpu::DpasOp>(op)) {
- if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
- return getTileShape(storeScatterOp.getOffsets()
- ? storeScatterOp->getOpOperand(0)
- : storeScatterOp->getOpOperand(1));
+ auto layoutA = dyn_cast<xegpu::DpasOp>(op).getLayoutAAttr();
+ auto layoutB = dyn_cast<xegpu::DpasOp>(op).getLayoutBAttr();
+ auto layoutCD = dyn_cast<xegpu::DpasOp>(op).getLayoutCdAttr();
- if (isa<xegpu::DpasOp>(op)) {
std::optional<SmallVector<int64_t>> aTile =
- getTileShape(op->getOpOperand(0));
+ layoutA.getEffectiveInstDataAsInt();
std::optional<SmallVector<int64_t>> bTile =
- getTileShape(op->getOpOperand(1));
+ layoutB.getEffectiveInstDataAsInt();
+ std::optional<SmallVector<int64_t>> cdTile =
+ layoutCD.getEffectiveInstDataAsInt();
if (!aTile || aTile->size() != 2 || !bTile || bTile->size() != 2)
return std::nullopt;
@@ -210,14 +245,9 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
if ((*aTile)[1] != (*bTile)[0])
return std::nullopt;
- // semantic check for C
- if (op->getNumOperands() == 3) {
- std::optional<SmallVector<int64_t>> cTile =
- getTileShape(op->getOpOperand(2));
- int64_t expectedCTile[2] = {(*aTile)[0], (*bTile)[1]};
- if (!cTile || !llvm::equal(*cTile, expectedCTile))
- return std::nullopt;
- }
+ int64_t expectedCDTile[2] = {(*aTile)[0], (*bTile)[1]};
+ if (!cdTile || !llvm::equal(*cdTile, expectedCDTile))
+ return std::nullopt;
return SmallVector<int64_t>({(*aTile)[0], (*aTile)[1], (*bTile)[1]});
}
@@ -383,9 +413,12 @@ void XeGPUBlockingPass::runOnOperation() {
}
}
- newTy =
- xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
- tdescTy.getLayoutAttr().dropInstData());
+ xegpu::LayoutAttr newLayout = nullptr;
+ if (tdescTy.getLayoutAttr())
+ newLayout = tdescTy.getLayoutAttr().dropInstData();
+
+ newTy = xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
+ newLayout);
} else {
newTy = VectorType::get(tileShape, elemTy);
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index d61908b422194..c2ca63f6b289d 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -26,8 +26,8 @@ gpu.module @test_kernel {
%a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
- //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ //CHECK-COUNT-8: xegpu.dpas {{.*}}
+ %c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
@@ -67,8 +67,8 @@ gpu.module @test_kernel {
%a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16>
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
- //CHECK-COUNT-8: xegpu.dpas {{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ //CHECK-COUNT-8: xegpu.dpas {{.*}}
+ %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l1>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16>
@@ -111,7 +111,7 @@ gpu.module @test_kernel {
%a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16>
//CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
- %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32>
+ %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32>
//CHECK: xegpu.update_nd_offset {{.*}} [%c0, %c32] : !xegpu.tensor_desc<8x16xf16>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<8x16xf16, #l1>
//CHECK-COUNT-2: xegpu.update_nd_offset {{.*}} [%c32, %c0] : !xegpu.tensor_desc<16x16xf16>
@@ -154,8 +154,8 @@ gpu.module @test_kernel {
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
//CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16>
%e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16>
- //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %c = xegpu.dpas %e, %b, %arg2 {layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ //CHECK-COUNT-8: xegpu.dpas {{.*}}
+ %c = xegpu.dpas %e, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
@@ -353,14 +353,15 @@ gpu.module @test_kernel {
}
// -----
+
gpu.module @test_kernel {
// CHECK-LABEL: test_prefetch_load_store_update
// CHECK-SAME: [[arg0:%.+]]: ui64
// CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ // CHECK-COUNT-2: xegpu.prefetch {{.*}}
// CHECK-COUNT-2: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex>
- // CHECK-COUNT-2: xegpu.load {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
- // CHECK-COUNT-2: xegpu.store {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
+ // CHECK-COUNT-2: xegpu.load {{.*}}
+ // CHECK-COUNT-2: xegpu.store {{.*}}
gpu.func @test_prefetch_load_store_update(%src: ui64) {
@@ -372,7 +373,7 @@ gpu.module @test_kernel {
]> : vector<32xindex>
%tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+ xegpu.prefetch %tdesc {layout = #xegpu.layout<inst_data = [16]>}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
%delta = arith.constant dense<[
32, 32, 32, 32, 32, 32, 32, 32,
@@ -386,10 +387,10 @@ gpu.module @test_kernel {
%c17 = arith.constant 17: index
%mask = vector.create_mask %c17: vector<32xi1>
- %ld_vec = xegpu.load %new_tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
+ %ld_vec = xegpu.load %new_tdesc, %mask {layout = #xegpu.layout<inst_data = [16]>}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
%st_vec = arith.addf %ld_vec, %ld_vec : vector<32xf32>
- xegpu.store %st_vec, %tdesc, %mask:
+ xegpu.store %st_vec, %tdesc, %mask {layout = #xegpu.layout<inst_data = [16]>}:
vector<32xf32>,
!xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>,
vector<32xi1>
@@ -400,15 +401,14 @@ gpu.module @test_kernel {
}
// -----
-
gpu.module @test_kernel {
// CHECK-LABEL: test_prefetch_load_store_update_chunk
// CHECK-SAME: [[arg0:%.+]]: ui64
// CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ // CHECK-COUNT-4: xegpu.prefetch {{.*}} <{layout = #xegpu.layout<inst_data = [16, 2]>}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
// CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xindex>
- // CHECK-COUNT-4: xegpu.load {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<16x2xf32>
- // CHECK-COUNT-4: xegpu.store {{.*}} : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>
+ // CHECK-COUNT-4: xegpu.load {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<16x2xf32>
+ // CHECK-COUNT-4: xegpu.store {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>
gpu.func @test_prefetch_load_store_update_chunk(%src: ui64) {
@@ -420,7 +420,7 @@ gpu.module @test_kernel {
]> : vector<32xindex>
%tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- xegpu.prefetch %tdesc: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+ xegpu.prefetch %tdesc {layout = #xegpu.layout<inst_data = [16, 2]>}: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
%delta = arith.constant dense<[
32, 32, 32, 32, 32, 32, 32, 32,
@@ -434,10 +434,10 @@ gpu.module @test_kernel {
%c17 = arith.constant 17: index
%mask = vector.create_mask %c17: vector<32xi1>
- %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xi1> -> vector<32x4xf32>
+ %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [16, 2]>}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xi1> -> vector<32x4xf32>
%st_vec = arith.addf %ld_vec, %ld_vec : vector<32x4xf32>
- xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>:
+ xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [16, 2]>}>:
vector<32x4xf32>,
!xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>,
vector<32xi1>
@@ -476,7 +476,7 @@ gpu.module @test_kernel {
]> : vector<4x8xindex>
%tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<4x8xindex> -> !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>
- xegpu.prefetch %tdesc: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>
+ xegpu.prefetch %tdesc {layout = #l}: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>
%delta = arith.constant dense<[
[32, 32, 32, 32, 32, 32, 32, 32],
@@ -490,10 +490,10 @@ gpu.module @test_kernel {
%c4 = arith.constant 4: index
%mask = vector.create_mask %c4, %c4: vector<4x8xi1>
- %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>, vector<4x8xi1> -> vector<4x8x4xf32>
+ %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #l}>: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>, vector<4x8xi1> -> vector<4x8x4xf32>
%st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #l} : vector<4x8x4xf32>
- xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>:
+ xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #l}>:
vector<4x8x4xf32>,
!xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>,
vector<4x8xi1>
@@ -518,8 +518,8 @@ gpu.module @test_kernel {
//CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>}> : vector<16x16xf16>
//CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
//CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
- //CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- //CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+ //CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]]
+ //CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]]
//CHECK: [[c_tdesc_0:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c0]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
//CHECK: [[c_tdesc_1:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c8]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
//CHECK: xegpu.store_nd [[dpas0]], [[c_tdesc_0]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
@@ -532,7 +532,7 @@ gpu.module @test_kernel {
%a = xegpu.load_nd %a_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
%b = xegpu.load_nd %b_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
%e = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16>
- %c = xegpu.dpas %e, %b {layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ %c = xegpu.dpas %e, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
%c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c>
xegpu.store_nd %c, %c_tdesc: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c>
gpu.return
@@ -599,7 +599,7 @@ gpu.module @test_kernel {
%c17 = arith.constant 17: index
%mask = vector.create_mask %c17: vector<32xi1>
- %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
+ %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
gpu.return %ld : vector<32xf32>
}
@@ -621,10 +621,7 @@ gpu.module @test_kernel {
%mask = vector.create_mask %c17: vector<32xi1>
%st_vec = arith.constant dense<1023.0>: vector<32xf32>
- xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout<inst_data = [16]>,
- layout_operand_2 = #xegpu.layout<inst_data = [16]>,
- layout_operand_3 = #xegpu.layout<inst_data = [16]>,
- l1_hint = #xegpu.cache_hint<cached>} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>
+ xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>
gpu.return
}
@@ -649,7 +646,7 @@ gpu.module @test_kernel {
%c17 = arith.constant 17: index
%mask = vector.create_mask %c17: vector<32xi1>
- %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
+ %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
gpu.return %ld : vector<32x4xf32>
}
}
@@ -675,10 +672,7 @@ gpu.module @test_kernel {
%mask = vector.create_mask %c17: vector<32xi1>
%st_vec = arith.constant dense<1023.>: vector<32x4xf32>
- xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout_operand_0 = #xegpu.layout<inst_data = [16, 2]>,
- layout_operand_2 = #xegpu.layout<inst_data = [16, 2]>,
- layout_operand_3 = #xegpu.layout<inst_data = [16, 2]>,
- l1_hint = #xegpu.cache_hint<cached>} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1>
+ xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1>
gpu.return
}
}
@@ -704,7 +698,7 @@ gpu.module @test_kernel {
]]> : vector<1x1x32xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<true> : vector<1x1x32xi1>
- %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
+ %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [1, 1, 16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
gpu.return %ld : vector<1x1x32xf32>
}
@@ -778,10 +772,10 @@ gpu.module @test_kernel {
128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]]
]> : vector<1x1x32xindex>
%mask = arith.constant {layout_result_0 = #inst_data} dense<true> : vector<1x1x32xi1>
- %a = xegpu.load %A[%cst], %mask {chunk_size = 1, layout_result_0 = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
- %b = xegpu.load %B[%cst], %mask {chunk_size = 1, layout_result_0 = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
+ %a = xegpu.load %A[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
+ %b = xegpu.load %B[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
%addf = arith.addf %a, %b {layout_result_0 = #inst_data} : vector<1x1x32xf32>
- xegpu.store %addf, %C[%cst], %mask {chunk_size = 1, layout_operand_0 = #inst_data, layout_operand_2 = #inst_data, layout_operand_3 = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : vector<1x1x32xf32>, ui64, vector<1x1x32xindex>, vector<1x1x32xi1>
+ xegpu.store %addf, %C[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : vector<1x1x32xf32>, ui64, vector<1x1x32xindex>, vector<1x1x32xi1>
gpu.return
}
}
>From 4795f24ef9533984af8d4c797647cfc9fcaf6d46 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 27 Nov 2025 00:20:25 +0000
Subject: [PATCH 10/23] simplify: load_nd/store_nd/prefetch_nd use anchor
layout for blocking
---
.../XeGPU/Transforms/XeGPUBlocking.cpp | 11 +--
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 78 +++++++++----------
2 files changed, 45 insertions(+), 44 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 36fc653cbabff..ce53e8fe970b1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -200,6 +200,12 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
layout = dyn_cast<xegpu::LoadMatrixOp>(op).getLayoutAttr();
if (isa<xegpu::StoreMatrixOp>(op))
layout = dyn_cast<xegpu::StoreMatrixOp>(op).getLayoutAttr();
+ if (isa<xegpu::LoadNdOp>(op))
+ layout = dyn_cast<xegpu::LoadNdOp>(op).getLayoutAttr();
+ if (isa<xegpu::StoreNdOp>(op))
+ layout = dyn_cast<xegpu::StoreNdOp>(op).getLayoutAttr();
+ if (isa<xegpu::PrefetchNdOp>(op))
+ layout = dyn_cast<xegpu::PrefetchNdOp>(op).getLayoutAttr();
if (layout != nullptr) {
assert(layout.isForSubgroup() &&
@@ -220,11 +226,6 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
return getShapeSkipLeadingUnitDim(layout);
}
- if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp>(op))
- return getTileShape(op->getOpOperand(0));
- if (isa<xegpu::StoreNdOp>(op))
- return getTileShape(op->getOpOperand(1));
-
if (isa<xegpu::DpasOp>(op)) {
auto layoutA = dyn_cast<xegpu::DpasOp>(op).getLayoutAAttr();
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index c2ca63f6b289d..b3032bac351ac 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -15,7 +15,7 @@ gpu.module @test_kernel {
%n = arith.muli %block_id_y, %c32 : index
%c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
- %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
+ %c_init = xegpu.load_nd %c_tdesc {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
%a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
%b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
@@ -23,9 +23,9 @@ gpu.module @test_kernel {
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
-> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0 {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+ %b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
//CHECK-COUNT-8: xegpu.dpas {{.*}}
%c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
@@ -36,7 +36,7 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
- xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
+ xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
gpu.return
}
}
@@ -56,7 +56,7 @@ gpu.module @test_kernel {
%n = arith.muli %block_id_y, %c32 : index
%c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #l1>
- %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32>
+ %c_init = xegpu.load_nd %c_tdesc {layout = #l1}: !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32>
%a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l1>
%b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #l2>
@@ -64,9 +64,9 @@ gpu.module @test_kernel {
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
-> (!xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>) {
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0 {layout = #l1}: !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16>
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
+ %b = xegpu.load_nd %arg1 {layout = #l2}: !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
//CHECK-COUNT-8: xegpu.dpas {{.*}}
%c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16>
@@ -77,7 +77,7 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>
}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
+ xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
gpu.return
}
}
@@ -100,7 +100,7 @@ gpu.module @test_kernel {
%c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x32xf32, #l1>
//CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
- %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<8x32xf32, #l1> -> vector<8x32xf32>
+ %c_init = xegpu.load_nd %c_tdesc {layout = #l1}: !xegpu.tensor_desc<8x32xf32, #l1> -> vector<8x32xf32>
%a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16, #l1>
%b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l2>
@@ -108,9 +108,9 @@ gpu.module @test_kernel {
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
-> (!xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>) {
//CHECK: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16>
+ %a = xegpu.load_nd %arg0 {layout = #l1}: !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16>
//CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
+ %b = xegpu.load_nd %arg1 {layout = #l2}: !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
%c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32>
//CHECK: xegpu.update_nd_offset {{.*}} [%c0, %c32] : !xegpu.tensor_desc<8x16xf16>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<8x16xf16, #l1>
@@ -120,7 +120,7 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>
}
//CHECK-COUNT-2: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %out#2, %c_tdesc: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1>
+ xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1>
gpu.return
}
}
@@ -141,7 +141,7 @@ gpu.module @test_kernel {
%n = arith.muli %block_id_y, %c32 : index
%c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
- %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
+ %c_init = xegpu.load_nd %c_tdesc {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
%a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
%b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
@@ -149,9 +149,9 @@ gpu.module @test_kernel {
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
-> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0 {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+ %b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
//CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16>
%e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16>
//CHECK-COUNT-8: xegpu.dpas {{.*}}
@@ -164,7 +164,7 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
- xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
+ xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
gpu.return
}
}
@@ -188,14 +188,14 @@ gpu.module @test_kernel {
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>) {
//CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0 {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
+ %b = xegpu.load_nd %arg1 {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
//CHECK-COUNT-4: arith.addf {{.*}} : vector<8x16xf16>
%c = arith.addf %a, %b {layout_result_0 = #l} : vector<16x32xf16>
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
- xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l>
+ xegpu.store_nd %c, %arg2 {layout = #l}: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l>
//CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l>
@@ -227,14 +227,14 @@ gpu.module @test_kernel {
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>) {
//CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8xf16> -> vector<8xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
+ %a = xegpu.load_nd %arg0 {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
+ %b = xegpu.load_nd %arg1 {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
//CHECK-COUNT-4: arith.addf {{.*}} : vector<8xf16>
%c = arith.addf %a, %b {layout_result_0 = #l} : vector<32xf16>
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8xf16>, !xegpu.tensor_desc<8xf16>
- xegpu.store_nd %c, %arg2: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l>
+ xegpu.store_nd %c, %arg2 {layout = #l}: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l>
//CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8xf16>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c32] : !xegpu.tensor_desc<32xf16, #l>
@@ -257,12 +257,12 @@ gpu.module @test_kernel {
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c64 : index
%0 = xegpu.create_nd_tdesc %a[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l>
- %1 = xegpu.load_nd %0: !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32>
+ %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, [[ACC:%[0-9A-Za-z]+]] [0] : vector<16x16xf32> to vector<16xf32>
// CHECK-COUNT-3: vector.multi_reduction <add>, {{.*}}, [[ACC]] [0] : vector<16x16xf32> to vector<16xf32>
%2 = vector.multi_reduction <add>, %1, %acc {layout_result_0 = #r} [0]: vector<16x64xf32> to vector<64xf32>
%3 = xegpu.create_nd_tdesc %b[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
- xegpu.store_nd %2, %3: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r>
+ xegpu.store_nd %2, %3 {layout = #r}: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r>
gpu.return
}
}
@@ -282,14 +282,14 @@ gpu.module @test_kernel {
%m = arith.muli %block_id_x, %c32 : index
%n = arith.muli %block_id_y, %c32 : index
%0 = xegpu.create_nd_tdesc %a[%m, %n] : memref<512x32xf32> -> !xegpu.tensor_desc<32x128xf32, #l>
- %1 = xegpu.load_nd %0: !xegpu.tensor_desc<32x128xf32, #l> -> vector<32x128xf32>
+ %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<32x128xf32, #l> -> vector<32x128xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, [[INIT:%[0-9A-Za-z]+]] [1] : vector<16x16xf32> to vector<16xf32>
// CHECK-COUNT-1: vector.multi_reduction <add>, {{.*}}, [[INIT]] [1] : vector<16x16xf32> to vector<16xf32>
%2 = vector.multi_reduction <add>, %1, %acc {layout_result_0 = #r} [1]: vector<32x128xf32> to vector<32xf32>
%3 = xegpu.create_nd_tdesc %b[%n] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
- xegpu.store_nd %2, %3: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r>
+ xegpu.store_nd %2, %3 {layout = #r}: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r>
gpu.return
}
}
@@ -304,11 +304,11 @@ gpu.module @test_kernel {
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c64 : index
%0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
- %1 = xegpu.load_nd %0: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32>
+ %1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32>
// CHECK-COUNT-4: vector.broadcast {{.*}} : vector<16xf32> to vector<16x16xf32>
%2 = vector.broadcast %1 {layout_result_0 = #l} : vector<64xf32> to vector<16x64xf32>
%3 = xegpu.create_nd_tdesc %b[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l>
- xegpu.store_nd %2, %3: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l>
+ xegpu.store_nd %2, %3 {layout = #l}: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l>
gpu.return
}
}
@@ -323,7 +323,7 @@ gpu.module @test_kernel {
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c32 : index
%0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
- %1 = xegpu.load_nd %0: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32>
+ %1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32>
%11 = vector.shape_cast %1 : vector<32xf32> to vector<32x1xf32>
// CHECK-COUNT-8: vector.broadcast {{.*}}: vector<16x1xf32> to vector<16x16xf32>
%2 = vector.broadcast %11 {layout_result_0 = #l} : vector<32x1xf32> to vector<32x64xf32>
@@ -343,11 +343,11 @@ gpu.module @test_kernel {
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c32 : index
%0 = xegpu.create_nd_tdesc %a[%m, 0] : memref<512x8xf32> -> !xegpu.tensor_desc<32x8xf32, #l>
- %1 = xegpu.load_nd %0: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32>
+ %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32>
// CHECK-COUNT-2: vector.transpose {{.*}} [1, 0] : vector<16x8xf32> to vector<8x16xf32>
%2 = vector.transpose %1, [1, 0] {layout_result_0 = #t} : vector<32x8xf32> to vector<8x32xf32>
%3 = xegpu.create_nd_tdesc %b[0, %m] : memref<8x512xf32> -> !xegpu.tensor_desc<8x32xf32, #t>
- xegpu.store_nd %2, %3: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t>
+ xegpu.store_nd %2, %3 {layout = #t}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t>
gpu.return
}
}
@@ -513,8 +513,8 @@ gpu.module @test_kernel {
//CHECK: [[c0:%.+]] = arith.constant 0 : index
//CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
//CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
- //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
- //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] <{layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] <{layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
//CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>}> : vector<16x16xf16>
//CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
//CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
@@ -529,12 +529,12 @@ gpu.module @test_kernel {
%c0 = arith.constant 0 : index
%a_tdesc = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
%b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
- %a = xegpu.load_nd %a_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
- %b = xegpu.load_nd %b_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
+ %a = xegpu.load_nd %a_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
+ %b = xegpu.load_nd %b_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
%e = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16>
%c = xegpu.dpas %e, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
%c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c>
- xegpu.store_nd %c, %c_tdesc: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c>
+ xegpu.store_nd %c, %c_tdesc {layout = #c}: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c>
gpu.return
}
}
@@ -746,11 +746,11 @@ gpu.module @test_kernel {
%b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l>
%c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l>
- %a = xegpu.load_nd %a_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
- %b = xegpu.load_nd %b_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
+ %a = xegpu.load_nd %a_tdesc[%c0, %c0] {layout = #l}: !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
+ %b = xegpu.load_nd %b_tdesc[%c0, %c0] {layout = #l}: !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
%result = arith.addf %a, %b {layout_result_0 = #l} : vector<1x32xf32>
- xegpu.store_nd %result, %c_tdesc[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #l>
+ xegpu.store_nd %result, %c_tdesc[%c0, %c0] {layout = #l}: vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #l>
gpu.return
}
}
>From 227c15d19e7f311d3c7ff58b8f63624c0954671b Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 27 Nov 2025 00:42:55 +0000
Subject: [PATCH 11/23] simplify: Avoid calling getTileShape(Result&Operand)
from getTileShape(op)
---
.../XeGPU/Transforms/XeGPUBlocking.cpp | 27 ++++++++++---------
1 file changed, 14 insertions(+), 13 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index ce53e8fe970b1..6edbca6734905 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -166,8 +166,8 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
return instData;
}
- if (auto type = dyn_cast<ShapedType>(value.getType()))
- return llvm::to_vector(type.getShape());
+ // if (auto type = dyn_cast<ShapedType>(value.getType()))
+ // return llvm::to_vector(type.getShape());
}
LDBG() << "failed to getTileShape for: " << value;
return std::nullopt;
@@ -190,12 +190,10 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
return std::nullopt;
};
- if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
- xegpu::UpdateOffsetOp>(op))
- return getTileShape(op->getOpResult(0));
-
xegpu::DistributeLayoutAttr layout = nullptr;
+ if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp>(op))
+ layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0));
if (isa<xegpu::LoadMatrixOp>(op))
layout = dyn_cast<xegpu::LoadMatrixOp>(op).getLayoutAttr();
if (isa<xegpu::StoreMatrixOp>(op))
@@ -212,7 +210,8 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
"Matrix load/store should have subgroup level layout");
return layout.getEffectiveInstDataAsInt();
}
-
+ if (isa<xegpu::CreateDescOp, xegpu::UpdateOffsetOp>(op))
+ layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0));
if (isa<xegpu::LoadGatherOp>(op))
layout = dyn_cast<xegpu::LoadGatherOp>(op).getLayoutAttr();
if (isa<xegpu::StoreScatterOp>(op))
@@ -254,14 +253,16 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
}
if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1)
- return getTileShape(op->getOpResult(0));
-
+ layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0));
if (isa<vector::MultiDimReductionOp>(op))
- return getTileShape(op->getOpOperand(0));
-
+ layout = xegpu::getDistributeLayoutAttr(op->getOpOperand(0));
if (isa<vector::TransposeOp, vector::BroadcastOp>(op))
- return getTileShape(op->getOpResult(0));
-
+ layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0));
+ if (layout != nullptr) {
+ assert(layout.isForSubgroup() &&
+ "Other ops (Vector/Math/Arith) should have subgroup level layout");
+ return getShapeSkipLeadingUnitDim(layout);
+ }
return std::nullopt;
}
>From 0e44d076aa47727144c86b0ece160505893bb970 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 29 Nov 2025 23:43:20 +0000
Subject: [PATCH 12/23] simplify getDistributeLayoutAttr
---
.../XeGPU/Transforms/XeGPUBlocking.cpp | 6 +-
.../Transforms/XeGPUSubgroupDistribute.cpp | 11 +++-
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 61 ++++++++++++++-----
.../Dialect/XeGPU/subgroup-distribute.mlir | 4 +-
.../Dialect/XeGPU/xegpu-unroll-patterns.mlir | 4 +-
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 8 +--
6 files changed, 66 insertions(+), 28 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 6edbca6734905..2f1524a3659c2 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -268,19 +268,19 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
// skip the op if any of its operands or results has workgroup level layouts
- bool hasWgLayoutOperands =
+ /* bool hasWgLayoutOperands =
llvm::any_of(op->getOpOperands(), [](OpOperand &opr) {
xegpu::DistributeLayoutAttr layout =
xegpu::getDistributeLayoutAttr(opr);
return layout && layout.isForWorkgroup();
- });
+ }); */
bool hasWgLayoutResults =
llvm::any_of(op->getOpResults(), [](OpResult result) {
xegpu::DistributeLayoutAttr layout =
xegpu::getDistributeLayoutAttr(result);
return layout && layout.isForWorkgroup();
});
- if (hasWgLayoutOperands || hasWgLayoutResults) {
+ if (hasWgLayoutResults) {
LDBG() << "skip unrolling for op with workgroup level layout: " << *op;
return false;
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index b64eb5b29ccb0..991c29c8de4ed 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1647,8 +1647,11 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
// Layouts are needed for vector type only.
if (!isa<VectorType>(operand.get().getType()))
continue;
- if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(op))
- continue;
+ // if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(op))
+ // xegpu::DpasOp,
+ // xegpu::LoadGatherOp, xegpu::StoreScatterOp,
+ // xegpu::LoadNdOp, xegpu::StoreNdOp>(op))
+ // continue;
auto layout = xegpu::getDistributeLayoutAttr(operand.get());
if (!layout) {
@@ -1660,6 +1663,10 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
xegpu::setDistributeLayoutAttr(operand, layout);
}
});
+
+ // dump out the op here
+ // getOperation()->dump();
+
// Step 2: Move all operations of a GPU function inside
// gpu.warp_execute_on_lane_0 operation.
{
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 91432b1c11304..dff1474e586d8 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -126,8 +126,8 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
assert(defOp && "result must have a defining op");
// For ConvertLayoutOp, the layout is stored in the targetLayoutAttr
- if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
- return convertOp.getTargetLayoutAttr();
+ // if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
+ // return convertOp.getTargetLayoutAttr();
// for LoadNdOp, the layout is stored in the tensor descriptor
if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
@@ -137,17 +137,17 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
return loadOp.getLayoutAttr();
- // for StoreMatrixOp, the layout is attached to the property of the op
- if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
- return storeOp.getLayoutAttr();
- std::string layoutName = getLayoutName(result);
- if (defOp->hasAttr(layoutName))
- return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+ // // for StoreMatrixOp, the layout is attached to the property of the op
+ // if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
+ // return storeOp.getLayoutAttr();
// check for "permament" layout only after "temporary" layout name lookup
// for backward compatibility
if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp))
return loadGatherOp.getLayoutAttr();
+ std::string layoutName = getLayoutName(result);
+ if (defOp->hasAttr(layoutName))
+ return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
}
if (auto arg = dyn_cast<BlockArgument>(value)) {
@@ -162,26 +162,49 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
return nullptr;
}
+// xegpu::DistributeLayoutAttr
+// xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
+// Operation *op = opr.getOwner();
+
+// if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
+// return loadOp.getLayoutAttr();
+
+// if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
+// return storeOp.getLayoutAttr();
+
+// std::string layoutName = xegpu::getLayoutName(opr);
+// if (op->hasAttr(layoutName))
+// return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+
+// // check for "permanent" layout only after "temporary" layout name lookup
+// if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
+// if (auto layout = storeScatterOp.getLayoutAttr())
+// return layout;
+
+// return getDistributeLayoutAttr(opr.get());
+// }
+
xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
- if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
- return loadOp.getLayoutAttr();
+ // if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
+ // return loadOp.getLayoutAttr();
if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
return storeOp.getLayoutAttr();
- std::string layoutName = xegpu::getLayoutName(opr);
- if (op->hasAttr(layoutName))
- return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
-
// check for "permament" layout only after "temporary" layout name lookup
if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
if (auto layout = storeScatterOp.getLayoutAttr())
return layout;
- return getDistributeLayoutAttr(opr.get());
+ std::string layoutName = xegpu::getLayoutName(opr);
+ if (op->hasAttr(layoutName))
+ return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+
+ return nullptr;
+ // return getDistributeLayoutAttr(opr.get());
}
// Returns the permanent layout attribute for the given result if it's
@@ -286,6 +309,14 @@ void xegpu::removeLayoutAttrs(Operation *op) {
removeLayoutAttr(opr);
for (OpResult result : nestOp->getOpResults())
removeLayoutAttr(result);
+ if (op->hasAttrOfType<DistributeLayoutAttr>("layout"))
+ op->removeAttr("layout");
+ if (op->hasAttrOfType<DistributeLayoutAttr>("layout_a"))
+ op->removeAttr("layout_a");
+ if (op->hasAttrOfType<DistributeLayoutAttr>("layout_b"))
+ op->removeAttr("layout_b");
+ if (op->hasAttrOfType<DistributeLayoutAttr>("layout_cd"))
+ op->removeAttr("layout_cd");
});
}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 8fd3cca5594cb..076c1d6ca51a6 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -137,7 +137,7 @@ gpu.module @xevm_module{
%offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
%loaded = scf.if %pred -> (vector<16x8xf16>) {
%3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+ layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
scf.yield %3 : vector<16x8xf16>
} else {
@@ -169,7 +169,7 @@ gpu.module @xevm_module{
%offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
scf.if %pred {
%3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+ layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
index dbc52b8a98894..c3be138fef38a 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
@@ -226,7 +226,7 @@ gpu.module @test {
%c17 = arith.constant 17: index
%mask = vector.create_mask %c17: vector<32xi1>
- %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
+ %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
gpu.return %ld : vector<32xf32>
}
@@ -381,7 +381,7 @@ gpu.module @test {
%c17 = arith.constant 17: index
%mask = vector.create_mask %c17: vector<32xi1>
- %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
+ %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
gpu.return %ld : vector<32x4xf32>
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 574b365443a0a..582dbe783e5e3 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -275,7 +275,7 @@ gpu.module @test_distribution {
// CHECK-SAME: : memref<?xf16>, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16>
%offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>} dense<0> : vector<256x16xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>} dense<1> : vector<256x16xi1>
- %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>, l1_hint = #xegpu.cache_hint<cached>}
+ %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>, l1_hint = #xegpu.cache_hint<cached>}
: memref<?xf16>, vector<256x16xindex>, vector<256x16xi1> -> vector<256x16xf16>
gpu.return
}
@@ -310,7 +310,7 @@ gpu.module @test_distribution {
// CHECK-SAME: : memref<?xf16>, vector<8xindex>, vector<8xi1> -> vector<8x4xf16>
%offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<0> : vector<256xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<1> : vector<256xi1>
- %load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 4]>, l1_hint = #xegpu.cache_hint<cached>}
+ %load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 4]>, l1_hint = #xegpu.cache_hint<cached>}
: memref<?xf16>, vector<256xindex>, vector<256xi1> -> vector<256x4xf16>
gpu.return
}
@@ -398,7 +398,7 @@ gpu.module @test_distribution {
%cst_acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} dense<0.0> : vector<4x2x6xf16>
%offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<0> : vector<4x2x6x32xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<true> : vector<4x2x6x32xi1>
- %load = xegpu.load %src[%offset], %mask {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16>
+ %load = xegpu.load %src[%offset], %mask {layout = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16>
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [3] : vector<1x1x1x32xf16> to vector<1x1x1xf16>
%reduce = vector.multi_reduction <add>, %load, %cst_acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} [3]
: vector<4x2x6x32xf16> to vector<4x2x6xf16>
@@ -592,7 +592,7 @@ gpu.module @test_distribution {
// CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>}>
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>} :
// CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32>
- %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32>
+ %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32>
// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : vector<32xf32> to vector<32x32xf32>
%4 = vector.broadcast %3 {layout_result_0 =
>From c3bc0850d6002208a5ce8d1cc410ca8fc85cd366 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sun, 30 Nov 2025 05:25:52 +0000
Subject: [PATCH 13/23] add interface getAnchorLayout
---
.../mlir/Dialect/XeGPU/IR/CMakeLists.txt | 5 ++
mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 1 +
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 20 +++++
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 73 ++++++++++++++++---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 2 +
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 45 +++++++-----
6 files changed, 115 insertions(+), 31 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
index efca3cfa0dab7..b10219f71b531 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
@@ -15,3 +15,8 @@ set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td)
mlir_tablegen(XeGPUAttrInterface.h.inc -gen-attr-interface-decls)
mlir_tablegen(XeGPUAttrInterface.cpp.inc -gen-attr-interface-defs)
add_mlir_dialect_tablegen_target(MLIRXeGPUAttrInterfaceIncGen)
+
+set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td)
+mlir_tablegen(XeGPUOpInterface.h.inc -gen-op-interface-decls)
+mlir_tablegen(XeGPUOpInterface.cpp.inc -gen-op-interface-defs)
+add_mlir_dialect_tablegen_target(MLIRXeGPUOpInterfaceIncGen)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 0c059967bb898..7badfaf4a8216 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -34,6 +34,7 @@ class SliceAttr;
#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc>
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc>
#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
+#include <mlir/Dialect/XeGPU/IR/XeGPUOpInterface.h.inc>
// clang-format on
#define GET_ATTRDEF_CLASSES
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 93c5187b00756..d5b2db75bcad9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -740,4 +740,24 @@ def XeGPU_MemLayoutAttr : XeGPUAttr<"MemLayout", "mem_layout"> {
}
+def AnchorLayoutInterface : OpInterface<"AnchorLayoutInterface"> {
+ let cppNamespace = "::mlir::xegpu";
+
+ let description = [{
+ An attribute interface for accessing anchor layout information.
+ This interface provides a method to retrieve the anchor layout
+ from attributes that implement it.
+ }];
+
+ let methods = [
+ InterfaceMethod<
+ /*desc=*/"Get the anchor layout attribute.",
+ /*retTy=*/"xegpu::DistributeLayoutAttr",
+ /*methodName=*/"getAnchorLayout",
+ /*args=*/(ins)
+ >,
+ ];
+
+}
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index d93ffb70881bd..7a9202a45ae40 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -17,6 +17,7 @@ include "mlir/Interfaces/ShapedOpInterfaces.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/ViewLikeInterface.td"
+
// Base class for dialect operations. This operation inherits from the base
// `Op` class in OpBase.td, and provides:
// * The parent dialect of the operation.
@@ -247,7 +248,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
}];
}
-def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
+def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", [AnchorLayoutInterface]> {
let summary = "prefetches a n-D block to cache";
let description = [{
It issues an instruction to prefetch a block of data from continuous
@@ -296,6 +297,10 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
return getTensorDesc().getType();
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
SmallVector<OpFoldResult> getMixedOffsets() {
auto statics = getConstOffsets().value_or(SmallVector<int64_t>());
auto dynamics = getOffsets();
@@ -337,7 +342,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
- AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]>
+ AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]>, AnchorLayoutInterface
]> {
let summary = "loads a n-D block from memory (represented by TensorDesc)"
"to registers (represented by vector)";
@@ -418,6 +423,10 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
return getTensorDesc().getType();
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
SmallVector<OpFoldResult> getMixedOffsets() {
auto statics = getConstOffsets().value_or(SmallVector<int64_t>());
auto dynamics = getOffsets();
@@ -460,7 +469,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
}
def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
- AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]>
+ AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]>, AnchorLayoutInterface
]> {
let summary = "stores a n-D block register region back to memory, currently only supports 2D";
@@ -530,6 +539,10 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
return getTensorDesc().getType();
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
SmallVector<OpFoldResult> getMixedOffsets() {
auto statics = getConstOffsets().value_or(SmallVector<int64_t>());
auto dynamics = getOffsets();
@@ -721,7 +734,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
let hasVerifier = 1;
}
-def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> {
let summary = "prefetches a set of scattered data points to cache";
let description = [{
@@ -810,6 +823,10 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
return getSource().getType();
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
TypedValue<xegpu::TensorDescType> getTensorDesc() {
if (auto tdescType = getTensorDescType()) {
return llvm::cast<TypedValue<xegpu::TensorDescType>>(getSource());
@@ -840,7 +857,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
let hasVerifier = 1;
}
-def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
+def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayoutInterface]> {
let summary = "load a set of scattered data points from memory.";
let description = [{ It (aka. load) load data per each lane. The output
@@ -946,6 +963,10 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
return getSource().getType();
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
TypedValue<xegpu::TensorDescType> getTensorDesc() {
if (auto tdescType = getTensorDescType()) {
return llvm::cast<TypedValue<xegpu::TensorDescType>>(getSource());
@@ -1002,7 +1023,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
let hasVerifier = 1;
}
-def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorLayoutInterface]> {
let summary = "store data to scattered memory locations.";
let description =
[{ It (aka. store) stores data to scattered memory locations. The value is
@@ -1104,6 +1125,10 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
return getDest().getType();
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
TypedValue<xegpu::TensorDescType> getTensorDesc() {
if (auto tdescType = getTensorDescType()) {
return llvm::cast<TypedValue<xegpu::TensorDescType>>(getDest());
@@ -1214,7 +1239,7 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
let hasVerifier = 1;
}
-def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]> {
+def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>, AnchorLayoutInterface]> {
let summary = "It performs mma computation";
let description = [{DPAS performs matrix multiplication on matrix A of `mxk`
@@ -1275,6 +1300,11 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
let results = (outs XeGPU_DpasResType: $result);
let extraClassDeclaration = [{
+
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayoutCd().value_or(nullptr);
+ }
+
VectorType getLhsType() {
return getLhs().getType();
}
@@ -1308,7 +1338,8 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
MemoryEffects<[MemRead, MemWrite]>,
AllElementTypesMatch<["tensorDesc", "value", "result"]>,
- AllShapesMatch<["tensorDesc", "value", "result"]>]> {
+ AllShapesMatch<["tensorDesc", "value", "result"]>,
+ AnchorLayoutInterface]> {
let summary = "Atomic read-modify-write operation on the TensorDesc. ";
let description = [{
@@ -1348,6 +1379,12 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
XeGPU_ValueType:$value,
OptionalAttr<DistributeLayoutAttr>:$layout);
+ let extraClassDeclaration = [{
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+ }];
+
let results = (outs XeGPU_ValueType:$result);
let assemblyFormat = [{
@@ -1421,7 +1458,7 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
let extraClassDeclaration = extraBaseClassDeclaration;
}
-def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
+def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>, AnchorLayoutInterface]> {
let summary = "Convert the layout of the input operand";
let description = [{
`convert_layout` redistribute data across subgroups and/or lanes from the `input_layout` to
@@ -1455,6 +1492,11 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
let assemblyFormat = [{
$source prop-dict attr-dict `:` type($source)
}];
+ let extraClassDeclaration = [{
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getTargetLayout();
+ }
+ }];
let hasFolder = 1;
let hasVerifier = 1;
@@ -1496,7 +1538,7 @@ def XeGPU_CreateMemDescOp: XeGPU_Op<"create_mem_desc", [Pure,
}
def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
- AllElementTypesMatch<["mem_desc", "res"]>]> {
+ AllElementTypesMatch<["mem_desc", "res"]>, AnchorLayoutInterface]> {
let arguments = (ins XeGPU_MemDesc:$mem_desc,
Variadic<Index>: $offsets,
DenseI64ArrayAttr: $const_offsets,
@@ -1555,13 +1597,18 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
return vecTy.getShape();
return {};
}
+
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
}];
let hasVerifier = 1;
}
def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
- AllElementTypesMatch<["mem_desc", "data"]>]> {
+ AllElementTypesMatch<["mem_desc", "data"]>, AnchorLayoutInterface]> {
let arguments = (ins
AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$data,
XeGPU_MemDesc:$mem_desc,
@@ -1614,6 +1661,10 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
return {};
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
}];
let hasVerifier = 1;
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index fb5d1e758dbd1..bd3bcf4698c5a 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -36,6 +36,8 @@ void XeGPUDialect::initialize() {
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc>
>();
}
+#define GET_OP_INTERFACE_CLASSES
+#include "mlir/Dialect/XeGPU/IR/XeGPUOpInterface.cpp.inc"
// A `srcShape` consists of N distribution units, each being `subShapesLayout` x
// `subShape`. A `delinearizedId` is used to identify a particular `subShape`
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index dff1474e586d8..3e77c5b2ca43f 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -125,26 +125,31 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");
- // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr
- // if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
- // return convertOp.getTargetLayoutAttr();
-
- // for LoadNdOp, the layout is stored in the tensor descriptor
- if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
- return getDistributeLayoutAttr(loadNd.getTensorDesc());
-
- // for LoadMatrixOp, the layout is attached to the property of the op
- if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
- return loadOp.getLayoutAttr();
-
- // // for StoreMatrixOp, the layout is attached to the property of the op
- // if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
- // return storeOp.getLayoutAttr();
-
- // check for "permament" layout only after "temporary" layout name lookup
- // for backward compatibility
- if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp))
- return loadGatherOp.getLayoutAttr();
+ // // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr
+ // // if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
+ // // return convertOp.getTargetLayoutAttr();
+
+ // // for LoadNdOp, the layout is stored in the tensor descriptor
+ // if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
+ // return getDistributeLayoutAttr(loadNd.getTensorDesc());
+
+ // // for LoadMatrixOp, the layout is attached to the property of the op
+ // if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
+ // return loadOp.getLayoutAttr();
+
+ // // // for StoreMatrixOp, the layout is attached to the property of the op
+ // // if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
+ // // return storeOp.getLayoutAttr();
+
+ // // check for "permament" layout only after "temporary" layout name lookup
+ // // for backward compatibility
+ // if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp))
+ // return loadGatherOp.getLayoutAttr();
+
+ if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
+ return anchorOp.getAnchorLayout();
+ }
+
std::string layoutName = getLayoutName(result);
if (defOp->hasAttr(layoutName))
return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
>From 5d3e20e25ff8bbbcceafd4585cc890b68119b81f Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 10 Dec 2025 19:14:06 +0000
Subject: [PATCH 14/23] remove respectPermLayout parameter from setDistributeLayoutAttr
function
---
mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 6 +-----
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 2 +-
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 12 ++++++------
3 files changed, 8 insertions(+), 12 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 58092c3bb9ed2..f23174cb0f697 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -104,15 +104,11 @@ void removeLayoutAttrs(Operation *op);
/// Sets the DistributeLayoutAttr for a given OpOperand or OpResult by attaching
/// it to the owner's dictionary attributes
-/// If `respectPermLayout` is true the existing permament layout
-/// attribute will be kept and assigned to the attribute dict instead
-/// of the provided layout.
template <typename T,
typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
std::is_same_v<T, OpResult>>>
void setDistributeLayoutAttr(const T &operandOrResult,
- const DistributeLayoutAttr layout,
- bool respectPermLayout = false);
+ const DistributeLayoutAttr layout);
/// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given
/// operation. If the operation contains regions, it is also applied recursively
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index dc9eb96c169b4..89b6695bd921e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1173,7 +1173,7 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
}
// If the result is a vector type, add a temporary layout attribute to the
// op.
- xegpu::setDistributeLayoutAttr(result, layout, /*respectPermLayout*/ true);
+ xegpu::setDistributeLayoutAttr(result, layout);
}
return success();
}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 94d95958c758b..e56d0a9cf0613 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -248,8 +248,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
template <typename T, typename>
void xegpu::setDistributeLayoutAttr(const T &operandOrResult,
- const DistributeLayoutAttr layout,
- bool respectPermLayout) {
+ const DistributeLayoutAttr layout) {
Operation *owner = operandOrResult.getOwner();
std::string name = xegpu::getLayoutName(operandOrResult);
@@ -257,8 +256,9 @@ void xegpu::setDistributeLayoutAttr(const T &operandOrResult,
return;
DistributeLayoutAttr candidate = layout;
- if (respectPermLayout)
- candidate = maybePickPermanentLayout(layout, operandOrResult, owner, name);
+ // if (respectPermLayout)
+ // candidate = maybePickPermanentLayout(layout, operandOrResult, owner,
+ // name);
if (candidate)
owner->setAttr(name, candidate);
@@ -267,12 +267,12 @@ void xegpu::setDistributeLayoutAttr(const T &operandOrResult,
// Explicit instantiation for OpResult
template void xegpu::setDistributeLayoutAttr<mlir::OpResult>(
const mlir::OpResult &result,
- const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout);
+ const mlir::xegpu::DistributeLayoutAttr layout);
// Explicit instantiation for OpOperand
template void xegpu::setDistributeLayoutAttr<mlir::OpOperand>(
const mlir::OpOperand &operand,
- const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout);
+ const mlir::xegpu::DistributeLayoutAttr layout);
void xegpu::setDistributeLayoutAttrs(
Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl) {
>From 1025b571c57d81a4be6d3aeb8844c8d1eca1dd72 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 10 Dec 2025 22:48:11 +0000
Subject: [PATCH 15/23] add setAnchorLayout interface
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 8 +-
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 46 +++++++
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 4 +-
.../XeGPU/Transforms/XeGPUBlocking.cpp | 125 +++++++-----------
.../Transforms/XeGPUSubgroupDistribute.cpp | 16 +--
.../Transforms/XeGPUWgToSgDistribute.cpp | 2 +-
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 27 ++--
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 4 +-
8 files changed, 121 insertions(+), 111 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index c2cca3e19fd6a..a5064d6145341 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -775,7 +775,7 @@ def AnchorLayoutInterface : OpInterface<"AnchorLayoutInterface"> {
let description = [{
An attribute interface for accessing anchor layout information.
- This interface provides a method to retrieve the anchor layout
+ This interface provides methods to set and retrieve the anchor layout
from attributes that implement it.
}];
@@ -786,6 +786,12 @@ def AnchorLayoutInterface : OpInterface<"AnchorLayoutInterface"> {
/*methodName=*/"getAnchorLayout",
/*args=*/(ins)
>,
+ InterfaceMethod<
+ /*desc=*/"Set the anchor layout attribute.",
+ /*retTy=*/"void",
+ /*methodName=*/"setAnchorLayout",
+ /*args=*/(ins "xegpu::DistributeLayoutAttr":$layout)
+ >,
];
}
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 4583c6400058f..d0206ebea9359 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -301,6 +301,10 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", [AnchorLayoutInterface]> {
return getLayout().value_or(nullptr);
}
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ getProperties().layout = anchorLayout;
+ }
+
SmallVector<OpFoldResult> getMixedOffsets() {
auto statics = getConstOffsets().value_or(SmallVector<int64_t>());
auto dynamics = getOffsets();
@@ -428,6 +432,10 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
return getLayout().value_or(nullptr);
}
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ getProperties().layout = anchorLayout;
+ }
+
SmallVector<OpFoldResult> getMixedOffsets() {
auto statics = getConstOffsets().value_or(SmallVector<int64_t>());
auto dynamics = getOffsets();
@@ -545,6 +553,10 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
return getLayout().value_or(nullptr);
}
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ getProperties().layout = anchorLayout;
+ }
+
SmallVector<OpFoldResult> getMixedOffsets() {
auto statics = getConstOffsets().value_or(SmallVector<int64_t>());
auto dynamics = getOffsets();
@@ -830,6 +842,10 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> {
return getLayout().value_or(nullptr);
}
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ getProperties().layout = anchorLayout;
+ }
+
TypedValue<xegpu::TensorDescType> getTensorDesc() {
if (auto tdescType = getTensorDescType()) {
return llvm::cast<TypedValue<xegpu::TensorDescType>>(getSource());
@@ -970,6 +986,10 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
return getLayout().value_or(nullptr);
}
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ getProperties().layout = anchorLayout;
+ }
+
TypedValue<xegpu::TensorDescType> getTensorDesc() {
if (auto tdescType = getTensorDescType()) {
return llvm::cast<TypedValue<xegpu::TensorDescType>>(getSource());
@@ -1132,6 +1152,10 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
return getLayout().value_or(nullptr);
}
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ getProperties().layout = anchorLayout;
+ }
+
TypedValue<xegpu::TensorDescType> getTensorDesc() {
if (auto tdescType = getTensorDescType()) {
return llvm::cast<TypedValue<xegpu::TensorDescType>>(getDest());
@@ -1308,6 +1332,10 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>,
return getLayoutCd().value_or(nullptr);
}
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ getProperties().layout_cd = anchorLayout;
+ }
+
VectorType getLhsType() {
return getLhs().getType();
}
@@ -1386,6 +1414,11 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
xegpu::DistributeLayoutAttr getAnchorLayout() {
return getLayout().value_or(nullptr);
}
+
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ getProperties().layout = anchorLayout;
+ }
+
}];
let results = (outs XeGPU_ValueType:$result);
@@ -1499,6 +1532,11 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
xegpu::DistributeLayoutAttr getAnchorLayout() {
return getTargetLayout();
}
+
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ getProperties().target_layout = anchorLayout;
+ }
+
}];
let hasFolder = 1;
@@ -1605,6 +1643,10 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
return getLayout().value_or(nullptr);
}
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ getProperties().layout = anchorLayout;
+ }
+
}];
let hasVerifier = 1;
@@ -1668,6 +1710,10 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
return getLayout().value_or(nullptr);
}
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ getProperties().layout = anchorLayout;
+ }
+
}];
let hasVerifier = 1;
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index f23174cb0f697..5b05a2192f5a6 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -64,10 +64,10 @@ FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
LayoutAttr layout);
/// Return the attribute name for the OpOperand to attach DistributeLayoutAttr
-std::string getLayoutName(const OpOperand &operand);
+std::string getTempLayoutName(const OpOperand &operand);
/// Return the attribute name for the OpResult to attach DistributeLayoutAttr
-std::string getLayoutName(const OpResult result);
+std::string getTempLayoutName(const OpResult result);
/// Retrieves the DistributeLayoutAttr associated with a given Value. For
/// TensorDescType values, the DistributeLayoutAttr is extracted from the
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 2f1524a3659c2..b584479db7a7c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -166,8 +166,8 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
return instData;
}
- // if (auto type = dyn_cast<ShapedType>(value.getType()))
- // return llvm::to_vector(type.getShape());
+ if (auto type = dyn_cast<ShapedType>(value.getType()))
+ return llvm::to_vector(type.getShape());
}
LDBG() << "failed to getTileShape for: " << value;
return std::nullopt;
@@ -175,68 +175,33 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
-
- auto getShapeSkipLeadingUnitDim = [](xegpu::DistributeLayoutAttr layout)
- -> std::optional<SmallVector<int64_t>> {
- SmallVector<int64_t> instData = layout.getEffectiveInstDataAsInt();
- if (!instData.empty()) {
- // Remove leading unit dimensions from inst_data
- // For example, if the inst_data is [1, 1, 32]
- // it will pass [32] as the unroll/blocking size.
- auto it = llvm::find_if(instData, [](auto val) { return val != 1; });
- instData.erase(instData.begin(), it);
- return instData;
- }
- return std::nullopt;
- };
-
- xegpu::DistributeLayoutAttr layout = nullptr;
-
- if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp>(op))
- layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0));
- if (isa<xegpu::LoadMatrixOp>(op))
- layout = dyn_cast<xegpu::LoadMatrixOp>(op).getLayoutAttr();
- if (isa<xegpu::StoreMatrixOp>(op))
- layout = dyn_cast<xegpu::StoreMatrixOp>(op).getLayoutAttr();
- if (isa<xegpu::LoadNdOp>(op))
- layout = dyn_cast<xegpu::LoadNdOp>(op).getLayoutAttr();
+ if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
+ xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
+ return getTileShape(op->getOpResult(0));
+ if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
+ xegpu::StoreMatrixOp>(op))
+ return getTileShape(op->getOpOperand(0));
if (isa<xegpu::StoreNdOp>(op))
- layout = dyn_cast<xegpu::StoreNdOp>(op).getLayoutAttr();
- if (isa<xegpu::PrefetchNdOp>(op))
- layout = dyn_cast<xegpu::PrefetchNdOp>(op).getLayoutAttr();
-
- if (layout != nullptr) {
- assert(layout.isForSubgroup() &&
- "Matrix load/store should have subgroup level layout");
- return layout.getEffectiveInstDataAsInt();
- }
- if (isa<xegpu::CreateDescOp, xegpu::UpdateOffsetOp>(op))
- layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0));
- if (isa<xegpu::LoadGatherOp>(op))
- layout = dyn_cast<xegpu::LoadGatherOp>(op).getLayoutAttr();
- if (isa<xegpu::StoreScatterOp>(op))
- layout = dyn_cast<xegpu::StoreScatterOp>(op).getLayoutAttr();
- if (isa<xegpu::PrefetchOp>(op))
- layout = dyn_cast<xegpu::PrefetchOp>(op).getLayoutAttr();
- if (layout != nullptr) {
- assert(
- layout.isForSubgroup() &&
- "LoadGather/StoreScatter/Prefetch should have subgroup level layout");
- return getShapeSkipLeadingUnitDim(layout);
+ return getTileShape(op->getOpOperand(1));
+
+ // Handle LoadGatherOp and StoreScatterOp (with and without offset)
+ if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(op)) {
+ if (loadGatherOp.getOffsets())
+ return getTileShape(loadGatherOp->getOpResult(0));
+ else
+ return getTileShape(loadGatherOp->getOpOperand(0));
}
- if (isa<xegpu::DpasOp>(op)) {
-
- auto layoutA = dyn_cast<xegpu::DpasOp>(op).getLayoutAAttr();
- auto layoutB = dyn_cast<xegpu::DpasOp>(op).getLayoutBAttr();
- auto layoutCD = dyn_cast<xegpu::DpasOp>(op).getLayoutCdAttr();
+ if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
+ return getTileShape(storeScatterOp.getOffsets()
+ ? storeScatterOp->getOpOperand(0)
+ : storeScatterOp->getOpOperand(1));
+ if (isa<xegpu::DpasOp>(op)) {
std::optional<SmallVector<int64_t>> aTile =
- layoutA.getEffectiveInstDataAsInt();
+ getTileShape(op->getOpOperand(0));
std::optional<SmallVector<int64_t>> bTile =
- layoutB.getEffectiveInstDataAsInt();
- std::optional<SmallVector<int64_t>> cdTile =
- layoutCD.getEffectiveInstDataAsInt();
+ getTileShape(op->getOpOperand(1));
if (!aTile || aTile->size() != 2 || !bTile || bTile->size() != 2)
return std::nullopt;
@@ -245,42 +210,45 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
if ((*aTile)[1] != (*bTile)[0])
return std::nullopt;
- int64_t expectedCDTile[2] = {(*aTile)[0], (*bTile)[1]};
- if (!cdTile || !llvm::equal(*cdTile, expectedCDTile))
- return std::nullopt;
+ // semantic check for C
+ if (op->getNumOperands() == 3) {
+ std::optional<SmallVector<int64_t>> cTile =
+ getTileShape(op->getOpOperand(2));
+ int64_t expectedCTile[2] = {(*aTile)[0], (*bTile)[1]};
+ if (!cTile || !llvm::equal(*cTile, expectedCTile))
+ return std::nullopt;
+ }
return SmallVector<int64_t>({(*aTile)[0], (*aTile)[1], (*bTile)[1]});
}
if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1)
- layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0));
+ return getTileShape(op->getOpResult(0));
+
if (isa<vector::MultiDimReductionOp>(op))
- layout = xegpu::getDistributeLayoutAttr(op->getOpOperand(0));
+ return getTileShape(op->getOpOperand(0));
+
if (isa<vector::TransposeOp, vector::BroadcastOp>(op))
- layout = xegpu::getDistributeLayoutAttr(op->getOpResult(0));
- if (layout != nullptr) {
- assert(layout.isForSubgroup() &&
- "Other ops (Vector/Math/Arith) should have subgroup level layout");
- return getShapeSkipLeadingUnitDim(layout);
- }
+ return getTileShape(op->getOpResult(0));
+
return std::nullopt;
}
bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
// skip the op if any of its operands or results has workgroup level layouts
- /* bool hasWgLayoutOperands =
+ bool hasWgLayoutOperands =
llvm::any_of(op->getOpOperands(), [](OpOperand &opr) {
xegpu::DistributeLayoutAttr layout =
xegpu::getDistributeLayoutAttr(opr);
return layout && layout.isForWorkgroup();
- }); */
+ });
bool hasWgLayoutResults =
llvm::any_of(op->getOpResults(), [](OpResult result) {
xegpu::DistributeLayoutAttr layout =
xegpu::getDistributeLayoutAttr(result);
return layout && layout.isForWorkgroup();
});
- if (hasWgLayoutResults) {
+ if (hasWgLayoutOperands || hasWgLayoutResults) {
LDBG() << "skip unrolling for op with workgroup level layout: " << *op;
return false;
}
@@ -415,12 +383,9 @@ void XeGPUBlockingPass::runOnOperation() {
}
}
- xegpu::LayoutAttr newLayout = nullptr;
- if (tdescTy.getLayoutAttr())
- newLayout = tdescTy.getLayoutAttr().dropInstData();
-
- newTy = xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
- newLayout);
+ newTy =
+ xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
+ tdescTy.getLayoutAttr().dropInstData());
} else {
newTy = VectorType::get(tileShape, elemTy);
}
@@ -447,14 +412,14 @@ void XeGPUBlockingPass::runOnOperation() {
op->walk([](Operation *op) {
// Remove the layout attributes cached per operands.
for (OpOperand &opr : op->getOpOperands()) {
- std::string name = xegpu::getLayoutName(opr);
+ std::string name = xegpu::getTempLayoutName(opr);
if (op->hasAttrOfType<xegpu::LayoutAttr>(name))
op->removeAttr(name);
}
// Update the layout attributes per result.
for (OpResult result : op->getOpResults()) {
- std::string name = xegpu::getLayoutName(result);
+ std::string name = xegpu::getTempLayoutName(result);
if (auto layout = op->getAttrOfType<xegpu::LayoutAttr>(name)) {
op->removeAttr(name);
if (!isa<LoopLikeOpInterface>(op))
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index fbe511af39f43..4be6b44ff3082 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -636,9 +636,9 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
unsigned operandIdx = operand->getOperandNumber();
- std::string layoutAName = xegpu::getLayoutName(dpasOp->getOpOperand(0));
- std::string layoutBName = xegpu::getLayoutName(dpasOp->getOpOperand(1));
- std::string layoutCName = xegpu::getLayoutName(dpasOp->getOpResult(0));
+ std::string layoutAName = xegpu::getTempLayoutName(dpasOp->getOpOperand(0));
+ std::string layoutBName = xegpu::getTempLayoutName(dpasOp->getOpOperand(1));
+ std::string layoutCName = xegpu::getTempLayoutName(dpasOp->getOpResult(0));
xegpu::LayoutAttr layoutA =
dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutAName);
@@ -869,11 +869,11 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
storeScatterOp, "Expected at most 2D result at SG level");
std::string layoutPayloadName =
- xegpu::getLayoutName(storeScatterOp->getOpOperand(0));
+ xegpu::getTempLayoutName(storeScatterOp->getOpOperand(0));
std::string layoutOffsetsName =
- xegpu::getLayoutName(storeScatterOp->getOpOperand(2));
+ xegpu::getTempLayoutName(storeScatterOp->getOpOperand(2));
std::string layoutMaskName =
- xegpu::getLayoutName(storeScatterOp->getOpOperand(3));
+ xegpu::getTempLayoutName(storeScatterOp->getOpOperand(3));
xegpu::LayoutAttr layoutPayload =
storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutPayloadName);
@@ -1152,9 +1152,9 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
"Expected 1D offsets and mask vector");
// Assume offset and mask producers will be distributed as well.
std::string layoutOffsetsName =
- xegpu::getLayoutName(loadGatherOp->getOpOperand(1));
+ xegpu::getTempLayoutName(loadGatherOp->getOpOperand(1));
std::string layoutMaskName =
- xegpu::getLayoutName(loadGatherOp->getOpOperand(2));
+ xegpu::getTempLayoutName(loadGatherOp->getOpOperand(2));
xegpu::LayoutAttr layoutOffsets =
loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index be82cda574f1e..a42b4c394a476 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1575,7 +1575,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
// Layout propagation pass will activated.
getOperation()->walk([](Operation *op) {
for (OpResult result : op->getOpResults()) {
- std::string name = xegpu::getLayoutName(result);
+ std::string name = xegpu::getTempLayoutName(result);
if (auto layout = op->getAttrOfType<xegpu::LayoutAttr>(name)) {
op->removeAttr(name);
if (!isa<scf::IfOp, scf::ForOp, scf::WhileOp, scf::ConditionOp>(op)) {
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index e56d0a9cf0613..19998b619c291 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -101,13 +101,13 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType,
return xegpu::getDistributedVectorType(helperTdescTy);
}
-std::string xegpu::getLayoutName(const OpOperand &operand) {
+std::string xegpu::getTempLayoutName(const OpOperand &operand) {
const StringRef prefix("layout_operand_");
unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
return llvm::formatv("{0}{1}", prefix, idx).str();
}
-std::string xegpu::getLayoutName(const OpResult result) {
+std::string xegpu::getTempLayoutName(const OpResult result) {
const StringRef prefix = "layout_result_";
return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
}
@@ -149,7 +149,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
return anchorOp.getAnchorLayout();
}
- std::string layoutName = getLayoutName(result);
+ std::string layoutName = getTempLayoutName(result);
if (defOp->hasAttr(layoutName))
return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
}
@@ -176,7 +176,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
// if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
// return storeOp.getLayoutAttr();
-// std::string layoutName = xegpu::getLayoutName(opr);
+// std::string layoutName = xegpu::getTempLayoutName(opr);
// if (op->hasAttr(layoutName))
// return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
@@ -192,18 +192,11 @@ xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
- // if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
- // return loadOp.getLayoutAttr();
-
- if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
- return storeOp.getLayoutAttr();
-
- // check for "permament" layout only after "temporary" layout name lookup
- if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
- if (auto layout = storeScatterOp.getLayoutAttr())
- return layout;
+ if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
+ return anchorOp.getAnchorLayout();
+ }
- std::string layoutName = xegpu::getLayoutName(opr);
+ std::string layoutName = xegpu::getTempLayoutName(opr);
if (op->hasAttr(layoutName))
return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
@@ -250,7 +243,7 @@ template <typename T, typename>
void xegpu::setDistributeLayoutAttr(const T &operandOrResult,
const DistributeLayoutAttr layout) {
Operation *owner = operandOrResult.getOwner();
- std::string name = xegpu::getLayoutName(operandOrResult);
+ std::string name = xegpu::getTempLayoutName(operandOrResult);
if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
return;
@@ -294,7 +287,7 @@ void xegpu::setDistributeLayoutAttrs(
template <typename T, typename>
void xegpu::removeLayoutAttr(const T &operandOrResult) {
Operation *owner = operandOrResult.getOwner();
- std::string name = xegpu::getLayoutName(operandOrResult);
+ std::string name = xegpu::getTempLayoutName(operandOrResult);
if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
owner->removeAttr(name);
}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 93d51441f5b81..de0efdc1ccc34 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -184,7 +184,7 @@ class TestStepOpPattern : public OpConversionPattern<vector::StepOp> {
matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto layoutName = xegpu::getLayoutName(op->getResult(0));
+ auto layoutName = xegpu::getTempLayoutName(op->getResult(0));
auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
if (!sliceAttr || sliceAttr.getRank() != 1)
return failure();
@@ -324,7 +324,7 @@ struct TestXeGPULayoutInterface
target.addDynamicallyLegalOp<vector::StepOp>(
[&](vector::StepOp op) -> bool {
- auto layoutName = xegpu::getLayoutName(op->getResult(0));
+ auto layoutName = xegpu::getTempLayoutName(op->getResult(0));
auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
return isLegal(sliceAttr);
});
>From 3221cc84c9b7fd587492c27fe4f32d25a521f5fc Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 11 Dec 2025 06:28:17 +0000
Subject: [PATCH 16/23] refactor setDistributeLayoutAttr
---
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 12 +--
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 89 +++++--------------
.../Dialect/XeGPU/optimize-transpose.mlir | 30 +++----
3 files changed, 44 insertions(+), 87 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 5b05a2192f5a6..88ff28e7cc3c7 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -102,12 +102,12 @@ void removeLayoutAttr(const T &operandOrResult);
/// applied recursively to the contained operations
void removeLayoutAttrs(Operation *op);
-/// Sets the DistributeLayoutAttr for a given OpOperand or OpResult by attaching
-/// it to the owner's dictionary attributes
-template <typename T,
- typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
- std::is_same_v<T, OpResult>>>
-void setDistributeLayoutAttr(const T &operandOrResult,
+/// Sets the DistributeLayoutAttr for a given OpResult
+void setDistributeLayoutAttr(const OpResult &Result,
+ const DistributeLayoutAttr layout);
+
+/// Sets the DistributeLayoutAttr for a given OpOperand
+void setDistributeLayoutAttr(const OpOperand &opr,
const DistributeLayoutAttr layout);
/// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 19998b619c291..755a686baaa06 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -124,27 +124,6 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");
- // // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr
- // // if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
- // // return convertOp.getTargetLayoutAttr();
-
- // // for LoadNdOp, the layout is stored in the tensor descriptor
- // if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
- // return getDistributeLayoutAttr(loadNd.getTensorDesc());
-
- // // for LoadMatrixOp, the layout is attached to the property of the op
- // if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
- // return loadOp.getLayoutAttr();
-
- // // // for StoreMatrixOp, the layout is attached to the property of the op
- // // if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
- // // return storeOp.getLayoutAttr();
-
- // // check for "permament" layout only after "temporary" layout name lookup
- // // for backward compatibility
- // if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp))
- // return loadGatherOp.getLayoutAttr();
-
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
return anchorOp.getAnchorLayout();
}
@@ -166,28 +145,6 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
return nullptr;
}
-// xegpu::DistributeLayoutAttr
-// xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
-// Operation *op = opr.getOwner();
-
-// if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
-// return loadOp.getLayoutAttr();
-
-// if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
-// return storeOp.getLayoutAttr();
-
-// std::string layoutName = xegpu::getTempLayoutName(opr);
-// if (op->hasAttr(layoutName))
-// return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
-
-// // check for "permament" layout only after "temporary" layout name lookup
-// if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
-// if (auto layout = storeScatterOp.getLayoutAttr())
-// return layout;
-
-// return getDistributeLayoutAttr(opr.get());
-// }
-
xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
@@ -200,8 +157,8 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
if (op->hasAttr(layoutName))
return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
- return nullptr;
- // return getDistributeLayoutAttr(opr.get());
+ // return nullptr;
+ return getDistributeLayoutAttr(opr.get());
}
// Returns the permanent layout attribute for the given result if it's
@@ -239,33 +196,33 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
return candidate;
}
-template <typename T, typename>
-void xegpu::setDistributeLayoutAttr(const T &operandOrResult,
- const DistributeLayoutAttr layout) {
- Operation *owner = operandOrResult.getOwner();
- std::string name = xegpu::getTempLayoutName(operandOrResult);
+void xegpu::setDistributeLayoutAttr(
+ const mlir::OpResult &result,
+ const mlir::xegpu::DistributeLayoutAttr layout) {
+ Operation *owner = result.getOwner();
+
+ if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
+ if (anchorOp.getAnchorLayout() == layout)
+ return;
+ return anchorOp.setAnchorLayout(layout);
+ }
+ std::string name = xegpu::getTempLayoutName(result);
if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
return;
-
- DistributeLayoutAttr candidate = layout;
- // if (respectPermLayout)
- // candidate = maybePickPermanentLayout(layout, operandOrResult, owner,
- // name);
-
- if (candidate)
- owner->setAttr(name, candidate);
+ owner->setAttr(name, layout);
}
-// Explicit instantiation for OpResult
-template void xegpu::setDistributeLayoutAttr<mlir::OpResult>(
- const mlir::OpResult &result,
- const mlir::xegpu::DistributeLayoutAttr layout);
+void xegpu::setDistributeLayoutAttr(const OpOperand &operand,
+ const DistributeLayoutAttr layout) {
+ Operation *owner = operand.getOwner();
-// Explicit instantiation for OpOperand
-template void xegpu::setDistributeLayoutAttr<mlir::OpOperand>(
- const mlir::OpOperand &operand,
- const mlir::xegpu::DistributeLayoutAttr layout);
+ std::string name = xegpu::getTempLayoutName(operand);
+
+ if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
+ return;
+ owner->setAttr(name, layout);
+}
void xegpu::setDistributeLayoutAttrs(
Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl) {
diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir
index 24a0de6ed48a5..c748c1ca5ef88 100644
--- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir
+++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir
@@ -10,7 +10,7 @@
// CHECK: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C32]]], strides : [%[[C32]], 1] : i64
// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
// CHECK-NEXT: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}, %[[C16]]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK-SAME: {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[B]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xi32> to vector<16x16xf16>
@@ -38,7 +38,7 @@ gpu.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8x
// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C16]]], strides : [%[[C16]], 1] : i64
// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T1]][%{{.*}}, %[[C16]]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK-SAME: {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK: %[[T3:.*]] = vector.bitcast %[[T2]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 4]>} : vector<16x8xi32> to vector<16x32xi8>
@@ -73,7 +73,7 @@ gpu.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector<8
// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) {
// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index
// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
+// CHECK-SAME: <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16>
@@ -115,8 +115,8 @@ gpu.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16
// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) {
// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index
-// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] {layout_result_0 = #xegpu.layout<
-// CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] <{layout = #xegpu.layout<
+// CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16>
@@ -159,13 +159,13 @@ gpu.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %ar
// CHECK-SAME: -> !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) {
// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index
-// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32>
// CHECK: %[[T7:.*]] = vector.insert_strided_slice %[[T6]], %[[CST]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, offsets = [0, 0], strides = [1, 1]}
// CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32>
// CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index
-// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32>
// CHECK: %[[T10:.*]] = vector.insert_strided_slice %[[T9]], %[[T7]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, offsets = [0, 8], strides = [1, 1]}
@@ -225,12 +225,12 @@ gpu.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2
// CHECK-SAME: !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) {
// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index
-// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32>
// CHECK: %[[T7:.*]] = vector.bitcast %[[T6]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16>
// CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index
-// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32>
// CHECK: %[[T10:.*]] = vector.bitcast %[[T9]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16>
@@ -244,12 +244,12 @@ gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg
%c32 = arith.constant 32 : index
%c256 = arith.constant 256 : index
%0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a>
- %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32>
+ %1 = xegpu.load_nd %0[%c0, %c0] { layout = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32>
%3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16>
-> !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
%4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1)
-> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) {
- %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b }
+ %6 = xegpu.load_nd %3[%c0, %arg3] { layout = #b }
: !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x32x16xf16>
%19 = vector.extract %6[0] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16>
%20 = vector.extract %6[1] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16>
@@ -265,10 +265,10 @@ gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg
%12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16>
%13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16>
%14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16>
- %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %18 = xegpu.dpas %arg0, %14, %arg7 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %15 = xegpu.dpas %arg0, %11, %arg4 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %16 = xegpu.dpas %arg0, %12, %arg5 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %17 = xegpu.dpas %arg0, %13, %arg6 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %18 = xegpu.dpas %arg0, %14, %arg7 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>
} {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a}
xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a>
>From 5448d6b498bfa6619bff7991583a4c5a7db0da7d Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 12 Dec 2025 05:03:50 +0000
Subject: [PATCH 17/23] adjusting tests to use anchor layout
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 2 +-
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 3 +-
.../XeGPU/Transforms/XeGPUBlocking.cpp | 6 +-
.../Transforms/XeGPUSubgroupDistribute.cpp | 12 +-
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 149 +++++++++++++++---
.../XeGPU/propagate-layout-inst-data.mlir | 14 +-
mlir/test/Dialect/XeGPU/propagate-layout.mlir | 56 +++----
mlir/test/Dialect/XeGPU/transform-ops.mlir | 2 +-
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 6 +-
.../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 32 ++--
.../XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir | 22 +--
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 62 ++++----
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 45 +++---
13 files changed, 252 insertions(+), 159 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index d0206ebea9359..97c7f69cc5d5f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1119,7 +1119,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
l2_hint = #xegpu.cache_hint<cached>,
l3_hint = #xegpu.cache_hint<cached>,
layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- : memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32>
+ : vector<16xf32>, memref<1024xf32>, vector<16xi1>, vector<16xindex>
```
Example 4 (Lane level):
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 88ff28e7cc3c7..d851ed9e3ccf1 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -113,8 +113,7 @@ void setDistributeLayoutAttr(const OpOperand &opr,
/// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given
/// operation. If the operation contains regions, it is also applied recursively
/// to the contained operations
-void setDistributeLayoutAttrs(
- Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl);
+void retrieveDistributeLayoutAttrsRecursive(Operation *op);
/// Extract a set of small vectors from a value with a given shape using
/// vector.extract_stride_slice
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index b584479db7a7c..bddc8f8c8de68 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -158,7 +158,8 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
bool skipLeadingUnitDimRemoval =
definingOp &&
(isa<xegpu::CreateNdDescOp, xegpu::LoadNdOp, xegpu::DpasOp,
- xegpu::StoreNdOp, xegpu::PrefetchNdOp>(definingOp));
+ xegpu::StoreNdOp, xegpu::PrefetchNdOp, vector::BroadcastOp>(
+ definingOp));
if (!skipLeadingUnitDimRemoval) {
auto it = llvm::find_if(instData, [](auto val) { return val != 1; });
instData.erase(instData.begin(), it);
@@ -283,8 +284,7 @@ void XeGPUBlockingPass::runOnOperation() {
// Preserve the LayoutAttr for each operand to the owner's DictionaryAttr.
// This ensures that the LayoutAttr remains accessible even if the defining
// operation is replaced.
- xegpu::setDistributeLayoutAttrs(
- op, [](Value v) { return xegpu::getDistributeLayoutAttr(v); });
+ xegpu::retrieveDistributeLayoutAttrsRecursive(op);
auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
xegpu::LayoutAttr layout) {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 4be6b44ff3082..1b851d964c1a9 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1240,20 +1240,24 @@ static Value lowerToVectorReductions(TypedValue<VectorType> src,
vector::ExtractStridedSliceOp extractOp =
vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
sliceSizes, {1, 1});
+
int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
+
vector::ShapeCastOp slice = vector::ShapeCastOp::create(
rewriter, loc,
VectorType::get({nSliceElements}, sourceType.getElementType()),
extractOp.getResult());
+
// Shape cast is currently handled in xegpu side. So layouts must be
// retained during lowering. Shape cast output has the same layout as the
// accumulator. Shape cast source has the same layout as the original
// reduction source.
// TODO: other ops generated here may also need layout attributes.
- xegpu::setDistributeLayoutAttr(slice->getOpOperand(0),
- xegpu::getDistributeLayoutAttr(src));
- xegpu::setDistributeLayoutAttr(slice->getOpResult(0),
- xegpu::getDistributeLayoutAttr(acc));
+ auto srcLayout = xegpu::getDistributeLayoutAttr(src);
+ auto accLayout = xegpu::getDistributeLayoutAttr(acc);
+
+ xegpu::setDistributeLayoutAttr(slice->getOpOperand(0), srcLayout);
+ xegpu::setDistributeLayoutAttr(slice->getOpResult(0), accLayout);
// Extract and reduction results in scalars, so no result layout is needed.
Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
Value reduction = vector::ReductionOp::create(
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 755a686baaa06..2c3589bbe6e4f 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -113,52 +113,115 @@ std::string xegpu::getTempLayoutName(const OpResult result) {
}
xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
- if (!value)
+ if (!value) {
+ llvm::errs() << "getDistributeLayoutAttr: value is null\n";
return nullptr;
+ }
+
+ llvm::errs() << "Getting layout for value: " << value << "\n";
if (auto tdescTy =
- dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
- return tdescTy.getLayoutAttr();
+ dyn_cast_if_present<xegpu::TensorDescType>(value.getType())) {
+ auto layoutAttr = tdescTy.getLayoutAttr();
+ llvm::errs() << " Found TensorDescType with layout\n";
+ return layoutAttr;
+ }
if (auto result = dyn_cast<OpResult>(value)) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");
+ llvm::errs() << " Value is OpResult from: " << *defOp << "\n";
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
- return anchorOp.getAnchorLayout();
+ auto layout = anchorOp.getAnchorLayout();
+ llvm::errs() << " Returning anchor layout from defining op\n";
+ return layout;
}
std::string layoutName = getTempLayoutName(result);
- if (defOp->hasAttr(layoutName))
- return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+ if (defOp->hasAttr(layoutName)) {
+ auto layout =
+ defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+ llvm::errs() << " Returning temp layout from attribute: " << layoutName
+ << "\n";
+ return layout;
+ }
+ llvm::errs() << " No layout found for OpResult\n";
}
if (auto arg = dyn_cast<BlockArgument>(value)) {
auto *parentOp = arg.getOwner()->getParentOp();
+ llvm::errs() << " Value is BlockArgument, parent op: " << *parentOp
+ << "\n";
if (auto loop = dyn_cast<LoopLikeOpInterface>(parentOp)) {
OpOperand *tiedInit = loop.getTiedLoopInit(arg);
- if (tiedInit)
- return getDistributeLayoutAttr(tiedInit->get());
+ if (tiedInit) {
+ llvm::errs() << " Following tied loop init\n";
+ auto layout = getDistributeLayoutAttr(tiedInit->get());
+ return layout;
+ }
}
+ llvm::errs() << " No tied loop init found\n";
}
+ llvm::errs() << " Returning nullptr\n";
return nullptr;
}
xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
+ unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber();
+
+ llvm::errs() << "Getting layout for operand " << idx << " of op: " << *op
+ << "\n";
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
- return anchorOp.getAnchorLayout();
+ if (auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
+ if (idx == 0) {
+ llvm::errs() << " Returning DpasOp layout A\n";
+ return dpasOp.getLayoutAAttr();
+ } else if (idx == 1) {
+ llvm::errs() << " Returning DpasOp layout B\n";
+ return dpasOp.getLayoutBAttr();
+ } else if (idx == 2) {
+ llvm::errs() << " Returning DpasOp layout CD\n";
+ return dpasOp.getLayoutCdAttr();
+ }
+ }
+ if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
+ llvm::errs() << " Returning ConvertLayoutOp input layout\n";
+ return convertOp.getInputLayoutAttr();
+ }
+ auto layout = anchorOp.getAnchorLayout();
+ // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
+ // the layout is valid for the first two operands: value and memref/tdesc.
+ // For other operations, the layout applies to the first operand only.
+ if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
+ op)) {
+ if (idx < 2) {
+ llvm::errs() << " Returning anchor layout for store op\n";
+ return layout;
+ }
+ } else {
+ if (idx == 0) {
+ llvm::errs() << " Returning anchor layout\n";
+ return layout;
+ }
+ }
}
std::string layoutName = xegpu::getTempLayoutName(opr);
- if (op->hasAttr(layoutName))
- return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+ if (op->hasAttr(layoutName)) {
+ auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+ llvm::errs() << " Returning temp layout from attribute: " << layoutName
+ << "\n";
+ return layout;
+ }
- // return nullptr;
- return getDistributeLayoutAttr(opr.get());
+ auto layout = getDistributeLayoutAttr(opr.get());
+ llvm::errs() << " Returning layout from operand value\n";
+ return layout;
}
// Returns the permanent layout attribute for the given result if it's
@@ -210,32 +273,70 @@ void xegpu::setDistributeLayoutAttr(
std::string name = xegpu::getTempLayoutName(result);
if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
return;
- owner->setAttr(name, layout);
+ if (layout)
+ owner->setAttr(name, layout);
}
-
void xegpu::setDistributeLayoutAttr(const OpOperand &operand,
const DistributeLayoutAttr layout) {
Operation *owner = operand.getOwner();
+ unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
+  if (!layout)
+    return;
+ if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
+ if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
+ if (idx == 0) {
+ llvm::errs() << " set DpasOp layout A\n";
+ return dpasOp.setLayoutAAttr(layout);
+ } else if (idx == 1) {
+ llvm::errs() << " set DpasOp layout B\n";
+ return dpasOp.setLayoutBAttr(layout);
+ } else if (idx == 2) {
+ llvm::errs() << " set DpasOp layout CD\n";
+ return dpasOp.setLayoutCdAttr(layout);
+ }
+ }
+ if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(owner)) {
+ llvm::errs() << " set ConvertLayoutOp input layout\n";
+ return convertOp.setInputLayoutAttr(layout);
+ }
- std::string name = xegpu::getTempLayoutName(operand);
+ // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
+ // the layout is valid for the first two operands: value and memref/tdesc.
+ // For other operations, the layout applies to the first operand only.
+ if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
+ owner)) {
+ if (idx < 2) {
+ llvm::errs() << " set anchor layout for store op\n";
+ anchorOp.setAnchorLayout(layout);
+ }
+ } else {
+ if (idx == 0) {
+ llvm::errs() << " set anchor layout\n";
+ anchorOp.setAnchorLayout(layout);
+ }
+ }
+ }
+ std::string name = xegpu::getTempLayoutName(operand);
if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
return;
- owner->setAttr(name, layout);
+ if (layout)
+ owner->setAttr(name, layout);
}
-void xegpu::setDistributeLayoutAttrs(
- Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl) {
+void xegpu::retrieveDistributeLayoutAttrsRecursive(Operation *op) {
op->walk([&](Operation *nestOp) {
- if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(nestOp))
- return;
-
for (OpOperand &opr : nestOp->getOpOperands()) {
- auto layout = getLayoutImpl(opr.get());
+ auto layout = getDistributeLayoutAttr(opr.get());
+ llvm::errs() << "Setting layout for operand " << opr.getOperandNumber()
+ << " of op: " << *nestOp << "\n";
setDistributeLayoutAttr(opr, layout);
}
+
for (OpResult result : nestOp->getOpResults()) {
- auto layout = getLayoutImpl(result);
+ auto layout = getDistributeLayoutAttr(result);
+ llvm::errs() << "Setting layout for result " << result.getResultNumber()
+ << " of op: " << *nestOp << "\n";
setDistributeLayoutAttr(result, layout);
}
});
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 32fb3178a8af2..5f70831f45e97 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -8,7 +8,7 @@
// CHECK: %[[TDESC_DST:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
// CHECK: xegpu.prefetch_nd %[[TDESC_SRC]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [8, 16]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
-// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 <{layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 <{layout = #xegpu.layout<inst_data = [8, 16]>}>
// CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x32xf32>
// CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
gpu.module @test {
@@ -32,11 +32,11 @@ func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>
// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>>
-// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout<inst_data = [16, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [16, 16]>} :
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout<inst_data = [16, 16]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf16>
-// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout<inst_data = [8, 16]>, layout_b = #xegpu.layout<inst_data = [16, 16]>, layout_cd = #xegpu.layout<inst_data = [8, 16]>, layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout<inst_data = [8, 16]>, layout_b = #xegpu.layout<inst_data = [16, 16]>, layout_cd = #xegpu.layout<inst_data = [8, 16]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>
// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>>
@@ -73,7 +73,7 @@ gpu.module @test_kernel {
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+ //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout<inst_data = [8, 16]>}> :
//CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<16x32xf16>
%a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
@@ -112,7 +112,7 @@ gpu.module @test_kernel {
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout<inst_data = [4, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} :
+ //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout<inst_data = [4, 16]>}> :
//CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>> -> vector<12x32xf16>
%a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
@@ -141,7 +141,7 @@ gpu.module @test {
// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64, layout = #xegpu.layout<inst_data = [16, 8]>}>
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64, layout = #xegpu.layout<inst_data = [16, 8]>}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 48e77d867508b..b88d8e1a78a26 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -6,11 +6,11 @@ gpu.module @test {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -32,7 +32,7 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me
gpu.module @test {
// CHECK-LABEL: func.func @dpas_i8(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) {
-// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
%c0 = arith.constant 0 : index
@@ -47,7 +47,7 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre
gpu.module @test {
// CHECK-LABEL: func.func @load_with_transpose_effect(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
@@ -109,7 +109,7 @@ gpu.module @test {
// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> ->
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>, vector<16xi1> -> vector<16x16xf16>
func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
@@ -136,7 +136,7 @@ gpu.module @test {
// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> ->
// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1> -> vector<16xf32>
func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
@@ -185,7 +185,7 @@ gpu.module @test {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
@@ -204,7 +204,7 @@ gpu.module @test {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
@@ -221,7 +221,7 @@ gpu.module @test {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16>
// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
@@ -241,7 +241,7 @@ gpu.module @test {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16>
// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
@@ -257,9 +257,9 @@ func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) {
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_f16(
-// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16>
-// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xi16>
// CHECK: %{{.*}} = vector.bitcast %[[LOAD0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: vector<8x16xi16> to vector<8x16xf16>
@@ -282,7 +282,7 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i32_to_f16(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: vector<16x8xi32> to vector<16x16xf16>
@@ -303,7 +303,7 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_i32(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}>
// CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>> -> vector<8x32xi16>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: vector<8x32xi16> to vector<8x16xi32>
@@ -340,9 +340,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16>
func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
@@ -363,7 +363,7 @@ gpu.module @test {
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
@@ -386,11 +386,11 @@ gpu.module @test {
// CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) ->
// CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>) {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
@@ -426,11 +426,11 @@ gpu.module @test {
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16>
// CHECK-NEXT: } else {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16>
// CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
@@ -456,11 +456,11 @@ gpu.module @test {
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16>
// CHECK-NEXT: } else {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16>
// CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
@@ -600,7 +600,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
@@ -622,7 +622,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1]
@@ -645,7 +645,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_broadcast_1d_to_2d_broadcast_along_row(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
diff --git a/mlir/test/Dialect/XeGPU/transform-ops.mlir b/mlir/test/Dialect/XeGPU/transform-ops.mlir
index 561034fb5880b..13ed24ebf0a3a 100644
--- a/mlir/test/Dialect/XeGPU/transform-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/transform-ops.mlir
@@ -149,7 +149,7 @@ func.func @set_op_layout_attr_result_default_index(%arg0: memref<4096x4096xf16>,
%4 = xegpu.create_nd_tdesc %arg2 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x256xf16>
%5 = xegpu.load_nd %4[0, 0] : !xegpu.tensor_desc<256x256xf16> -> vector<256x256xf16>
// CHECK: = xegpu.dpas
- // CHECK-SAME: {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], inst_data = [8, 16]>}
+ // CHECK-SAME: {layout_cd = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], inst_data = [8, 16]>}
%6 = xegpu.dpas %1, %3, %5 : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf16> -> vector<256x256xf16>
return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index b3032bac351ac..9f9edcd416ddf 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -27,7 +27,7 @@ gpu.module @test_kernel {
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
%b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
//CHECK-COUNT-8: xegpu.dpas {{.*}}
- %c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ %c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
@@ -513,8 +513,8 @@ gpu.module @test_kernel {
//CHECK: [[c0:%.+]] = arith.constant 0 : index
//CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
//CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
- //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] <{layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
- //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] <{layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] <{layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] <{layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
//CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>}> : vector<16x16xf16>
//CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
//CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 4829af3612de3..9b00adaaa56da 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -41,7 +41,7 @@ gpu.module @test_round_robin_assignment {
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-COUNT-4: xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
// CHECK-NOT: xegpu.load_nd
- %load = xegpu.load_nd %tdesc
+ %load = xegpu.load_nd %tdesc {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
gpu.return
@@ -54,8 +54,7 @@ gpu.module @test_round_robin_assignment {
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NOT: xegpu.store_nd
- %load = xegpu.load_nd %tdesc
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ %load = xegpu.load_nd %tdesc {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
xegpu.store_nd %load, %tdesc
: vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -79,20 +78,17 @@ gpu.module @test_round_robin_assignment {
gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) {
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
// CHECK-NOT: xegpu.dpas
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16>
-> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
- : !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x256xf16>
-> !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
- %load_b = xegpu.load_nd %tdesc_b
- : !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<128x256xf16>
- %dpas = xegpu.dpas %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ %dpas = xegpu.dpas %load_a, %load_b {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32>
gpu.return
}
@@ -114,7 +110,7 @@ gpu.module @test_round_robin_assignment {
gpu.func @broadcast(%src: memref<128x1xf32>) {
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<128x1xf32>
-> !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc
+ %load = xegpu.load_nd %tdesc {layout = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>}
: !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
-> vector<128x1xf32>
// CHECK-COUNT-2: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} : vector<16x1xf32> to vector<16x32xf32>
@@ -137,7 +133,7 @@ gpu.module @test_round_robin_assignment {
// CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>)
%2:2 = scf.for %arg2 = %c0 to %c1024 step %c256 iter_args(%arg3 = %0, %arg4 = %1)
-> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>) {
- %3 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %3 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
xegpu.store_nd %3, %arg3 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
%4 = xegpu.update_nd_offset %arg3, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
%5 = xegpu.update_nd_offset %arg4, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
@@ -153,7 +149,7 @@ gpu.module @test_round_robin_assignment {
%c10_i32 = arith.constant 10 : i32
%c0_i32 = arith.constant 0 : i32
%0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %1 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
%2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
// CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32)
%3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) {
@@ -166,7 +162,7 @@ gpu.module @test_round_robin_assignment {
xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
%4 = arith.addi %arg3, %c1_i32 : i32
%5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %6 = xegpu.load_nd %5 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
scf.yield %6, %4 : vector<256xf32>, i32
}
gpu.return
@@ -181,12 +177,12 @@ gpu.module @test_round_robin_assignment {
// CHECK-LABEL: scf.if
// CHECK-SAME: (vector<16xf32>, vector<16xf32>)
%4 = scf.if %3 -> (vector<256xf32>) {
- %5 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %5 = xegpu.load_nd %1 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: vector<16xf32>, vector<16xf32>
scf.yield %5 : vector<256xf32>
} else {
- %5 = xegpu.load_nd %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %5 = xegpu.load_nd %2 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: vector<16xf32>, vector<16xf32>
scf.yield %5 : vector<256xf32>
@@ -200,7 +196,7 @@ gpu.module @test_round_robin_assignment {
%id = gpu.subgroup_id : index
%t = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %d = xegpu.load_nd %t : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %d = xegpu.load_nd %t {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>}: !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
%0 = arith.cmpi eq, %id, %c10 : index
// CHECK-LABEL: scf.if
@@ -224,7 +220,7 @@ gpu.module @test_round_robin_assignment {
%0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>>
// CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32>
// CHECK-COUNT-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32>
- %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32>
+ %1 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>} : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32>
%2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>,
target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32>
gpu.return
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
index c95c64084f3f8..b3ba99887d763 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
@@ -14,11 +14,11 @@ gpu.module @test_distribution {
// CHECK-LABEL: load_nd_tdesc_with_offset
gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) {
- // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
+ // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
// CHECK-NOT: xegpu.load_nd
%tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
gpu.return
@@ -30,7 +30,7 @@ gpu.module @test_distribution {
// CHECK-NOT: xegpu.store_nd
%tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
xegpu.store_nd %load, %tdesc[0, 0]
@@ -53,23 +53,23 @@ gpu.module @test_distribution {
// CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>)
gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) {
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+ // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
- // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+ // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
// CHECK-NOT: xegpu.dpas
%tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16>
-> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a[0, 0]
+ %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b : memref<128x256xf16>
-> !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
- %load_b = xegpu.load_nd %tdesc_b[0, 0]
+ %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>}
: !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<128x256xf16>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32>
gpu.return
}
@@ -80,7 +80,7 @@ gpu.module @test_distribution {
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>} dense<1.0> : vector<256xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32>
-> !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>}
: !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>>
-> vector<256x64xf32>
// CHECK-COUNT-2: vector.multi_reduction <add>, {{.*}}, %[[CST]] [1] : vector<16x64xf32> to vector<16xf32>
@@ -119,7 +119,7 @@ gpu.module @test_distribution {
gpu.func @vector_transpose(%src: memref<256x128xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
-> vector<256x128xf32>
// CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<32x16xf32> to vector<16x32xf32>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index c557a2f140527..9179344b8fc2b 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -38,10 +38,10 @@ gpu.module @test_distribution {
//CHECK-DAG: %[[OFF_Y:.*]] = arith.remui %[[L_OFF_Y]], %[[C256]] : index
//CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index
//CHECK-DAG: %[[OFF_X:.*]] = arith.remui %[[L_OFF_X]], %[[C128]] : index
- //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32>
+ //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
gpu.return
@@ -53,7 +53,7 @@ gpu.module @test_distribution {
//CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
xegpu.store_nd %load, %tdesc[0, 0]
@@ -75,42 +75,43 @@ gpu.module @test_distribution {
// CHECK-LABEL: dpas
gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
+ // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
%tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a[0, 0]
+ %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<128x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
- %load_b = xegpu.load_nd %tdesc_b[0, 0]
+ %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>}
: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<128x128xf16>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
gpu.return
}
// CHECK-LABEL: dpas_no_sg_data
gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
%tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
order = [1, 0]>>
- %load_a = xegpu.load_nd %tdesc_a[0, 0]
+ %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
+ order = [1, 0]>}
: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
order = [1, 0]>>
-> vector<128x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
order = [1, 0]>>
- %load_b = xegpu.load_nd %tdesc_b[0, 0]
+ %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]> }
: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
order = [1, 0]>>
-> vector<128x128xf16>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
+ {layout_cd = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
gpu.return
}
@@ -119,7 +120,7 @@ gpu.module @test_distribution {
gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) {
// CHECK-NOT: vector<32x32xf32>
%dpas = xegpu.dpas %a, %b
- {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32>
gpu.return
}
@@ -129,7 +130,7 @@ gpu.module @test_distribution {
gpu.func @broadcast_dim1(%src: memref<256x1xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<256x1xf32>
-> !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
-> vector<256x1xf32>
// CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>}
@@ -145,7 +146,7 @@ gpu.module @test_distribution {
gpu.func @broadcast_dim0(%src: memref<1x128xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<1x128xf32>
-> !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<1x128xf32>
// CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
@@ -175,9 +176,9 @@ gpu.module @test_distribution {
%3 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
%4 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
// load_nd with offset
- %5 = xegpu.load_nd %2[%0, %1] : !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
- %6 = xegpu.load_nd %3[%0, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
- %7 = xegpu.load_nd %4[%c0, %1] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+ %5 = xegpu.load_nd %2[%0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}: !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
+ %6 = xegpu.load_nd %3[%0, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
+ %7 = xegpu.load_nd %4[%c0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
// scf.for loop
// CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]]
// CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) ->
@@ -189,10 +190,10 @@ gpu.module @test_distribution {
%8:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %6, %arg5 = %7, %arg6 = %5)
-> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) {
// load_nd with offset inside loop
- %9 = xegpu.dpas %arg4, %arg5, %arg6 {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+ %9 = xegpu.dpas %arg4, %arg5, %arg6 {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
: vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
- %10 = xegpu.load_nd %3[%arg3, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
- %11 = xegpu.load_nd %4[%c0, %arg3] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+ %10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
+ %11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>
}
// store_nd with offset
@@ -215,7 +216,7 @@ gpu.module @test_distribution {
// CHECK-NOT: index.sub
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
-> vector<256x128xf32>
} {sg_id_range = #xegpu.range<[0, 32]>}
@@ -228,7 +229,7 @@ gpu.module @test_distribution {
// CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]]
%tdesc = xegpu.create_nd_tdesc %src2 : memref<128x64xf32>
-> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>}
: !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
-> vector<128x64xf32>
%exp = math.exp %load {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
@@ -244,7 +245,7 @@ gpu.module @test_distribution {
%c32 = arith.constant 32 : index
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
-> vector<256x128xf32>
%cond1 = arith.cmpi sge, %sg_id, %c3 : index
@@ -257,7 +258,7 @@ gpu.module @test_distribution {
// CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C3]]
%td = xegpu.create_nd_tdesc %src1 : memref<128x64xf32>
-> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
- %ld = xegpu.load_nd %td[0, 0]
+ %ld = xegpu.load_nd %td[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>}
: !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
-> vector<128x64xf32>
%exp = math.exp %ld {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
@@ -287,13 +288,11 @@ gpu.module @test_distribution {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex>
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1>
// CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}>
- // CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>,
- // CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>}
- // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
+ // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
%val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<25.5> : vector<256xf16>
%offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<0> : vector<256xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<1> : vector<256xi1>
- xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
+ xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
layout_operand_2 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
layout_operand_3 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
l1_hint = #xegpu.cache_hint<cached>}
@@ -370,7 +369,7 @@ gpu.module @test_distribution {
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>} dense<1.0> : vector<128xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<4x128xf32>
-> !xegpu.tensor_desc<4x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>}
: !xegpu.tensor_desc<4x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>>
-> vector<4x128xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [0] : vector<4x4xf32> to vector<4xf32>
@@ -384,7 +383,7 @@ gpu.module @test_distribution {
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>} dense<1.0> : vector<256xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32>
-> !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>}
: !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>>
-> vector<256x64xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32>
@@ -468,7 +467,7 @@ gpu.module @test_distribution {
gpu.func @vector_transpose(%src: memref<256x32xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<256x32xf32>
-> !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>}
: !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
-> vector<256x32xf32>
//CHECK: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<64x32xf32> to vector<32x64xf32>
@@ -624,7 +623,6 @@ gpu.module @test_distribution {
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<1> : vector<256xi1>
// CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>}>
- // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>} :
// CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32>
%3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index a8015cced7eb4..fb77cb93b60fc 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -89,17 +89,15 @@ gpu.module @test_1_1_assignment {
gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
- : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<128x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
- %load_b = xegpu.load_nd %tdesc_b
- : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<128x128xf16>
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
+ // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
gpu.return
}
@@ -109,20 +107,18 @@ gpu.module @test_1_1_assignment {
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
order = [1, 0]>>
- %load_a = xegpu.load_nd %tdesc_a
- : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
order = [1, 0]>>
-> vector<128x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
order = [1, 0]>>
- %load_b = xegpu.load_nd %tdesc_b
- : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
order = [1, 0]>>
-> vector<128x128xf16>
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
+ {layout_cd = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
gpu.return
}
@@ -194,25 +190,24 @@ gpu.module @test_1_1_assignment {
%0 = arith.muli %block_id_x, %c128 : index
%1 = arith.muli %block_id_y, %c128 : index
%2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
- %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
+ %3 = xegpu.load_nd %2 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>} : !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
%4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
%5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
// CHECK: %[[SCF:.*]]:3 = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]]
// CHECK-SAME: iter_args(%[[ARG4:.*]] = {{.*}}, %[[ARG5:.*]] = {{.*}}, %[[ARG6:.*]] = {{.*}}) ->
// CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>)
- // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16>
- // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16>
+ // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}> : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16>
+ // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}> : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16>
// CHECK: %[[C:.*]] = xegpu.dpas %[[A]], %[[B]], %[[ARG6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32>
// CHECK: %[[AT:.*]] = xegpu.update_nd_offset %[[ARG4]], [%[[C0]], %[[C128]]] : !xegpu.tensor_desc<16x128xf16>
// CHECK: %[[BT:.*]] = xegpu.update_nd_offset %[[ARG5]], [%[[C128]], %[[C0]]] : !xegpu.tensor_desc<128x16xf16>
// CHECK: scf.yield %[[AT]], %[[BT]], %[[C]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>
%6:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3)
- -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>,
- !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>) {
- %8 = xegpu.load_nd %arg4 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
- %9 = xegpu.load_nd %arg5 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
- %10 = xegpu.dpas %8, %9, %arg6 {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+ -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>, !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>) {
+ %8 = xegpu.load_nd %arg4 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
+ %9 = xegpu.load_nd %arg5 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+ %10 = xegpu.dpas %8, %9, %arg6 {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
: vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
%11 = xegpu.update_nd_offset %arg4, [%c0, %c128] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
%12 = xegpu.update_nd_offset %arg5, [%c128, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
@@ -221,7 +216,7 @@ gpu.module @test_1_1_assignment {
}
%7 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32>
-> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
- xegpu.store_nd %6#2, %7 : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+ xegpu.store_nd %6#2, %7 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]> } : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
gpu.return
}
@@ -230,7 +225,7 @@ gpu.module @test_1_1_assignment {
%c10_i32 = arith.constant 10 : i32
%c0_i32 = arith.constant 0 : i32
%0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+ %1 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
%2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
// CHECK: scf.while {{.*}} : (vector<16xf32>, i32) -> (vector<16xf32>, i32)
@@ -244,7 +239,7 @@ gpu.module @test_1_1_assignment {
xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
%4 = arith.addi %arg3, %c1_i32 : i32
%5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+ %6 = xegpu.load_nd %5 {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
scf.yield %6, %4 : vector<256xf32>, i32
}
gpu.return
@@ -263,14 +258,14 @@ gpu.module @test_1_1_assignment {
%5 = scf.if %4 -> (vector<256xf32>) {
// CHECK-LABEL: xegpu.load_nd
// CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
- %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+ %2 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: vector<16xf32>
scf.yield %2 : vector<256xf32>
} else {
// CHECK-LABEL: xegpu.load_nd
// CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
- %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+ %3 = xegpu.load_nd %1 {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: vector<16xf32>
scf.yield %3 : vector<256xf32>
>From efcc1b72b9de1e38aa5883eba7beec80ff912aae Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 12 Dec 2025 19:44:36 +0000
Subject: [PATCH 18/23] fixing XeGPU tests
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 63 ++++++++++++++++--
.../Transforms/XeGPUWgToSgDistribute.cpp | 16 +++--
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 65 +++++++++++++++----
.../XeGPU/subgroup-distribute-unit.mlir | 7 +-
.../Dialect/XeGPU/subgroup-distribute.mlir | 31 +++++----
.../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 7 +-
.../XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir | 6 +-
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 25 +++++--
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 23 +++++--
.../Dialect/XeGPU/WG/simple_gemm.mlir | 26 ++++----
10 files changed, 196 insertions(+), 73 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 1b851d964c1a9..4e663e1dd6694 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -630,22 +630,26 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
- if (!operand)
+ if (!operand) {
+ DBGS() << "No dpas op result found in warp op\n";
return rewriter.notifyMatchFailure(warpOp,
"warp result is not a xegpu::Dpas op");
+ }
auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
unsigned operandIdx = operand->getOperandNumber();
- std::string layoutAName = xegpu::getTempLayoutName(dpasOp->getOpOperand(0));
- std::string layoutBName = xegpu::getTempLayoutName(dpasOp->getOpOperand(1));
- std::string layoutCName = xegpu::getTempLayoutName(dpasOp->getOpResult(0));
+ DBGS() << "Found dpas op: " << *dpasOp << "\n";
xegpu::LayoutAttr layoutA =
- dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutAName);
+ dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutAAttr());
xegpu::LayoutAttr layoutB =
- dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutBName);
+ dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutBAttr());
xegpu::LayoutAttr layoutOut =
- dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutCName);
+ dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutCdAttr());
+
+ DBGS() << "Layout A: " << layoutA << ", Layout B: " << layoutB
+ << ", Layout Out: " << layoutOut << "\n";
+
if (!layoutA || !layoutB || !layoutOut)
return rewriter.notifyMatchFailure(
dpasOp,
@@ -657,6 +661,24 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
+
+ DBGS() << "Original LHS type: " << dpasOp.getLhsType() << ", distributed: "
+ << (succeeded(distLhsTypeByWarpOpOrFailure)
+ ? distLhsTypeByWarpOpOrFailure.value()
+ : Type())
+ << "\n";
+ DBGS() << "Original RHS type: " << dpasOp.getRhsType() << ", distributed: "
+ << (succeeded(distRhsTypeByWarpOpOrFailure)
+ ? distRhsTypeByWarpOpOrFailure.value()
+ : Type())
+ << "\n";
+ DBGS() << "Original Result type: " << dpasOp.getResultType()
+ << ", distributed: "
+ << (succeeded(distResultTypeByWarpOpOrFailure)
+ ? distResultTypeByWarpOpOrFailure.value()
+ : Type())
+ << "\n";
+
if (failed(distLhsTypeByWarpOpOrFailure) ||
failed(distRhsTypeByWarpOpOrFailure) ||
failed(distResultTypeByWarpOpOrFailure))
@@ -671,11 +693,14 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
distRhsTypeByWarpOpOrFailure.value()};
// Dpas acc operand is optional.
if (dpasOp.getAcc()) {
+ DBGS() << "Dpas has accumulator operand\n";
newYieldValues.push_back(dpasOp.getAcc());
newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
}
// Create a new warp op without the dpas.
SmallVector<size_t> newRetIndices;
+ DBGS() << "Creating new warp op with " << newYieldValues.size()
+ << " yielded values\n";
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
@@ -685,6 +710,23 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
FailureOr<VectorType> expectedDistResultTyOrFailure =
xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
+
+ DBGS() << "Expected dist LHS type: "
+ << (succeeded(expectedDistLhsTyOrFailure)
+ ? expectedDistLhsTyOrFailure.value()
+ : Type())
+ << "\n";
+ DBGS() << "Expected dist RHS type: "
+ << (succeeded(expectedDistRhsTyOrFailure)
+ ? expectedDistRhsTyOrFailure.value()
+ : Type())
+ << "\n";
+ DBGS() << "Expected dist Result type: "
+ << (succeeded(expectedDistResultTyOrFailure)
+ ? expectedDistResultTyOrFailure.value()
+ : Type())
+ << "\n";
+
if (failed(expectedDistLhsTyOrFailure) ||
failed(expectedDistRhsTyOrFailure) ||
failed(expectedDistResultTyOrFailure))
@@ -704,6 +746,9 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
newDpasOperandExpectedTypes.push_back(distributedResultTy);
for (unsigned i = 0; i < newRetIndices.size(); i++) {
+ DBGS() << "Resolving operand " << i << " with type "
+ << newWarpOp.getResult(newRetIndices[i]).getType() << " to "
+ << newDpasOperandExpectedTypes[i] << "\n";
newDpasOperands.push_back(
resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
newDpasOperandExpectedTypes[i], rewriter));
@@ -711,13 +756,17 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
distributedResultTy, newDpasOperands,
dpasOp->getAttrs());
+ DBGS() << "Created new dpas op: " << *newDpasOp << "\n";
xegpu::removeLayoutAttrs(newDpasOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
// Resolve the output type.
+ DBGS() << "Resolving output from " << newDpasOp.getResult().getType()
+ << " to " << distResultTypeByWarpOpOrFailure.value() << "\n";
Value typeResolved =
resolveDistributedTy(newDpasOp.getResult(),
distResultTypeByWarpOpOrFailure.value(), rewriter);
rewriter.replaceAllUsesWith(distributedVal, typeResolved);
+ DBGS() << "Successfully distributed dpas op\n";
return success();
}
};
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index a42b4c394a476..af90674d4ce0d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -425,10 +425,11 @@ struct WgToSgDpasOp : public OpConversionPattern<xegpu::DpasOp> {
if (resultTy.getRank() != 2)
return failure();
- auto originalLayout = xegpu::getDistributeLayoutAttr(op.getResult());
- if (!originalLayout)
+ auto layoutCd = op.getLayoutCdAttr();
+ auto layoutA = op.getLayoutAAttr();
+ auto layoutB = op.getLayoutBAttr();
+ if (!layoutCd || !layoutA || !layoutB)
return failure();
-
size_t i = 0;
SmallVector<Value> newDpasOps;
for (auto aVec : adaptor.getLhs()) {
@@ -447,11 +448,12 @@ struct WgToSgDpasOp : public OpConversionPattern<xegpu::DpasOp> {
llvm::cast<VectorType>(bVec.getType()).getShape();
VectorType resTy = VectorType::get({aVecShape[0], bVecShape[1]},
resultTy.getElementType());
- tmpC = xegpu::DpasOp::create(rewriter, loc, resTy, operands);
- xegpu::setDistributeLayoutAttr(cast<OpResult>(tmpC),
- originalLayout.dropSgLayoutAndData());
+ auto newDpasOp = xegpu::DpasOp::create(rewriter, loc, resTy, operands);
+ newDpasOp.setLayoutCdAttr(layoutCd.dropSgLayoutAndData());
+ newDpasOp.setLayoutAAttr(layoutA.dropSgLayoutAndData());
+ newDpasOp.setLayoutBAttr(layoutB.dropSgLayoutAndData());
- newDpasOps.push_back(tmpC);
+ newDpasOps.push_back(newDpasOp);
}
}
rewriter.replaceOpWithMultiple(op, {newDpasOps});
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 2c3589bbe6e4f..f4da557a8fa32 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -264,39 +264,53 @@ void xegpu::setDistributeLayoutAttr(
const mlir::xegpu::DistributeLayoutAttr layout) {
Operation *owner = result.getOwner();
+ llvm::errs() << "Setting layout for result of op: " << *owner << "\n";
+
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
if (anchorOp.getAnchorLayout() == layout)
return;
- return anchorOp.setAnchorLayout(layout);
+ llvm::errs() << " Setting anchor layout\n";
+ anchorOp.setAnchorLayout(layout);
+ llvm::errs() << " After setting anchor layout, op: " << *owner << "\n";
+ return;
}
std::string name = xegpu::getTempLayoutName(result);
- if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
+ if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
return;
- if (layout)
+ }
+ if (layout) {
+ llvm::errs() << " Setting temp layout attribute: " << name << "\n";
owner->setAttr(name, layout);
+ llvm::errs() << " After setting temp layout, op: " << *owner << "\n";
+ }
}
void xegpu::setDistributeLayoutAttr(const OpOperand &operand,
const DistributeLayoutAttr layout) {
Operation *owner = operand.getOwner();
unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
- if (layout)
+
+  llvm::errs() << "Setting layout for operand " << idx << " of op: " << *owner
+ << " with layout: " << layout << "\n";
+
+ if (!layout) {
return;
+ }
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
if (idx == 0) {
- llvm::errs() << " set DpasOp layout A\n";
+ llvm::errs() << " Setting DpasOp layout A\n";
return dpasOp.setLayoutAAttr(layout);
} else if (idx == 1) {
- llvm::errs() << " set DpasOp layout B\n";
+ llvm::errs() << " Setting DpasOp layout B\n";
return dpasOp.setLayoutBAttr(layout);
} else if (idx == 2) {
- llvm::errs() << " set DpasOp layout CD\n";
+ llvm::errs() << " Setting DpasOp layout CD\n";
return dpasOp.setLayoutCdAttr(layout);
}
}
if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(owner)) {
- llvm::errs() << " set ConvertLayoutOp input layout\n";
+ llvm::errs() << " Setting ConvertLayoutOp input layout\n";
return convertOp.setInputLayoutAttr(layout);
}
@@ -306,22 +320,28 @@ void xegpu::setDistributeLayoutAttr(const OpOperand &operand,
if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
owner)) {
if (idx < 2) {
- llvm::errs() << " set anchor layout for store op\n";
+ llvm::errs() << " Setting anchor layout for store op\n";
anchorOp.setAnchorLayout(layout);
+ llvm::errs() << " After setting anchor layout, op: " << *owner << "\n";
}
} else {
if (idx == 0) {
- llvm::errs() << " set anchor layout\n";
+ llvm::errs() << " Setting anchor layout\n";
anchorOp.setAnchorLayout(layout);
+ llvm::errs() << " After setting anchor layout, op: " << *owner << "\n";
}
}
}
std::string name = xegpu::getTempLayoutName(operand);
- if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
+ if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
return;
- if (layout)
+ }
+ if (layout) {
+ llvm::errs() << " Setting temp layout attribute: " << name << "\n";
owner->setAttr(name, layout);
+ llvm::errs() << " After setting temp layout, op: " << *owner << "\n";
+ }
}
void xegpu::retrieveDistributeLayoutAttrsRecursive(Operation *op) {
@@ -448,6 +468,11 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(
.getResult(0);
};
+ llvm::errs()
+ << "\n=== Stage 1: Converting VectorType to RankedTensorType ===\n";
+ llvm::errs() << "Before conversion:\n";
+ op->dump();
+
{ // convert VectorType to RankedTensorType for SCF Structural ops
TypeConverter converter;
converter.addConversion([](Type type) -> Type { return type; });
@@ -466,6 +491,12 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(
(void)mlir::applyPartialConversion(op, target, std::move(patterns));
}
+ llvm::errs() << "\nAfter Stage 1:\n";
+ op->dump();
+
+ llvm::errs() << "\n=== Stage 2: Propagating layout attributes to "
+ "RankedTensorType ===\n";
+
{ // propagate the layout attribute to RankedTensorType by checking
// BuiltInUnrealizedCastOps
// for VectorType to RankedTensorType cast.
@@ -520,6 +551,12 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(
});
}
+ llvm::errs() << "\nAfter Stage 2:\n";
+ op->dump();
+
+ llvm::errs()
+ << "\n=== Stage 3: Converting RankedTensorType back to VectorType ===\n";
+
{ // perform the conversion from RankedTensorType to VectorType based on the
// DistributeLayoutAttr
@@ -583,6 +620,10 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(
target);
(void)mlir::applyPartialConversion(op, target, std::move(patterns));
}
+
+ llvm::errs() << "\nAfter Stage 3 (final):\n";
+ op->dump();
+ llvm::errs() << "\n=== Transformation complete ===\n\n";
}
std::optional<std::string> xegpu::getChipStr(Operation *op) {
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 216f3d19cff94..6f4678400239e 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -140,10 +140,9 @@ gpu.func @dpas(%laneid: index) {
%2 = "some_op"() : () -> vector<8x16xf32>
%3 = xegpu.dpas %0, %1, %2
{
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_operand_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
}
: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
gpu.yield %3 : vector<8x16xf32>
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index ee07b92f6e795..87c67ba6bf324 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -20,18 +20,20 @@ gpu.module @xevm_module{
%0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
-> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%1 = xegpu.load_nd %0[%c0, %c0]
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
!xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
%2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16>
-> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
%3 = xegpu.load_nd %2[%c0, %c0]
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<16x16xf16>
%4 = xegpu.dpas %1, %3
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
%5 = math.exp %4
@@ -84,7 +86,7 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
%2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
!xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%3 = xegpu.load_nd %2[%0, %1]
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
%4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
@@ -95,14 +97,16 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
-> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
%7 = xegpu.load_nd %5[%0, %arg3]
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
%8 = xegpu.load_nd %6[%arg3, %1]
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
: !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
%9 = xegpu.dpas %7, %8, %arg4
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
scf.yield %9 : vector<8x16xf32>
@@ -194,17 +198,20 @@ gpu.module @xevm_module{
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
-> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
%2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32>
-> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
- %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ %3 = xegpu.load_nd %2[%c0, %c0] {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
%4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
: vector<16x8xi32> to vector<16x16xf16>
%5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
: vector<16x16xf16> to vector<16x16xf16>
- %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ %6 = xegpu.dpas %1, %5
+ {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
%7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32>
-> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -341,7 +348,7 @@ gpu.module @xevm_module{
-> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
-> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+ %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
%1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
// CHECK: %[[BCAST:.*]] = vector.broadcast %{{.*}} : f16 to vector<16xf16>
%2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x16xf16>
@@ -356,7 +363,7 @@ gpu.module @xevm_module{
gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) {
%c0 = arith.constant 0 : index
%mask = vector.constant_mask [16] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: vector<16xi1>
- %1 = xegpu.load %arg0[%c0], %mask {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16>
+ %1 = xegpu.load %arg0[%c0], %mask {layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16>
%11 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x1xf16>
%2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 9b00adaaa56da..6b8b4f282b744 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -78,7 +78,7 @@ gpu.module @test_round_robin_assignment {
gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) {
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
// CHECK-NOT: xegpu.dpas
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16>
-> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -88,7 +88,10 @@ gpu.module @test_round_robin_assignment {
-> !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
%load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<128x256xf16>
- %dpas = xegpu.dpas %load_a, %load_b {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ %dpas = xegpu.dpas %load_a, %load_b
+ {layout_a = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
index b3ba99887d763..f9a79b36c5790 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
@@ -56,7 +56,7 @@ gpu.module @test_distribution {
// CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
// CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
- // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
// CHECK-NOT: xegpu.dpas
%tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16>
-> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -69,7 +69,9 @@ gpu.module @test_distribution {
: !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<128x256xf16>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 9179344b8fc2b..da6ad976d3730 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -75,7 +75,7 @@ gpu.module @test_distribution {
// CHECK-LABEL: dpas
gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
+ // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
%tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
%load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>}
@@ -87,14 +87,16 @@ gpu.module @test_distribution {
: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<128x128xf16>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
gpu.return
}
// CHECK-LABEL: dpas_no_sg_data
gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
%tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
order = [1, 0]>>
@@ -111,7 +113,11 @@ gpu.module @test_distribution {
order = [1, 0]>>
-> vector<128x128xf16>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_cd = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
+ order = [1, 0]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
+ order = [1, 0]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
gpu.return
}
@@ -120,7 +126,9 @@ gpu.module @test_distribution {
gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) {
// CHECK-NOT: vector<32x32xf32>
%dpas = xegpu.dpas %a, %b
- {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32>
gpu.return
}
@@ -190,8 +198,11 @@ gpu.module @test_distribution {
%8:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %6, %arg5 = %7, %arg6 = %5)
-> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) {
// load_nd with offset inside loop
- %9 = xegpu.dpas %arg4, %arg5, %arg6 {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
- : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
+ %9 = xegpu.dpas %arg4, %arg5, %arg6
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+ : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
%10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
%11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index fb77cb93b60fc..50081ed34fe78 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -95,9 +95,11 @@ gpu.module @test_1_1_assignment {
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
%load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<128x128xf16>
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
+ // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
gpu.return
}
@@ -116,9 +118,11 @@ gpu.module @test_1_1_assignment {
%load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
order = [1, 0]>>
-> vector<128x128xf16>
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_cd = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
gpu.return
}
@@ -141,7 +145,9 @@ gpu.module @test_1_1_assignment {
gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) {
// CHECK-NOT: vector<32x32xf32>
%dpas = xegpu.dpas %a, %b
- {layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [12, 12], lane_layout = [2, 2], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], inst_data=[8, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], inst_data=[8, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], inst_data=[8, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32>
gpu.return
}
@@ -207,8 +213,11 @@ gpu.module @test_1_1_assignment {
-> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>, !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>) {
%8 = xegpu.load_nd %arg4 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
%9 = xegpu.load_nd %arg5 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
- %10 = xegpu.dpas %8, %9, %arg6 {layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
- : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
+ %10 = xegpu.dpas %8, %9, %arg6
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+ : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
%11 = xegpu.update_nd_offset %arg4, [%c0, %c128] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
%12 = xegpu.update_nd_offset %arg5, [%c128, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
scf.yield %11, %12, %10 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>,
diff --git a/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir b/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir
index 3f2fff9ab51e9..37f6d33e8ac30 100644
--- a/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir
+++ b/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir
@@ -53,34 +53,34 @@ module @gemm attributes {gpu.container_module} {
%m = arith.muli %block_id_x, %c256 : index
%n = arith.muli %block_id_y, %c256 : index
%c_tdesc = xegpu.create_nd_tdesc %C : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #c>
- %c_init_value = xegpu.load_nd %c_tdesc[%m, %n] : !xegpu.tensor_desc<256x256xf32, #c> -> vector<256x256xf32>
+ %c_init_value = xegpu.load_nd %c_tdesc[%m, %n] {layout = #c}: !xegpu.tensor_desc<256x256xf32, #c> -> vector<256x256xf32>
%a_tdesc = xegpu.create_nd_tdesc %A : memref<256x256xf16> -> !xegpu.tensor_desc<256x32xf16, #a>
%b_tdesc = xegpu.create_nd_tdesc %B : memref<256x256xf16> -> !xegpu.tensor_desc<32x256xf16, #b>
// Prefetch A 3 times.
%a_prefetch_tdesc = xegpu.create_nd_tdesc %A : memref<256x256xf16> -> !xegpu.tensor_desc<256x32xf16, #a_prefetch>
- xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c0] : !xegpu.tensor_desc<256x32xf16, #a_prefetch>
- xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c32] : !xegpu.tensor_desc<256x32xf16, #a_prefetch>
- xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c64] : !xegpu.tensor_desc<256x32xf16, #a_prefetch>
+ xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c0] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch>
+ xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c32] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch>
+ xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c64] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch>
// Prefetch B 3 times.
%b_prefetch_tdesc = xegpu.create_nd_tdesc %B : memref<256x256xf16> -> !xegpu.tensor_desc<32x256xf16, #b_prefetch>
- xegpu.prefetch_nd %b_prefetch_tdesc[%c0, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch>
- xegpu.prefetch_nd %b_prefetch_tdesc[%c32, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch>
- xegpu.prefetch_nd %b_prefetch_tdesc[%c64, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch>
+ xegpu.prefetch_nd %b_prefetch_tdesc[%c0, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch>
+ xegpu.prefetch_nd %b_prefetch_tdesc[%c32, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch>
+ xegpu.prefetch_nd %b_prefetch_tdesc[%c64, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch>
%out = scf.for %k = %c0 to %c256 step %c32
iter_args(%c_value = %c_init_value)
-> (vector<256x256xf32>) {
- %a_value = xegpu.load_nd %a_tdesc[%m, %k] : !xegpu.tensor_desc<256x32xf16, #a> -> vector<256x32xf16>
- %b_value = xegpu.load_nd %b_tdesc[%k, %n] : !xegpu.tensor_desc<32x256xf16, #b> -> vector<32x256xf16>
+ %a_value = xegpu.load_nd %a_tdesc[%m, %k] {layout = #a}: !xegpu.tensor_desc<256x32xf16, #a> -> vector<256x32xf16>
+ %b_value = xegpu.load_nd %b_tdesc[%k, %n] {layout = #b}: !xegpu.tensor_desc<32x256xf16, #b> -> vector<32x256xf16>
// Prefetch next tiles.
%prefetch_offset = arith.addi %k, %c96 : index
- xegpu.prefetch_nd %a_prefetch_tdesc[%m, %prefetch_offset] : !xegpu.tensor_desc<256x32xf16, #a_prefetch>
- xegpu.prefetch_nd %b_prefetch_tdesc[%prefetch_offset, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch>
- %c_new_value = xegpu.dpas %a_value, %b_value, %c_value {layout_result_0 = #c}
+ xegpu.prefetch_nd %a_prefetch_tdesc[%m, %prefetch_offset] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch>
+ xegpu.prefetch_nd %b_prefetch_tdesc[%prefetch_offset, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch>
+ %c_new_value = xegpu.dpas %a_value, %b_value, %c_value {layout_a = #a, layout_b = #b, layout_cd = #c}
: vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf32> -> vector<256x256xf32>
scf.yield %c_new_value : vector<256x256xf32>
}
- xegpu.store_nd %out, %c_tdesc[%m, %n] : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #c>
+ xegpu.store_nd %out, %c_tdesc[%m, %n] {layout = #c}: vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #c>
gpu.return
}
}
>From ab0ccf0136ba4b28cba1391cca40d75f0c4026c2 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 13 Dec 2025 02:58:27 +0000
Subject: [PATCH 19/23] add get/setTempDistributeLayoutAttr
---
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 28 ++--
.../XeGPU/Transforms/XeGPUBlocking.cpp | 5 +-
.../Transforms/XeGPUSubgroupDistribute.cpp | 63 ++++----
.../Transforms/XeGPUWgToSgDistribute.cpp | 53 ++++---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 134 ++++++++----------
.../XeGPU/xegpu-wg-to-sg-elemwise.mlir | 30 ++--
6 files changed, 155 insertions(+), 158 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index d851ed9e3ccf1..7886c0269a15d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -76,21 +76,11 @@ std::string getTempLayoutName(const OpResult result);
/// found.
DistributeLayoutAttr getDistributeLayoutAttr(const Value value);
-template <typename AttrTy>
-AttrTy getDistributeLayoutAttrOfType(const Value value) {
- return dyn_cast_if_present<AttrTy>(getDistributeLayoutAttr(value));
-}
-
/// Retrieves the DistributeLayoutAttr associated with a given OpOperand. It
/// will first check the operand_layout_{id} of the owner operation. If not
/// found, it will check the operand itself and its defining op.
DistributeLayoutAttr getDistributeLayoutAttr(const OpOperand &opr);
-template <typename AttrTy>
-AttrTy getDistributeLayoutAttrOfType(const OpOperand &opr) {
- return dyn_cast_if_present<AttrTy>(getDistributeLayoutAttr(opr));
-}
-
/// Removes the LayoutAttr for a given OpOperand or OpResult if it exists.
template <typename T,
typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
@@ -110,11 +100,29 @@ void setDistributeLayoutAttr(const OpResult &Result,
void setDistributeLayoutAttr(const OpOperand &opr,
const DistributeLayoutAttr layout);
+/// Get and set the distribute layout attribute for non-anchor operations
+/// (and for offsets/masks of load/store ops until their temporary attributes
+/// are removed).
+template <typename T,
+ typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
+ std::is_same_v<T, OpResult>>>
+DistributeLayoutAttr getTempDistributeLayoutAttr(const T &operandOrResult);
+
+template <typename T,
+ typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
+ std::is_same_v<T, OpResult>>>
+void setTempDistributeLayoutAttr(const T &operandOrResult,
+ const DistributeLayoutAttr layout);
+
/// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given
/// operation. If the operation contains regions, it is also applied recursively
/// to the contained operations
void retrieveDistributeLayoutAttrsRecursive(Operation *op);
+/// Attach layout attributes to all vector-type operands of operations within
+/// the given operation's region. Emits an error and returns false if any
+/// vector operand lacks a layout attribute; returns true otherwise.
+bool localPropagateLayoutsFromAnchor(Operation *rootOp);
+
/// Extract a set of small vectors from a value with a given shape using
/// vector.extract_stride_slice
SmallVector<Value> extractVectorsWithShapeFromValue(OpBuilder &builder,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index bddc8f8c8de68..9d74433e1d673 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -158,8 +158,7 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
bool skipLeadingUnitDimRemoval =
definingOp &&
(isa<xegpu::CreateNdDescOp, xegpu::LoadNdOp, xegpu::DpasOp,
- xegpu::StoreNdOp, xegpu::PrefetchNdOp, vector::BroadcastOp>(
- definingOp));
+ xegpu::StoreNdOp, xegpu::PrefetchNdOp>(definingOp));
if (!skipLeadingUnitDimRemoval) {
auto it = llvm::find_if(instData, [](auto val) { return val != 1; });
instData.erase(instData.begin(), it);
@@ -284,6 +283,8 @@ void XeGPUBlockingPass::runOnOperation() {
// Preserve the LayoutAttr for each operand to the owner's DictionaryAttr.
// This ensures that the LayoutAttr remains accessible even if the defining
// operation is replaced.
+  // TODO-LayoutRefactor: unify the local propagation for layout
+  // preprocessing; replace this call with localPropagateLayoutsFromAnchor.
xegpu::retrieveDistributeLayoutAttrsRecursive(op);
auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 4e663e1dd6694..bf8764e1f4541 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1273,8 +1273,9 @@ static Value lowerToVectorReductions(TypedValue<VectorType> src,
rewriter, loc, acc.getType(),
DenseElementsAttr::get(acc.getType(), zeroAttr));
// Reduction result should have the same layout as the accumulator.
- xegpu::setDistributeLayoutAttr(cast<OpResult>(reductionResult),
- xegpu::getDistributeLayoutAttr(acc));
+ xegpu::setDistributeLayoutAttr(
+ cast<OpResult>(reductionResult),
+ xegpu::getTempDistributeLayoutAttr(dyn_cast<OpResult>(acc)));
// For each slice of the source, extract the slice vector, do a reduction
// and, insert the reduced value back to the result vector.
for (int i = 0; i < nSlices; ++i) {
@@ -1302,8 +1303,10 @@ static Value lowerToVectorReductions(TypedValue<VectorType> src,
// accumulator. Shape cast source has the same layout as the original
// reduction source.
// TODO: other ops generated here may also need layout attributes.
- auto srcLayout = xegpu::getDistributeLayoutAttr(src);
- auto accLayout = xegpu::getDistributeLayoutAttr(acc);
+ auto srcLayout =
+ xegpu::getTempDistributeLayoutAttr(dyn_cast<OpResult>(src));
+ auto accLayout =
+ xegpu::getTempDistributeLayoutAttr(dyn_cast<OpResult>(acc));
xegpu::setDistributeLayoutAttr(slice->getOpOperand(0), srcLayout);
xegpu::setDistributeLayoutAttr(slice->getOpResult(0), accLayout);
@@ -1399,7 +1402,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
cast<VectorType>(warpOp.getResult(operandIdx).getType());
VectorType resultType = cast<VectorType>(reductionOp.getType());
xegpu::DistributeLayoutAttr sourceLayout =
- xegpu::getDistributeLayoutAttr(reductionOp.getSource());
+ xegpu::getTempDistributeLayoutAttr(reductionOp->getOpOperand(0));
FailureOr<VectorType> sourceDistTypeOrFailure =
getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
@@ -1561,9 +1564,10 @@ struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern {
dyn_cast<VectorType>(broadcastOp.getResult().getType());
xegpu::DistributeLayoutAttr sourceLayout =
- xegpu::getDistributeLayoutAttr(broadcastOp->getOpOperand(0));
+ xegpu::getTempDistributeLayoutAttr(broadcastOp->getOpOperand(0));
xegpu::DistributeLayoutAttr resultLayout =
- xegpu::getDistributeLayoutAttr(broadcastOp.getResult());
+ xegpu::getTempDistributeLayoutAttr(
+ dyn_cast<OpResult>(broadcastOp.getResult()));
FailureOr<VectorType> sourceDistType;
Type sourceElemOrDistType;
@@ -1652,9 +1656,10 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
auto resultDistTy =
cast<VectorType>(warpOp.getResult(operandNumber).getType());
xegpu::DistributeLayoutAttr sourceLayout =
- xegpu::getDistributeLayoutAttr(shapeCastOp->getOpOperand(0));
+ xegpu::getTempDistributeLayoutAttr(shapeCastOp->getOpOperand(0));
xegpu::DistributeLayoutAttr resultLayout =
- xegpu::getDistributeLayoutAttr(shapeCastOp.getResult());
+ xegpu::getTempDistributeLayoutAttr(
+ dyn_cast<OpResult>(shapeCastOp.getResult()));
if (!sourceLayout || !resultLayout)
return rewriter.notifyMatchFailure(
warpOp,
@@ -1737,7 +1742,7 @@ struct VectorExtractStridedSliceDistribution
int sourceDistrDimSize =
extractOp.getSourceVectorType().getShape()[distributedDim];
auto sourceLayout =
- xegpu::getDistributeLayoutAttr(extractOp->getOpOperand(0));
+ xegpu::getTempDistributeLayoutAttr(extractOp->getOpOperand(0));
if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
return rewriter.notifyMatchFailure(
warpOp, "the source of extract_strided_slice op lacks distribution "
@@ -1848,9 +1853,9 @@ struct VectorInsertStridedSliceDistribution
int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim);
// Obtain the source and dest layouts.
auto destLayout =
- xegpu::getDistributeLayoutAttr(insertOp->getOpOperand(1));
+ xegpu::getTempDistributeLayoutAttr(insertOp->getOpOperand(1));
auto sourceLayout =
- xegpu::getDistributeLayoutAttr(insertOp->getOpOperand(0));
+ xegpu::getTempDistributeLayoutAttr(insertOp->getOpOperand(0));
if (!destLayout || !sourceLayout ||
destLayout.getEffectiveLaneLayoutAsInt().empty() ||
sourceLayout.getEffectiveLaneLayoutAsInt().empty())
@@ -1965,7 +1970,7 @@ struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern {
unsigned operandIdx = operand->getOperandNumber();
VectorType distributedSourceType =
getDistVecTypeBasedOnLaneLayout(
- xegpu::getDistributeLayoutAttr(bitcastOp.getSource()),
+ xegpu::getTempDistributeLayoutAttr(bitcastOp->getOpOperand(0)),
bitcastOp.getSourceVectorType())
.value_or(VectorType());
if (!distributedSourceType)
@@ -2008,9 +2013,9 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern {
auto transposeOp = operand->get().getDefiningOp<vector::TransposeOp>();
unsigned operandIdx = operand->getOperandNumber();
xegpu::DistributeLayoutAttr sourceLayout =
- xegpu::getDistributeLayoutAttr(transposeOp.getVector());
+ xegpu::getTempDistributeLayoutAttr(transposeOp->getOpOperand(0));
xegpu::DistributeLayoutAttr resultLayout =
- xegpu::getDistributeLayoutAttr(transposeOp.getResult());
+ xegpu::getTempDistributeLayoutAttr(transposeOp->getOpResult(0));
if (!sourceLayout || !resultLayout)
return rewriter.notifyMatchFailure(
transposeOp,
@@ -2093,30 +2098,10 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
// 1) It is assumed that there are no layout conflicts.
// 2) Any existing layout attributes attached to the operands are ignored.
Operation *op = getOperation();
- op->walk([&](Operation *op) {
- for (OpOperand &operand : op->getOpOperands()) {
- // Layouts are needed for vector type only.
- if (!isa<VectorType>(operand.get().getType()))
- continue;
- // if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(op))
- // xegpu::DpasOp,
- // xegpu::LoadGatherOp, xegpu::StoreScatterOp,
- // xegpu::LoadNdOp, xegpu::StoreNdOp>(op))
- // continue;
-
- auto layout = xegpu::getDistributeLayoutAttr(operand.get());
- if (!layout) {
- op->emitError("Could not find layout attribute for operand ")
- << operand.getOperandNumber() << " of operation " << op->getName();
- signalPassFailure();
- return;
- }
- xegpu::setDistributeLayoutAttr(operand, layout);
- }
- });
-
- // dump out the op here
- // getOperation()->dump();
+ if (!xegpu::localPropagateLayoutsFromAnchor(op)) {
+ signalPassFailure();
+ return;
+ }
// Step 2: Move all operations of a GPU function inside
// gpu.warp_execute_on_lane_0 operation.
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index af90674d4ce0d..f9f5a1f6b285e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -492,8 +492,8 @@ struct WgToSgVectorBroadcastOp
VectorType resultType = op.getResult().getType();
ArrayRef<int64_t> wgShape = resultType.getShape();
- xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::DistributeLayoutAttr layout = xegpu::getTempDistributeLayoutAttr(
+ llvm::cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -535,8 +535,8 @@ struct WgToSgElementwiseOp : public ConversionPattern {
ArrayRef<int64_t> wgShape = resultType.getShape();
- xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op->getResult(0));
+ xegpu::DistributeLayoutAttr layout = xegpu::getTempDistributeLayoutAttr(
+ llvm::cast<OpResult>(op->getResult(0)));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -742,7 +742,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> {
return failure();
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::getTempDistributeLayoutAttr(dyn_cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -903,8 +903,8 @@ struct WgToSgLoadGatherOpWithOffset
return failure();
ArrayRef<int64_t> wgShape = resultType.getShape();
- xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
+
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -957,8 +957,8 @@ struct WgToSgStoreScatterOpWithOffset
if (!valueType)
return failure();
- xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getOperand(0));
+ xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
+
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -1052,7 +1052,7 @@ struct WgToSgVectorStepOp : public OpConversionPattern<vector::StepOp> {
matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::getTempDistributeLayoutAttr(dyn_cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -1109,7 +1109,7 @@ struct WgToSgVectorShapeCastOp
ArrayRef<int64_t> wgShape = resultType.getShape();
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::getTempDistributeLayoutAttr(dyn_cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -1142,6 +1142,7 @@ struct WgToSgVectorShapeCastOp
// must be a slice of higher rank layout.
int64_t sourceRank = srcType.getRank();
int64_t resultRank = sgShape.size();
+ // TODO-LayoutRefactor: handle the case using getTempDistributeLayoutAttr
xegpu::DistributeLayoutAttr sourceLayout =
xegpu::getDistributeLayoutAttr(op.getSource());
if (sourceRank < resultRank && !sourceLayout.isSliceOf(layout))
@@ -1182,7 +1183,7 @@ struct WgToSgMultiDimReductionOp
auto srcShape = srcType.getShape();
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::getTempDistributeLayoutAttr(dyn_cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -1237,10 +1238,10 @@ struct WgToSgVectorTransposeOp
ArrayRef<int64_t> wgShape = resultType.getShape();
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::getTempDistributeLayoutAttr(dyn_cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
-
+ // TODO-LayoutRefactor: handle the case using getTempDistributeLayoutAttr
xegpu::DistributeLayoutAttr sourceLayout =
xegpu::getDistributeLayoutAttr(op.getVector());
if (!sourceLayout || !sourceLayout.isForWorkgroup())
@@ -1299,7 +1300,7 @@ struct WgToSgVectorMaskOp : public OpConversionPattern<MaskOpType> {
typename OpConversionPattern<MaskOpType>::OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::getTempDistributeLayoutAttr(dyn_cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -1390,6 +1391,14 @@ struct XeGPUWgToSgDistributePass
} // namespace
void XeGPUWgToSgDistributePass::runOnOperation() {
+
+ // TODO-LayoutRefactor: unify the local propagation for layout preprocessing
+ // Operation *op = getOperation();
+ // if (!xegpu::localPropagateLayoutsFromAnchor(op)) {
+ // signalPassFailure();
+ // return;
+ // }
+
// Track existing UnrealizedConversionCastOps
SmallVector<Operation *> existingCastOps;
getOperation()->walk([&](UnrealizedConversionCastOp castOp) {
@@ -1480,7 +1489,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
});
target.addDynamicallyLegalOp<xegpu::DpasOp>([=](xegpu::DpasOp op) -> bool {
- auto layout = xegpu::getDistributeLayoutAttr(op.getResult());
+ auto layout = op.getLayoutCdAttr();
return isLegal(layout);
});
@@ -1500,7 +1509,8 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
if (!vecType)
return true;
- auto layout = xegpu::getDistributeLayoutAttr(op.getResult());
+ auto layout = xegpu::getTempDistributeLayoutAttr(
+ dyn_cast<OpResult>(op.getResult()));
return isLegal(layout);
});
@@ -1510,19 +1520,20 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
vector::ConstantMaskOp, vector::CreateMaskOp>(
[=](Operation *op) -> bool {
// Check for either a SliceAttr or LayoutAttr on the result.
- auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
+ auto layout = xegpu::getTempDistributeLayoutAttr(
+ dyn_cast<OpResult>(op->getResult(0)));
return isLegal(layout);
});
target.addDynamicallyLegalOp<xegpu::LoadGatherOp>(
[=](xegpu::LoadGatherOp op) -> bool {
- auto layout = xegpu::getDistributeLayoutAttr(op.getResult());
+ auto layout = op.getLayoutAttr();
return isLegal(layout);
});
target.addDynamicallyLegalOp<xegpu::StoreScatterOp>(
[=](xegpu::StoreScatterOp op) -> bool {
- auto layout = xegpu::getDistributeLayoutAttr(op.getOperand(0));
+ auto layout = op.getLayoutAttr();
return isLegal(layout);
});
@@ -1552,7 +1563,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
}
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op->getResult(0));
+ xegpu::getTempDistributeLayoutAttr(op->getResult(0));
return isLegal(layout);
});
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index f4da557a8fa32..49d6b45c2791c 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -114,27 +114,21 @@ std::string xegpu::getTempLayoutName(const OpResult result) {
xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
if (!value) {
- llvm::errs() << "getDistributeLayoutAttr: value is null\n";
return nullptr;
}
- llvm::errs() << "Getting layout for value: " << value << "\n";
-
if (auto tdescTy =
dyn_cast_if_present<xegpu::TensorDescType>(value.getType())) {
auto layoutAttr = tdescTy.getLayoutAttr();
- llvm::errs() << " Found TensorDescType with layout\n";
return layoutAttr;
}
if (auto result = dyn_cast<OpResult>(value)) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");
- llvm::errs() << " Value is OpResult from: " << *defOp << "\n";
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
auto layout = anchorOp.getAnchorLayout();
- llvm::errs() << " Returning anchor layout from defining op\n";
return layout;
}
@@ -142,55 +136,39 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
if (defOp->hasAttr(layoutName)) {
auto layout =
defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
- llvm::errs() << " Returning temp layout from attribute: " << layoutName
- << "\n";
return layout;
}
- llvm::errs() << " No layout found for OpResult\n";
}
if (auto arg = dyn_cast<BlockArgument>(value)) {
auto *parentOp = arg.getOwner()->getParentOp();
- llvm::errs() << " Value is BlockArgument, parent op: " << *parentOp
- << "\n";
if (auto loop = dyn_cast<LoopLikeOpInterface>(parentOp)) {
OpOperand *tiedInit = loop.getTiedLoopInit(arg);
if (tiedInit) {
- llvm::errs() << " Following tied loop init\n";
auto layout = getDistributeLayoutAttr(tiedInit->get());
return layout;
}
}
- llvm::errs() << " No tied loop init found\n";
}
- llvm::errs() << " Returning nullptr\n";
return nullptr;
}
-
xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber();
- llvm::errs() << "Getting layout for operand " << idx << " of op: " << *op
- << "\n";
-
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
if (auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
if (idx == 0) {
- llvm::errs() << " Returning DpasOp layout A\n";
return dpasOp.getLayoutAAttr();
} else if (idx == 1) {
- llvm::errs() << " Returning DpasOp layout B\n";
return dpasOp.getLayoutBAttr();
} else if (idx == 2) {
- llvm::errs() << " Returning DpasOp layout CD\n";
return dpasOp.getLayoutCdAttr();
}
}
if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
- llvm::errs() << " Returning ConvertLayoutOp input layout\n";
return convertOp.getInputLayoutAttr();
}
auto layout = anchorOp.getAnchorLayout();
@@ -200,12 +178,10 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
op)) {
if (idx < 2) {
- llvm::errs() << " Returning anchor layout for store op\n";
return layout;
}
} else {
if (idx == 0) {
- llvm::errs() << " Returning anchor layout\n";
return layout;
}
}
@@ -214,13 +190,10 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
std::string layoutName = xegpu::getTempLayoutName(opr);
if (op->hasAttr(layoutName)) {
auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
- llvm::errs() << " Returning temp layout from attribute: " << layoutName
- << "\n";
return layout;
}
auto layout = getDistributeLayoutAttr(opr.get());
- llvm::errs() << " Returning layout from operand value\n";
return layout;
}
@@ -264,14 +237,10 @@ void xegpu::setDistributeLayoutAttr(
const mlir::xegpu::DistributeLayoutAttr layout) {
Operation *owner = result.getOwner();
- llvm::errs() << "Setting layout for result of op: " << *owner << "\n";
-
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
if (anchorOp.getAnchorLayout() == layout)
return;
- llvm::errs() << " Setting anchor layout\n";
anchorOp.setAnchorLayout(layout);
- llvm::errs() << " After setting anchor layout, op: " << *owner << "\n";
return;
}
@@ -280,37 +249,29 @@ void xegpu::setDistributeLayoutAttr(
return;
}
if (layout) {
- llvm::errs() << " Setting temp layout attribute: " << name << "\n";
owner->setAttr(name, layout);
- llvm::errs() << " After setting temp layout, op: " << *owner << "\n";
}
}
+
void xegpu::setDistributeLayoutAttr(const OpOperand &operand,
const DistributeLayoutAttr layout) {
Operation *owner = operand.getOwner();
unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
- llvm::errs() << "Setting layout for operand 1" << idx << " of op: " << *owner
- << " with layout: " << layout << "\n";
-
if (!layout) {
return;
}
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
if (idx == 0) {
- llvm::errs() << " Setting DpasOp layout A\n";
return dpasOp.setLayoutAAttr(layout);
} else if (idx == 1) {
- llvm::errs() << " Setting DpasOp layout B\n";
return dpasOp.setLayoutBAttr(layout);
} else if (idx == 2) {
- llvm::errs() << " Setting DpasOp layout CD\n";
return dpasOp.setLayoutCdAttr(layout);
}
}
if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(owner)) {
- llvm::errs() << " Setting ConvertLayoutOp input layout\n";
return convertOp.setInputLayoutAttr(layout);
}
@@ -320,15 +281,11 @@ void xegpu::setDistributeLayoutAttr(const OpOperand &operand,
if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
owner)) {
if (idx < 2) {
- llvm::errs() << " Setting anchor layout for store op\n";
anchorOp.setAnchorLayout(layout);
- llvm::errs() << " After setting anchor layout, op: " << *owner << "\n";
}
} else {
if (idx == 0) {
- llvm::errs() << " Setting anchor layout\n";
anchorOp.setAnchorLayout(layout);
- llvm::errs() << " After setting anchor layout, op: " << *owner << "\n";
}
}
}
@@ -338,30 +295,86 @@ void xegpu::setDistributeLayoutAttr(const OpOperand &operand,
return;
}
if (layout) {
- llvm::errs() << " Setting temp layout attribute: " << name << "\n";
owner->setAttr(name, layout);
- llvm::errs() << " After setting temp layout, op: " << *owner << "\n";
}
}
+template <typename T, typename>
+xegpu::DistributeLayoutAttr
+xegpu::getTempDistributeLayoutAttr(const T &operandOrResult) {
+ Operation *op = operandOrResult.getOwner();
+
+ std::string layoutName = xegpu::getTempLayoutName(operandOrResult);
+ if (op->hasAttr(layoutName)) {
+ auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+ return layout;
+ }
+
+ return nullptr;
+}
+
+template xegpu::DistributeLayoutAttr
+xegpu::getTempDistributeLayoutAttr<mlir::OpResult>(const OpResult &result);
+template xegpu::DistributeLayoutAttr
+xegpu::getTempDistributeLayoutAttr<mlir::OpOperand>(const OpOperand &operand);
+
+template <typename T, typename>
+void xegpu::setTempDistributeLayoutAttr(
+ const T &operandOrResult, const xegpu::DistributeLayoutAttr layout) {
+ Operation *owner = operandOrResult.getOwner();
+ std::string name = xegpu::getTempLayoutName(operandOrResult);
+ if (owner->hasAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
+ return;
+ }
+ if (layout) {
+ owner->setAttr(name, layout);
+ }
+}
+
+template void xegpu::setTempDistributeLayoutAttr<mlir::OpResult>(
+ const mlir::OpResult &result,
+ const mlir::xegpu::DistributeLayoutAttr layout);
+
+template void xegpu::setTempDistributeLayoutAttr<mlir::OpOperand>(
+ const mlir::OpOperand &operand,
+ const mlir::xegpu::DistributeLayoutAttr layout);
+
void xegpu::retrieveDistributeLayoutAttrsRecursive(Operation *op) {
op->walk([&](Operation *nestOp) {
for (OpOperand &opr : nestOp->getOpOperands()) {
auto layout = getDistributeLayoutAttr(opr.get());
- llvm::errs() << "Setting layout for operand " << opr.getOperandNumber()
- << " of op: " << *nestOp << "\n";
setDistributeLayoutAttr(opr, layout);
}
for (OpResult result : nestOp->getOpResults()) {
auto layout = getDistributeLayoutAttr(result);
- llvm::errs() << "Setting layout for result " << result.getResultNumber()
- << " of op: " << *nestOp << "\n";
setDistributeLayoutAttr(result, layout);
}
});
}
+/// Attach layout attributes to all vector-type operands of operations within
+/// the given operation's region. Emits an error and returns false if any
+/// vector operand lacks a layout attribute; returns true otherwise.
+bool xegpu::localPropagateLayoutsFromAnchor(Operation *rootOp) {
+ auto result = rootOp->walk([&](Operation *op) {
+ for (OpOperand &operand : op->getOpOperands()) {
+ // Layouts are needed for vector type only.
+ if (!isa<VectorType>(operand.get().getType()))
+ continue;
+ auto layout = xegpu::getDistributeLayoutAttr(operand.get());
+ if (!layout) {
+ op->emitError("Could not find layout attribute for operand ")
+ << operand.getOperandNumber() << " of operation " << op->getName();
+ return WalkResult::interrupt();
+ }
+ xegpu::setDistributeLayoutAttr(operand, layout);
+ }
+ return WalkResult::advance();
+ });
+ return !result.wasInterrupted();
+}
+
template <typename T, typename>
void xegpu::removeLayoutAttr(const T &operandOrResult) {
Operation *owner = operandOrResult.getOwner();
@@ -468,11 +481,6 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(
.getResult(0);
};
- llvm::errs()
- << "\n=== Stage 1: Converting VectorType to RankedTensorType ===\n";
- llvm::errs() << "Before conversion:\n";
- op->dump();
-
{ // convert VectorType to RankedTensorType for SCF Structural ops
TypeConverter converter;
converter.addConversion([](Type type) -> Type { return type; });
@@ -491,12 +499,6 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(
(void)mlir::applyPartialConversion(op, target, std::move(patterns));
}
- llvm::errs() << "\nAfter Stage 1:\n";
- op->dump();
-
- llvm::errs() << "\n=== Stage 2: Propagating layout attributes to "
- "RankedTensorType ===\n";
-
{ // propagate the layout attribute to RankedTensorType by checking
// BuiltInUnrealizedCastOps
// for VectorType to RankedTensorType cast.
@@ -551,12 +553,6 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(
});
}
- llvm::errs() << "\nAfter Stage 2:\n";
- op->dump();
-
- llvm::errs()
- << "\n=== Stage 3: Converting RankedTensorType back to VectorType ===\n";
-
{ // perform the conversion from RankedTensorType to VectorType based on the
// DistributeLayoutAttr
@@ -620,10 +616,6 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(
target);
(void)mlir::applyPartialConversion(op, target, std::move(patterns));
}
-
- llvm::errs() << "\nAfter Stage 3 (final):\n";
- op->dump();
- llvm::errs() << "\n=== Transformation complete ===\n\n";
}
std::optional<std::string> xegpu::getChipStr(Operation *op) {
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
index 9580769d37313..6e9711442b92d 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
@@ -6,7 +6,7 @@ gpu.module @test_elementwise_ops {
gpu.func @unary_ops_sg_layout_only(%a: memref<24x32xf32>) {
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>>
-> vector<24x32xf32>
// CHECK: math.exp {{.*}} : vector<12x8xf32>
@@ -24,7 +24,7 @@ gpu.module @test_elementwise_ops {
gpu.func @unary_ops(%a: memref<24x32xf32>) {
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
// CHECK: math.exp {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>} : vector<12x8xf32>
@@ -44,10 +44,10 @@ gpu.module @test_elementwise_ops {
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
// CHECK: arith.addf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -71,13 +71,13 @@ gpu.module @test_elementwise_ops {
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
%tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi1>
-> !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_c = xegpu.load_nd %tdesc_c
+ %load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi1>
// CHECK: arith.select {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -99,10 +99,10 @@ gpu.module @test_elementwise_ops {
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xi32>
-> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi32>
// CHECK: arith.truncf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -128,16 +128,16 @@ gpu.module @test_elementwise_ops {
-> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
%tdesc_d = xegpu.create_nd_tdesc %d[0, 0] : memref<24x32xi32>
-> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_c = xegpu.load_nd %tdesc_c
+ %load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi32>
- %load_d = xegpu.load_nd %tdesc_d
+ %load_d = xegpu.load_nd %tdesc_d {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi32>
// CHECK: arith.cmpf ult, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -160,10 +160,10 @@ gpu.module @test_elementwise_ops {
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-> vector<24x32xf32>
// CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32>
>From 2c80b0ee5c79c4d19220336bec0fae5344d1e13c Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 13 Dec 2025 04:24:45 +0000
Subject: [PATCH 20/23] remove debug print
---
.../XeGPU/Transforms/XeGPUBlocking.cpp | 7 +--
.../Transforms/XeGPUSubgroupDistribute.cpp | 51 +------------------
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 13 ++---
3 files changed, 12 insertions(+), 59 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 9d74433e1d673..0c0a922c56305 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -280,11 +280,12 @@ void XeGPUBlockingPass::runOnOperation() {
MLIRContext *ctx = &getContext();
Operation *op = getOperation();
- // Preserve the LayoutAttr for each operand to the owner's DictionaryAttr.
- // This ensures that the LayoutAttr remains accessible even if the defining
- // operation is replaced.
// TODO-LayoutRefactor: unify the local propagation for layout preprocessing
// replace the function with localPropagateLayoutsFromAnchor
+ // if (!xegpu::localPropagateLayoutsFromAnchor(op)) {
+ // signalPassFailure();
+ // return;
+ // }
xegpu::retrieveDistributeLayoutAttrsRecursive(op);
auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index bf8764e1f4541..94da382e3b04c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -630,15 +630,12 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
- if (!operand) {
- DBGS() << "No dpas op result found in warp op\n";
+ if (!operand)
return rewriter.notifyMatchFailure(warpOp,
"warp result is not a xegpu::Dpas op");
- }
auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
unsigned operandIdx = operand->getOperandNumber();
- DBGS() << "Found dpas op: " << *dpasOp << "\n";
xegpu::LayoutAttr layoutA =
dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutAAttr());
@@ -647,9 +644,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
xegpu::LayoutAttr layoutOut =
dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutCdAttr());
- DBGS() << "Layout A: " << layoutA << ", Layout B: " << layoutB
- << ", Layout Out: " << layoutOut << "\n";
-
if (!layoutA || !layoutB || !layoutOut)
return rewriter.notifyMatchFailure(
dpasOp,
@@ -662,23 +656,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
- DBGS() << "Original LHS type: " << dpasOp.getLhsType() << ", distributed: "
- << (succeeded(distLhsTypeByWarpOpOrFailure)
- ? distLhsTypeByWarpOpOrFailure.value()
- : Type())
- << "\n";
- DBGS() << "Original RHS type: " << dpasOp.getRhsType() << ", distributed: "
- << (succeeded(distRhsTypeByWarpOpOrFailure)
- ? distRhsTypeByWarpOpOrFailure.value()
- : Type())
- << "\n";
- DBGS() << "Original Result type: " << dpasOp.getResultType()
- << ", distributed: "
- << (succeeded(distResultTypeByWarpOpOrFailure)
- ? distResultTypeByWarpOpOrFailure.value()
- : Type())
- << "\n";
-
if (failed(distLhsTypeByWarpOpOrFailure) ||
failed(distRhsTypeByWarpOpOrFailure) ||
failed(distResultTypeByWarpOpOrFailure))
@@ -693,14 +670,11 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
distRhsTypeByWarpOpOrFailure.value()};
// Dpas acc operand is optional.
if (dpasOp.getAcc()) {
- DBGS() << "Dpas has accumulator operand\n";
newYieldValues.push_back(dpasOp.getAcc());
newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
}
// Create a new warp op without the dpas.
SmallVector<size_t> newRetIndices;
- DBGS() << "Creating new warp op with " << newYieldValues.size()
- << " yielded values\n";
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
@@ -711,22 +685,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
FailureOr<VectorType> expectedDistResultTyOrFailure =
xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
- DBGS() << "Expected dist LHS type: "
- << (succeeded(expectedDistLhsTyOrFailure)
- ? expectedDistLhsTyOrFailure.value()
- : Type())
- << "\n";
- DBGS() << "Expected dist RHS type: "
- << (succeeded(expectedDistRhsTyOrFailure)
- ? expectedDistRhsTyOrFailure.value()
- : Type())
- << "\n";
- DBGS() << "Expected dist Result type: "
- << (succeeded(expectedDistResultTyOrFailure)
- ? expectedDistResultTyOrFailure.value()
- : Type())
- << "\n";
-
if (failed(expectedDistLhsTyOrFailure) ||
failed(expectedDistRhsTyOrFailure) ||
failed(expectedDistResultTyOrFailure))
@@ -746,9 +704,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
newDpasOperandExpectedTypes.push_back(distributedResultTy);
for (unsigned i = 0; i < newRetIndices.size(); i++) {
- DBGS() << "Resolving operand " << i << " with type "
- << newWarpOp.getResult(newRetIndices[i]).getType() << " to "
- << newDpasOperandExpectedTypes[i] << "\n";
newDpasOperands.push_back(
resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
newDpasOperandExpectedTypes[i], rewriter));
@@ -756,17 +711,13 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
distributedResultTy, newDpasOperands,
dpasOp->getAttrs());
- DBGS() << "Created new dpas op: " << *newDpasOp << "\n";
xegpu::removeLayoutAttrs(newDpasOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
// Resolve the output type.
- DBGS() << "Resolving output from " << newDpasOp.getResult().getType()
- << " to " << distResultTypeByWarpOpOrFailure.value() << "\n";
Value typeResolved =
resolveDistributedTy(newDpasOp.getResult(),
distResultTypeByWarpOpOrFailure.value(), rewriter);
rewriter.replaceAllUsesWith(distributedVal, typeResolved);
- DBGS() << "Successfully distributed dpas op\n";
return success();
}
};
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 49d6b45c2791c..6d6ca47dc10d5 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -113,15 +113,12 @@ std::string xegpu::getTempLayoutName(const OpResult result) {
}
xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
- if (!value) {
+ if (!value)
return nullptr;
- }
if (auto tdescTy =
- dyn_cast_if_present<xegpu::TensorDescType>(value.getType())) {
- auto layoutAttr = tdescTy.getLayoutAttr();
- return layoutAttr;
- }
+ dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
+ return tdescTy.getLayoutAttr();
if (auto result = dyn_cast<OpResult>(value)) {
Operation *defOp = result.getDefiningOp();
@@ -232,6 +229,8 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
return candidate;
}
+// TODO-LayoutRefactor: Remove this function after replacing use
+// with setTempDistributeLayoutAttr or setAnchorLayout
void xegpu::setDistributeLayoutAttr(
const mlir::OpResult &result,
const mlir::xegpu::DistributeLayoutAttr layout) {
@@ -253,6 +252,8 @@ void xegpu::setDistributeLayoutAttr(
}
}
+// TODO-LayoutRefactor: Remove this function after replacing use
+// with setTempDistributeLayoutAttr or setAnchorLayout
void xegpu::setDistributeLayoutAttr(const OpOperand &operand,
const DistributeLayoutAttr layout) {
Operation *owner = operand.getOwner();
>From 2a57b036e710a0f0fc1818a0d874c5fd55c43f85 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 13 Dec 2025 18:08:13 +0000
Subject: [PATCH 21/23] clean up setDistributeLayoutAttr in sg distribution
---
.../Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 94da382e3b04c..6df7205438b03 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1224,7 +1224,7 @@ static Value lowerToVectorReductions(TypedValue<VectorType> src,
rewriter, loc, acc.getType(),
DenseElementsAttr::get(acc.getType(), zeroAttr));
// Reduction result should have the same layout as the accumulator.
- xegpu::setDistributeLayoutAttr(
+ xegpu::setTempDistributeLayoutAttr(
cast<OpResult>(reductionResult),
xegpu::getTempDistributeLayoutAttr(dyn_cast<OpResult>(acc)));
// For each slice of the source, extract the slice vector, do a reduction
@@ -1259,8 +1259,8 @@ static Value lowerToVectorReductions(TypedValue<VectorType> src,
auto accLayout =
xegpu::getTempDistributeLayoutAttr(dyn_cast<OpResult>(acc));
- xegpu::setDistributeLayoutAttr(slice->getOpOperand(0), srcLayout);
- xegpu::setDistributeLayoutAttr(slice->getOpResult(0), accLayout);
+ xegpu::setTempDistributeLayoutAttr(slice->getOpOperand(0), srcLayout);
+ xegpu::setTempDistributeLayoutAttr(slice->getOpResult(0), accLayout);
// Extract and reduction results in scalars, so no result layout is needed.
Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
Value reduction = vector::ReductionOp::create(
>From 1dbd1e87ed4dc5f2ed01b1cc220ea77484cffd3e Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 13 Dec 2025 19:12:00 +0000
Subject: [PATCH 22/23] clean up setDistributeLayoutAttr in load optimization
pass
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
index ab41fe4298d99..1b59f4da2ecfb 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
@@ -465,7 +465,8 @@ struct XeGPUOptimizeBlockLoadsPass final
// converted.
target.addDynamicallyLegalOp<vector::ExtractOp>(
[&](vector::ExtractOp extractOp) {
- auto layout = xegpu::getDistributeLayoutAttr(extractOp.getResult());
+ auto layout = xegpu::getTempDistributeLayoutAttr(
+ dyn_cast<OpResult>(extractOp.getResult()));
if (!layout)
return true;
auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
>From 52d560e2a109317199124d260db5744c8f147452 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 13 Dec 2025 20:03:18 +0000
Subject: [PATCH 23/23] clean up setDistributeLayoutAttr in sg distribution
pass
---
.../Transforms/XeGPUOptimizeBlockLoads.cpp | 12 +++---
.../Transforms/XeGPUWgToSgDistribute.cpp | 41 ++++++++++---------
2 files changed, 27 insertions(+), 26 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
index 1b59f4da2ecfb..bcafd2b53799f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
@@ -217,14 +217,14 @@ static Value generateLoads(ConversionPatternRewriter &rewriter,
origLoadOp.getL3HintAttr(), origLoadOp.getLayoutAttr());
// Set the layout for the loadOp.
auto layoutAttr = newTensorDesc.getType().getLayoutAttr();
- xegpu::setDistributeLayoutAttr(loadOp->getOpResult(0), layoutAttr);
+ loadOp.setAnchorLayout(layoutAttr);
// Insert the loaded block into the right position in data.
auto insertOp = vector::InsertStridedSliceOp::create(
rewriter, loc, loadOp.getResult(), data,
ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1},
ArrayRef<int64_t>{1, 1});
// InsertOp must have the same layout as newTensorDesc.
- xegpu::setDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr);
+ xegpu::setTempDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr);
data = insertOp.getResult();
}
}
@@ -366,8 +366,8 @@ class XeGPULoadNdDescOpPattern final
auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
bitcastType, slice);
// BitCastOp must have the same layout as the original loadNdOp.
- xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0),
- origTensorDescType.getLayoutAttr());
+ xegpu::setTempDistributeLayoutAttr(bitCastOp->getOpResult(0),
+ origTensorDescType.getLayoutAttr());
arraySlices.push_back(bitCastOp.getResult());
}
rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices});
@@ -384,8 +384,8 @@ class XeGPULoadNdDescOpPattern final
auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
loadNdOp.getType(), data);
// BitCastOp must have the same layout as the original loadNdOp.
- xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0),
- origTensorDescType.getLayoutAttr());
+ xegpu::setTempDistributeLayoutAttr(bitCastOp->getOpResult(0),
+ origTensorDescType.getLayoutAttr());
rewriter.replaceOp(loadNdOp, bitCastOp);
return success();
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index f9f5a1f6b285e..eb2f389fd13b8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -508,8 +508,8 @@ struct WgToSgVectorBroadcastOp
for (auto operand : adaptor.getOperands().front()) {
auto newBroadcast = vector::BroadcastOp::create(rewriter, op.getLoc(),
newResultType, operand);
- xegpu::setDistributeLayoutAttr(newBroadcast->getResult(0),
- layout.dropSgLayoutAndData());
+ xegpu::setTempDistributeLayoutAttr(newBroadcast->getResult(0),
+ layout.dropSgLayoutAndData());
newBroadcastOps.push_back(newBroadcast.getResult());
}
@@ -756,8 +756,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> {
auto eltType = vecType.getElementType();
auto setLayout = [&](Value val) {
- xegpu::setDistributeLayoutAttr(llvm::dyn_cast<OpResult>(val),
- layout.dropSgLayoutAndData());
+ xegpu::setTempDistributeLayoutAttr(llvm::dyn_cast<OpResult>(val),
+ layout.dropSgLayoutAndData());
};
if (vecAttr.isSplat()) {
@@ -932,7 +932,7 @@ struct WgToSgLoadGatherOpWithOffset
rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr,
op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(),
newLayout);
- xegpu::setDistributeLayoutAttr(newLoadOp->getResult(0), newLayout);
+ newLoadOp.setAnchorLayout(newLayout);
newLoadOps.push_back(newLoadOp);
}
rewriter.replaceOpWithMultiple(op, {newLoadOps});
@@ -987,7 +987,8 @@ struct WgToSgStoreScatterOpWithOffset
// Skip for operand one (memref)
if (operand.getOperandNumber() == 1)
continue;
- xegpu::setDistributeLayoutAttr(operand, layout.dropSgLayoutAndData());
+ xegpu::setTempDistributeLayoutAttr(operand,
+ layout.dropSgLayoutAndData());
}
}
rewriter.eraseOp(op);
@@ -1080,12 +1081,12 @@ struct WgToSgVectorStepOp : public OpConversionPattern<vector::StepOp> {
vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]);
auto finalSteps =
arith::AddIOp::create(rewriter, loc, steps, bcastOffset);
- xegpu::setDistributeLayoutAttr(steps->getResult(0),
- layout.dropSgLayoutAndData());
- xegpu::setDistributeLayoutAttr(bcastOffset->getResult(0),
- layout.dropSgLayoutAndData());
- xegpu::setDistributeLayoutAttr(finalSteps->getResult(0),
- layout.dropSgLayoutAndData());
+ xegpu::setTempDistributeLayoutAttr(steps->getResult(0),
+ layout.dropSgLayoutAndData());
+ xegpu::setTempDistributeLayoutAttr(bcastOffset->getResult(0),
+ layout.dropSgLayoutAndData());
+ xegpu::setTempDistributeLayoutAttr(finalSteps->getResult(0),
+ layout.dropSgLayoutAndData());
newOps.push_back(finalSteps);
}
@@ -1154,8 +1155,8 @@ struct WgToSgVectorShapeCastOp
for (auto src : adaptor.getSource()) {
auto newShapeCast = vector::ShapeCastOp::create(rewriter, op.getLoc(),
newResultType, src);
- xegpu::setDistributeLayoutAttr(newShapeCast->getResult(0),
- layout.dropSgLayoutAndData());
+ xegpu::setTempDistributeLayoutAttr(newShapeCast->getResult(0),
+ layout.dropSgLayoutAndData());
newShapeCastOps.push_back(newShapeCast.getResult());
}
@@ -1216,8 +1217,8 @@ struct WgToSgMultiDimReductionOp
auto newOp = vector::MultiDimReductionOp::create(
rewriter, op.getLoc(), newDstType, op.getKind(), sgSrc,
adaptor.getAcc()[0], op.getReductionDims());
- xegpu::setDistributeLayoutAttr(newOp->getResult(0),
- layout.dropSgLayoutAndData());
+ xegpu::setTempDistributeLayoutAttr(newOp->getResult(0),
+ layout.dropSgLayoutAndData());
newReductions.push_back(newOp.getResult());
}
@@ -1280,8 +1281,8 @@ struct WgToSgVectorTransposeOp
for (auto src : adaptor.getVector()) {
auto newTranspose = vector::TransposeOp::create(
rewriter, op.getLoc(), newResultType, src, permutation);
- xegpu::setDistributeLayoutAttr(newTranspose->getResult(0),
- layout.dropSgLayoutAndData());
+ xegpu::setTempDistributeLayoutAttr(newTranspose->getResult(0),
+ layout.dropSgLayoutAndData());
newTransposeOps.push_back(newTranspose.getResult());
}
@@ -1350,8 +1351,8 @@ struct WgToSgVectorMaskOp : public OpConversionPattern<MaskOpType> {
auto newCreateMaskOp =
vector::CreateMaskOp::create(rewriter, loc, resultType, maskOperands);
- xegpu::setDistributeLayoutAttr(newCreateMaskOp->getResult(0),
- layout.dropSgLayoutAndData());
+ xegpu::setTempDistributeLayoutAttr(newCreateMaskOp->getResult(0),
+ layout.dropSgLayoutAndData());
newCreateMaskOps.push_back(newCreateMaskOp.getResult());
}
More information about the cfe-commits
mailing list