[Mlir-commits] [mlir] [mlir][xegpu] SIMT distribution patterns for XeGPU CreateNdTdesc, LoadNd, StoreNd and Dpas Ops. (PR #135271)

Charitha Saumya llvmlistbot at llvm.org
Mon Apr 14 14:25:18 PDT 2025


https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/135271

>From 39dcf9dbcd85ee7f9b413f6ae01128420d0f7ad0 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 18 Mar 2025 20:32:24 +0000
Subject: [PATCH 01/45] save work

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  8 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        |  6 +-
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 74 +++++++++++++++++--
 3 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 78c242571935c..f09919f99c756 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -131,10 +131,10 @@ LogicalResult
 SGMapAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
                   llvm::ArrayRef<uint32_t> wi_layout,
                   llvm::ArrayRef<uint32_t> wi_data) {
-  if (wi_layout.size() != 2)
-    return emitError() << "expected wi_layout of size 2";
-  if (wi_data.size() != 2)
-    return emitError() << "expected wi_data of size 2";
+  if (wi_layout.size() != 1 && wi_layout.size() != 2)
+    return emitError() << "expected 1D or 2D wi_layout";
+  if (wi_data.size() != 1 && wi_data.size() != 2)
+    return emitError() << "expected 1D or 2D wi_data";
   return success();
 }
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 3bdf3fb218b45..89b96383699f6 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -95,12 +95,14 @@ isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
   assert(succeeded(expectedValueShapeOrFailure) &&
          "Failed to compute distributed vector shape for "
          "tensor descriptor ");
+  bool isSIMD = valueShape == adjustedTdescShape;
+  bool isSIMT = valueShape == expectedValueShapeOrFailure.value().getShape();
 
-  return valueTy == expectedValueShapeOrFailure.value()
+  return (isSIMD || isSIMT)
              ? success()
              : emitError()
                    << "Result shape " << makeString(valueShape)
-                   << " is not consistent with distributed vector shape "
+                   << " is not consistent with SIMD/SIMT vector shape "
                    << makeString(expectedValueShapeOrFailure.value().getShape())
                    << " for tensor descriptor " << tdescTy;
 }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 9531837625878..2e8f91b252ab0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -15,9 +15,16 @@
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/Visitors.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/raw_ostream.h"
 
 namespace mlir {
@@ -120,6 +127,8 @@ struct SGMap {
 
   const WiLayout &getLayout() const { return wiLayout; }
   const WiData &getData() const { return wiData; }
+  ArrayRef<int64_t> getLayoutAsArrayRef() const { return wiLayout.layout; }
+  ArrayRef<int64_t> getDataAsArrayRef() const { return wiData.layout; }
 };
 
 void SGMap::print(raw_ostream &os) const {
@@ -223,6 +232,10 @@ static SGMap getSGMapForDPASOperand(VectorType vectorTy, unsigned operandNum) {
   return getDefaultSgMap(vectorTy);
 }
 
+static SGMap getSupportedSGMapForOp(Operation *op) {
+  return getDefaultSgMap(2);
+}
+
 ///===----------------------------------------------------------------------===///
 /// SGMapPropagation
 ///===----------------------------------------------------------------------===///
@@ -634,6 +647,56 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
   }
 }
 
+static LogicalResult
+attachLayoutAttributes(Operation *top,
+                       llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
+  /// Helper to convert SGMap to xegpu::SGMapAttr.
+  auto getSGMapForResult = [&](Value r) -> Attribute {
+    auto layout = getPropagatedLayout(r);
+    if (!layout.isAssigned())
+      return {};
+    SmallVector<uint32_t, 2> wiLayout, wiData;
+    for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
+                                               layout.getDataAsArrayRef())) {
+      wiLayout.push_back(static_cast<uint32_t>(layout));
+      wiData.push_back(static_cast<uint32_t>(data));
+    }
+    return xegpu::SGMapAttr::get(top->getContext(), wiLayout, wiData);
+  };
+  /// Attach the layout attributes to the results of the operations.
+  top->walk([&](Operation *op) {
+    /// If no results, skip the operation.
+    if (op->getNumResults() == 0)
+      return;
+    if (auto tensorDescTy =
+            dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
+      auto sgMapAttr = getSGMapForResult(op->getResult(0));
+      if (!sgMapAttr)
+        op->emitError("Expecting a layout for the result tensor descriptor.");
+      /// Clone the op, attach the sg_map to the result tensor descriptor, and
+      /// remove the original op.
+      OpBuilder builder(op);
+      auto *newOp = builder.clone(*op);
+      auto newTensorDescTy = xegpu::TensorDescType::get(
+          tensorDescTy.getContext(), tensorDescTy.getShape(),
+          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), sgMapAttr);
+      newOp->getResult(0).setType(newTensorDescTy);
+      op->replaceAllUsesWith(newOp->getResults());
+      op->erase();
+      return;
+    }
+    /// Otherwise simply attach the sg_map to the op itself.
+    for (auto [i, r] : llvm::enumerate(op->getResults())) {
+      auto sgMapAttr = getSGMapForResult(r);
+      if (sgMapAttr) {
+        auto attrName = "r" + std::to_string(i);
+        op->setAttr(attrName, sgMapAttr);
+      }
+    }
+  });
+  return success();
+}
+
 namespace {
 struct XeGPUSubgroupDistributePass final
     : public xegpu::impl::XeGPUSubgroupDistributeBase<
@@ -648,13 +711,14 @@ struct XeGPUSubgroupDistributePass final
 } // namespace
 
 void XeGPUSubgroupDistributePass::runOnOperation() {
-  Operation *op = getOperation();
-  RunSGMapPropagation solver(op);
-
-  // Print the analysis result and exit.
+  auto &analyis = getAnalysis<RunSGMapPropagation>();
+  // Print the analysis result and exit. (for testing purposes)
   if (printOnly) {
     auto &os = llvm::outs();
-    solver.printAnalysisResult(os);
+    analyis.printAnalysisResult(os);
     return;
   }
+  auto getPropagatedLayout = [&](Value val) { return analyis.getSGMap(val); };
+  if (failed(attachLayoutAttributes(getOperation(), getPropagatedLayout)))
+    signalPassFailure();
 }
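
Note on the attribute naming used above: attachLayoutAttributes stores the propagated layout of result i as a discardable attribute named "r" + i. A minimal sketch of how such an attribute could be read back under that convention follows; the helper below is hypothetical and not part of this patch.

// Hypothetical helper (not part of this patch): query the sg_map layout that
// attachLayoutAttributes attached to result `resultIdx` of `op` under the
// "r<i>" attribute-naming convention used above. Returns null if no layout
// was attached to that result.
static xegpu::SGMapAttr getAttachedResultLayout(Operation *op,
                                                unsigned resultIdx) {
  std::string attrName = "r" + std::to_string(resultIdx);
  return mlir::dyn_cast_or_null<xegpu::SGMapAttr>(op->getAttr(attrName));
}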

>From 20587736992b0eac55454006b67d53225d5c9494 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 20 Mar 2025 21:12:33 +0000
Subject: [PATCH 02/45] moving all ops to region working

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 108 ++++++-------
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 148 +++++++++++++++++-
 2 files changed, 196 insertions(+), 60 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 89b96383699f6..f82084eed6570 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -569,60 +569,60 @@ void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
 // XeGPU_DpasOp
 //===----------------------------------------------------------------------===//
 LogicalResult DpasOp::verify() {
-  int64_t lhsRank = getLhsType().getRank();
-  int64_t rhsRank = getRhsType().getRank();
-  int64_t resultRank = getResultType().getRank();
-  auto lhsShape = getLhsType().getShape();
-  auto rhsShape = getRhsType().getShape();
-  auto resultShape = getResultType().getShape();
-
-  auto sgMapA = getSgMapAAttr();
-  auto sgMapB = getSgMapBAttr();
-  auto sgMapC = getSgMapCAttr();
-
-  // If sg_maps are not present, then the operation is in SIMD mode.
-  if (!sgMapA && !sgMapB && !sgMapC) {
-    if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resultRank != 2)
-      return emitOpError(
-          "expecting lhs and result to be a 2D vector, and rhs to be either "
-          "2D or 3D (packed) vector.");
-    auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
-    if (bK != lhsShape[1])
-      return emitOpError("K-dimension mismatch.");
-    if (lhsShape[0] != resultShape[0])
-      return emitOpError("M-dimension mismatch.");
-    if (rhsShape[1] != resultShape[1])
-      return emitOpError("N-dimension mismatch.");
-    return success();
-  }
-  // Otherwise, in SIMT mode we expect sg_map attributes for all operands and
-  // result of DPAS operation.
-  if (!sgMapA || !sgMapB || !sgMapC)
-    return emitOpError("sg_map attributes for all operands and outputs are "
-                       "expected in SIMT xegpu::Dpas operation");
-
-  // In SIMT mode, All data fragments must be 2D
-  if (lhsRank != 2 || rhsRank != 2 || resultRank != 2)
-    return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
-
-  auto wiLayoutA = sgMapA.getWiLayout();
-  auto wiLayoutB = sgMapB.getWiLayout();
-  auto wiLayoutC = sgMapC.getWiLayout();
-  // Obtain the expanded shapes of the operands and result using wi_layout.
-  // NOTE: For B, get rid of the packed dimension for the expanded shape.
-  SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
-                                         lhsShape[1] * wiLayoutA[1]};
-  SmallVector<int64_t> expandedShapeB = {
-      rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
-  SmallVector<int64_t> expandedShapeC = {resultShape[0] * wiLayoutC[0],
-                                         resultShape[1] * wiLayoutC[1]};
-  auto bK = expandedShapeB[0];
-  if (bK != expandedShapeA[1])
-    return emitOpError("K-dimension mismatch.");
-  if (expandedShapeA[0] != expandedShapeC[0])
-    return emitOpError("M-dimension mismatch.");
-  if (expandedShapeB[1] != expandedShapeC[1])
-    return emitOpError("N-dimension mismatch.");
+  // int64_t lhsRank = getLhsType().getRank();
+  // int64_t rhsRank = getRhsType().getRank();
+  // int64_t resultRank = getResultType().getRank();
+  // auto lhsShape = getLhsType().getShape();
+  // auto rhsShape = getRhsType().getShape();
+  // auto resultShape = getResultType().getShape();
+
+  // auto sgMapA = getSgMapAAttr();
+  // auto sgMapB = getSgMapBAttr();
+  // auto sgMapC = getSgMapCAttr();
+
+  // // If sg_maps are not present, then the operation is in SIMD mode.
+  // if (!sgMapA && !sgMapB && !sgMapC) {
+  //   if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resultRank != 2)
+  //     return emitOpError(
+  //         "expecting lhs and result to be a 2D vector, and rhs to be either "
+  //         "2D or 3D (packed) vector.");
+  //   auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
+  //   if (bK != lhsShape[1])
+  //     return emitOpError("K-dimension mismatch.");
+  //   if (lhsShape[0] != resultShape[0])
+  //     return emitOpError("M-dimension mismatch.");
+  //   if (rhsShape[1] != resultShape[1])
+  //     return emitOpError("N-dimension mismatch.");
+  //   return success();
+  // }
+  // // Otherwise, in SIMT mode we expect sg_map attributes for all operands and
+  // // result of DPAS operation.
+  // if (!sgMapA || !sgMapB || !sgMapC)
+  //   return emitOpError("sg_map attributes for all operands and outputs are "
+  //                      "expected in SIMT xegpu::Dpas operation");
+
+  // // In SIMT mode, All data fragments must be 2D
+  // if (lhsRank != 2 || rhsRank != 2 || resultRank != 2)
+  //   return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
+
+  // auto wiLayoutA = sgMapA.getWiLayout();
+  // auto wiLayoutB = sgMapB.getWiLayout();
+  // auto wiLayoutC = sgMapC.getWiLayout();
+  // // Obtain the expanded shapes of the operands and result using wi_layout.
+  // // NOTE: For B, get rid of the packed dimension for the expanded shape.
+  // SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
+  //                                        lhsShape[1] * wiLayoutA[1]};
+  // SmallVector<int64_t> expandedShapeB = {
+  //     rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
+  // SmallVector<int64_t> expandedShapeC = {resultShape[0] * wiLayoutC[0],
+  //                                        resultShape[1] * wiLayoutC[1]};
+  // auto bK = expandedShapeB[0];
+  // if (bK != expandedShapeA[1])
+  //   return emitOpError("K-dimension mismatch.");
+  // if (expandedShapeA[0] != expandedShapeC[0])
+  //   return emitOpError("M-dimension mismatch.");
+  // if (expandedShapeB[1] != expandedShapeC[1])
+  //   return emitOpError("N-dimension mismatch.");
 
   return success();
 }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 2e8f91b252ab0..8ec817693a183 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -17,9 +17,17 @@
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeRange.h"
+#include "mlir/IR/Value.h"
 #include "mlir/IR/Visitors.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
@@ -34,6 +42,9 @@ namespace xegpu {
 } // namespace xegpu
 } // namespace mlir
 
+#define DEBUG_TYPE "xegpu-subgroup-distribute"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
 using namespace mlir;
 using namespace mlir::dataflow;
 
@@ -647,6 +658,27 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
   }
 }
 
+void attachLayoutAttributeToUsers(Value v, Attribute layout) {
+  for (OpOperand &user : v.getUses()) {
+    Operation *owner = user.getOwner();
+    unsigned operandNumber = user.getOperandNumber();
+    /// If the user is a DpasOp, set "sg_map_a", "sg_map_b", or "sg_map_c"
+    /// attribute.
+    if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
+      if (operandNumber == 0)
+        dpasOp->setAttr("sg_map_a", layout);
+      else if (operandNumber == 1)
+        dpasOp->setAttr("sg_map_b", layout);
+      else if (operandNumber == 2)
+        dpasOp->setAttr("sg_map_c", layout);
+      continue;
+    }
+    /// For every other user, use a generic attribute name.
+    std::string attrName = "op" + std::to_string(operandNumber);
+    owner->setAttr(attrName, layout);
+  }
+}
+
 static LogicalResult
 attachLayoutAttributes(Operation *top,
                        llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
@@ -664,15 +696,18 @@ attachLayoutAttributes(Operation *top,
     return xegpu::SGMapAttr::get(top->getContext(), wiLayout, wiData);
   };
   /// Attach the layout attributes to the results of the operations.
-  top->walk([&](Operation *op) {
-    /// If no results, skip the operation.
+  auto walkResult = top->walk([&](Operation *op) {
+    /// If no results, move on.
     if (op->getNumResults() == 0)
-      return;
+      return WalkResult::advance();
     if (auto tensorDescTy =
             dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
       auto sgMapAttr = getSGMapForResult(op->getResult(0));
-      if (!sgMapAttr)
-        op->emitError("Expecting a layout for the result tensor descriptor.");
+      if (!sgMapAttr) {
+        LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
+        return WalkResult::interrupt();
+      }
+
       /// Clone the op, attach the sg_map to the result tensor descriptor, and
       /// remove the original op.
       OpBuilder builder(op);
@@ -683,7 +718,7 @@ attachLayoutAttributes(Operation *top,
       newOp->getResult(0).setType(newTensorDescTy);
       op->replaceAllUsesWith(newOp->getResults());
       op->erase();
-      return;
+      return WalkResult::advance();
     }
     /// Otherwise simply attach the sg_map to the op itself.
     for (auto [i, r] : llvm::enumerate(op->getResults())) {
@@ -691,12 +726,104 @@ attachLayoutAttributes(Operation *top,
       if (sgMapAttr) {
         auto attrName = "r" + std::to_string(i);
         op->setAttr(attrName, sgMapAttr);
+        /// Attach the layout attribute to the users of the result.
+        attachLayoutAttributeToUsers(r, sgMapAttr);
       }
     }
+    return WalkResult::advance();
   });
+
+  return failure(walkResult.wasInterrupted());
+}
+
+static LogicalResult resolveLayoutConflicts(Operation *top) {
+  /// TODO: Implement the layout conflict resolution.
   return success();
 }
 
+namespace {
+
+struct MoveFuncBodyToWarpExecuteOnLane0
+    : public OpRewritePattern<gpu::GPUFuncOp> {
+  using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFunc,
+                                PatternRewriter &rewriter) const override {
+    // if the function contains warp_execute_on_lane0, return
+    if (llvm::any_of(gpuFunc.getBody().getOps(), [](Operation &op) {
+          return isa<gpu::WarpExecuteOnLane0Op>(op);
+        }))
+      return failure();
+    // // if the first op is already warp_execute_on_lane0, return
+    // auto &body = gpuFunc.getBody();
+    // auto &entryBlock = body.front();
+    // if (entryBlock.empty())
+    //   return failure();
+    // // llvm::errs() << "entry block: " << entryBlock << "\n";
+    // auto &firstOp = entryBlock.front();
+    // if (isa<gpu::LaneIdOp>(firstOp))
+    //   return failure();
+
+    // llvm::errs() << "First op: " << firstOp << "\n";
+
+    // create a new function with the same signature
+    auto newFunc = rewriter.create<gpu::GPUFuncOp>(
+        gpuFunc.getLoc(), gpuFunc.getName(), gpuFunc.getFunctionType());
+    rewriter.setInsertionPointToEnd(&newFunc.getFunctionBody().front());
+    auto laneId = rewriter.create<gpu::LaneIdOp>(
+        newFunc.getLoc(), rewriter.getIndexType(),
+        /** upperBound = **/ mlir::IntegerAttr());
+
+    // rewriter.startOpModification(gpuFunc);
+    // rewriter.setInsertionPoint(&firstOp);
+    auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
+        laneId.getLoc(), newFunc->getResultTypes(), laneId, subgroupSize,
+        newFunc.getArguments(), newFunc.getArgumentTypes());
+    auto &warpBodyBlock = warpOp.getBodyRegion().front();
+    auto origRetunOp =
+        cast<gpu::ReturnOp>(gpuFunc.getBlocks().back().getTerminator());
+    rewriter.setInsertionPointAfter(origRetunOp);
+    rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
+                                  origRetunOp.getOperands());
+    // erase return op
+    rewriter.eraseOp(origRetunOp);
+    // auto returnOp =
+    // cast<gpu::ReturnOp>(gpuFunc.getBlocks().end()->getTerminator());
+    // rewriter.startOpModification(returnOp);
+    // rewriter.replaceOpWithNewOp<gpu::YieldOp>(returnOp,
+    // newFunc.getArguments()); rewriter.finalizeOpModification(returnOp);
+    rewriter.inlineRegionBefore(gpuFunc.getBody(), warpOp.getBodyRegion(),
+                                warpOp.getBodyRegion().begin());
+    rewriter.eraseBlock(&warpBodyBlock);
+    // auto &newWarpBody = warpOp.getBodyRegion();
+    // auto returnOp = cast<gpu::ReturnOp>(newWarpBody.end()->getTerminator());
+    // rewriter.replaceOpWithNewOp<gpu::YieldOp>(returnOp,
+    // returnOp.getOperands());
+    // rewriter.setInsertionPointToEnd(&warpOp.getBodyRegion().front());
+    // add a gpu.yield
+    // rewriter.create<gpu::YieldOp>(warpOp.getLoc(), warpOp.getResults());
+    // rewriter.inlineRegionBefore(gpuFunc.getBody(),
+    // warpOp.getBodyRegion(),
+    rewriter.setInsertionPointAfter(warpOp);
+    rewriter.create<gpu::ReturnOp>(newFunc.getLoc(), warpOp.getResults());
+    //                             warpOp.getBodyRegion().begin());
+    // // rewriter.eraseOp(gpuFunc);
+    // // get the function return op
+    // auto returnOp = cast<gpu::ReturnOp>(warpOp.getBody()->getTerminator());
+    // rewriter.replaceOpWithNewOp<gpu::YieldOp>(returnOp,
+    // returnOp.getOperands());
+    // // rewriter.eraseOp(returnOp);
+    // // create a new function return which retuns the result of the warp
+    // rewriter.setInsertionPointAfter(warpOp);
+    // rewriter.create<gpu::ReturnOp>(warpOp.getLoc(), warpOp.getResults());
+    // rewriter.finalizeOpModification(gpuFunc);
+    rewriter.replaceOp(gpuFunc, newFunc);
+    return success();
+  }
+};
+
+} // namespace
+
 namespace {
 struct XeGPUSubgroupDistributePass final
     : public xegpu::impl::XeGPUSubgroupDistributeBase<
@@ -721,4 +848,13 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
   auto getPropagatedLayout = [&](Value val) { return analyis.getSGMap(val); };
   if (failed(attachLayoutAttributes(getOperation(), getPropagatedLayout)))
     signalPassFailure();
+  if (failed(resolveLayoutConflicts(getOperation())))
+    signalPassFailure();
+  /// Move all operations inside GPU functions into a
+  /// gpu.warp_execute_on_lane0 region.
+  {
+    RewritePatternSet patterns(&getContext());
+    patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+  }
 }

>From 14233fa812ac0f1743547a1cd13ac672b1a64a7e Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 20 Mar 2025 21:35:24 +0000
Subject: [PATCH 03/45] moving all ops to region working

---
 .../Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp    | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 8ec817693a183..73256b822db29 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -855,6 +855,10 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
   {
     RewritePatternSet patterns(&getContext());
     patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
-    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+    GreedyRewriteConfig config;
+    config.fold = false;
+    // config.cseConstants = false;
+    // config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled;
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
   }
 }

>From f599873a8cbb823b1134993eb8a22591b0a409db Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 20 Mar 2025 22:25:31 +0000
Subject: [PATCH 04/45] save work

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 84 +++++--------------
 1 file changed, 23 insertions(+), 61 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 73256b822db29..82401f542543e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -243,10 +243,6 @@ static SGMap getSGMapForDPASOperand(VectorType vectorTy, unsigned operandNum) {
   return getDefaultSgMap(vectorTy);
 }
 
-static SGMap getSupportedSGMapForOp(Operation *op) {
-  return getDefaultSgMap(2);
-}
-
 ///===----------------------------------------------------------------------===///
 /// SGMapPropagation
 ///===----------------------------------------------------------------------===///
@@ -747,77 +743,42 @@ struct MoveFuncBodyToWarpExecuteOnLane0
     : public OpRewritePattern<gpu::GPUFuncOp> {
   using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFunc,
+  LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                 PatternRewriter &rewriter) const override {
-    // if the function contains warp_execute_on_lane0, return
-    if (llvm::any_of(gpuFunc.getBody().getOps(), [](Operation &op) {
+    /// If the function all ready moved inside a warp_execute_on_lane0, skip.
+    if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
           return isa<gpu::WarpExecuteOnLane0Op>(op);
         }))
       return failure();
-    // // if the first op is already warp_execute_on_lane0, return
-    // auto &body = gpuFunc.getBody();
-    // auto &entryBlock = body.front();
-    // if (entryBlock.empty())
-    //   return failure();
-    // // llvm::errs() << "entry block: " << entryBlock << "\n";
-    // auto &firstOp = entryBlock.front();
-    // if (isa<gpu::LaneIdOp>(firstOp))
-    //   return failure();
-
-    // llvm::errs() << "First op: " << firstOp << "\n";
-
-    // create a new function with the same signature
-    auto newFunc = rewriter.create<gpu::GPUFuncOp>(
-        gpuFunc.getLoc(), gpuFunc.getName(), gpuFunc.getFunctionType());
-    rewriter.setInsertionPointToEnd(&newFunc.getFunctionBody().front());
+    /// Create a new function with the same signature.
+    auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
+        gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
+    /// Create a WarpExecuteOnLane0Op with same arguments and results as the
+    /// original gpuFuncOp.
+    rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
     auto laneId = rewriter.create<gpu::LaneIdOp>(
-        newFunc.getLoc(), rewriter.getIndexType(),
+        newGpuFunc.getLoc(), rewriter.getIndexType(),
         /** upperBound = **/ mlir::IntegerAttr());
-
-    // rewriter.startOpModification(gpuFunc);
-    // rewriter.setInsertionPoint(&firstOp);
+    auto gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
     auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
-        laneId.getLoc(), newFunc->getResultTypes(), laneId, subgroupSize,
-        newFunc.getArguments(), newFunc.getArgumentTypes());
+        laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
+        newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
     auto &warpBodyBlock = warpOp.getBodyRegion().front();
+    /// Replace the ReturnOp of the original gpu function with a YieldOp.
     auto origRetunOp =
-        cast<gpu::ReturnOp>(gpuFunc.getBlocks().back().getTerminator());
+        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
     rewriter.setInsertionPointAfter(origRetunOp);
     rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
                                   origRetunOp.getOperands());
-    // erase return op
     rewriter.eraseOp(origRetunOp);
-    // auto returnOp =
-    // cast<gpu::ReturnOp>(gpuFunc.getBlocks().end()->getTerminator());
-    // rewriter.startOpModification(returnOp);
-    // rewriter.replaceOpWithNewOp<gpu::YieldOp>(returnOp,
-    // newFunc.getArguments()); rewriter.finalizeOpModification(returnOp);
-    rewriter.inlineRegionBefore(gpuFunc.getBody(), warpOp.getBodyRegion(),
+    /// Move the original function body to the warp body.
+    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
                                 warpOp.getBodyRegion().begin());
     rewriter.eraseBlock(&warpBodyBlock);
-    // auto &newWarpBody = warpOp.getBodyRegion();
-    // auto returnOp = cast<gpu::ReturnOp>(newWarpBody.end()->getTerminator());
-    // rewriter.replaceOpWithNewOp<gpu::YieldOp>(returnOp,
-    // returnOp.getOperands());
-    // rewriter.setInsertionPointToEnd(&warpOp.getBodyRegion().front());
-    // add a gpu.yield
-    // rewriter.create<gpu::YieldOp>(warpOp.getLoc(), warpOp.getResults());
-    // rewriter.inlineRegionBefore(gpuFunc.getBody(),
-    // warpOp.getBodyRegion(),
+    /// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
     rewriter.setInsertionPointAfter(warpOp);
-    rewriter.create<gpu::ReturnOp>(newFunc.getLoc(), warpOp.getResults());
-    //                             warpOp.getBodyRegion().begin());
-    // // rewriter.eraseOp(gpuFunc);
-    // // get the function return op
-    // auto returnOp = cast<gpu::ReturnOp>(warpOp.getBody()->getTerminator());
-    // rewriter.replaceOpWithNewOp<gpu::YieldOp>(returnOp,
-    // returnOp.getOperands());
-    // // rewriter.eraseOp(returnOp);
-    // // create a new function return which retuns the result of the warp
-    // rewriter.setInsertionPointAfter(warpOp);
-    // rewriter.create<gpu::ReturnOp>(warpOp.getLoc(), warpOp.getResults());
-    // rewriter.finalizeOpModification(gpuFunc);
-    rewriter.replaceOp(gpuFunc, newFunc);
+    rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
+    rewriter.replaceOp(gpuFuncOp, newGpuFunc);
     return success();
   }
 };
@@ -855,10 +816,11 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
   {
     RewritePatternSet patterns(&getContext());
     patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+    /// We want to avoid ops being hoisted out of the gpu.warp_execute_on_lane0
+    /// region.
     GreedyRewriteConfig config;
+    config.cseConstants = false;
     config.fold = false;
-    // config.cseConstants = false;
-    // config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled;
     (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
   }
 }

>From 220ed1f95c6b55ab0f4fd932fd2d6b51caa04777 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 21 Mar 2025 01:41:57 +0000
Subject: [PATCH 05/45] save work

---
 mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 82401f542543e..09e4e2ce2feba 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -745,7 +745,7 @@ struct MoveFuncBodyToWarpExecuteOnLane0
 
   LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                 PatternRewriter &rewriter) const override {
-    /// If the function all ready moved inside a warp_execute_on_lane0, skip.
+    /// If the function already moved inside a warp_execute_on_lane0, skip.
     if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
           return isa<gpu::WarpExecuteOnLane0Op>(op);
         }))
@@ -771,7 +771,7 @@ struct MoveFuncBodyToWarpExecuteOnLane0
     rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
                                   origRetunOp.getOperands());
     rewriter.eraseOp(origRetunOp);
-    /// Move the original function body to the warp body.
+    /// Move the original function body to the WarpExecuteOnLane0Op body.
     rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
                                 warpOp.getBodyRegion().begin());
     rewriter.eraseBlock(&warpBodyBlock);

>From 2a8070feae17b213c18b18510d1b09b088f7d274 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 21 Mar 2025 16:04:53 +0000
Subject: [PATCH 06/45] save work

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 455 ++++++++++++++++--
 1 file changed, 417 insertions(+), 38 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 09e4e2ce2feba..db8c321487a1c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -10,6 +10,7 @@
 #include "mlir/Analysis/DataFlow/SparseAnalysis.h"
 #include "mlir/Analysis/DataFlowFramework.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
@@ -739,52 +740,430 @@ static LogicalResult resolveLayoutConflicts(Operation *top) {
 
 namespace {
 
+///===----------------------------------------------------------------------===///
+/// SIMT Distribution Patterns
+///===----------------------------------------------------------------------===///
+
+/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
+/// of the original GPUFuncOp to the new GPUFuncOp such that the entire body is
+/// contained within a WarpExecuteOnLane0Op.
+/// Example:
+///
+/// ```
+///   gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+///     ...
+///     ...
+///     gpu.return %result: vector<8x16xf32>
+///   }
+/// ```
+/// To
+/// ```
+///   gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+///     %laneid = gpu.lane_id : index
+///     %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
+///       ...
+///       ...
+///       gpu.yield %result: vector<8x16xf32>
+///     }
+///     gpu.return %0
+///   }
 struct MoveFuncBodyToWarpExecuteOnLane0
     : public OpRewritePattern<gpu::GPUFuncOp> {
   using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
-
   LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
-                                PatternRewriter &rewriter) const override {
-    /// If the function already moved inside a warp_execute_on_lane0, skip.
-    if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
-          return isa<gpu::WarpExecuteOnLane0Op>(op);
-        }))
-      return failure();
-    /// Create a new function with the same signature.
-    auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
-        gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
-    /// Create a WarpExecuteOnLane0Op with same arguments and results as the
-    /// original gpuFuncOp.
-    rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
-    auto laneId = rewriter.create<gpu::LaneIdOp>(
-        newGpuFunc.getLoc(), rewriter.getIndexType(),
-        /** upperBound = **/ mlir::IntegerAttr());
-    auto gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
-    auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
-        laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
-        newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
-    auto &warpBodyBlock = warpOp.getBodyRegion().front();
-    /// Replace the ReturnOp of the original gpu function with a YieldOp.
-    auto origRetunOp =
-        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
-    rewriter.setInsertionPointAfter(origRetunOp);
-    rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
-                                  origRetunOp.getOperands());
-    rewriter.eraseOp(origRetunOp);
-    /// Move the original function body to the WarpExecuteOnLane0Op body.
-    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
-                                warpOp.getBodyRegion().begin());
-    rewriter.eraseBlock(&warpBodyBlock);
-    /// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
-    rewriter.setInsertionPointAfter(warpOp);
-    rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
-    rewriter.replaceOp(gpuFuncOp, newGpuFunc);
-    return success();
-  }
+                                PatternRewriter &rewriter) const override;
+};
+
+/// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op. The warp op will
+/// still contain the original op that will not be used by the yield op (and
+/// should be cleaned up later with dce). The yield op will bypass the
+/// create_nd_tdesc's arguments. Tensor descriptor is not distributed because it
+/// is a uniform value across all work items within the subgroup.
+///
+/// Example:
+///
+/// ```
+///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
+///                   (!xegpu.tensor_desc<4x8xf32>) {
+///     ...
+///     %td = xegpu.create_nd_tdesc %arg0[0, 0]
+///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+///     vector.yield %td
+///   }
+/// ```
+/// To
+/// ```
+///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
+///     ...
+///     %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+///     vector.yield %arg0, %dead
+///   }
+///   %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
+///                                 -> !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override;
+};
+
+/// Sink a store_nd op at the end of enclosing `gpu.warp_execute_on_lane_0`. In
+/// case arguments for the store are passed through the warp op interface they
+/// would be propagated as returned values. Only the source vector for the store
+/// is distributed according to the sg_map attribute.
+///
+/// Example:
+///
+/// ```
+///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+///   gpu.warp_execute_on_lane_0(%laneid) -> () {
+///     ...
+///     xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+///                                 !xegpu.tensor_desc<4x8xf32>
+///   }
+/// ```
+/// To
+/// ```
+///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
+///     gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32>
+///   }
+///   xegpu.store_nd %r#0, %r#1: vector<4x1xf32>,
+///     !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override;
+};
+
+/// Clone a load_nd feeding into vector.yield op for the enclosing
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later with dce). The yield op will
+/// bypass the load's arguments. Only the loaded vector is distributed according
+/// to the sg_map attribute; the tensor descriptor type is not distributed.
+///
+/// Example:
+///
+/// ```
+///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
+///                   (vector<4x1xf32>) {
+///     ...
+///     %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32> ->
+///       vector<4x8xf32>
+///     gpu.yield %ld
+///   }
+/// ```
+/// To
+/// ```
+///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
+///     ...
+///     %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32> ->
+///     vector<4x8xf32> gpu.yield %arg0, %arg1
+///   }
+///   %ld = xegpu.load_nd %r#0: !xegpu.tensor_desc<4x8xf32> -> vector<4x1xf32>
+///
+/// ```
+struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override;
+};
+
+struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override;
 };
 
 } // namespace
 
+/// Returns the distributed vector type for a source vector type according to
+/// the wi_layout. We simply divide each dimension of the tensor descriptor shape by
+/// the corresponding wi_layout dimension. If array_length > 1, it is appended to
+/// the front of the distributed shape.
+/// Examples:
+/// | original vector shape | wi_layout | distributed vector shape |
+/// |-----------------------|-----------|--------------------------|
+/// | 32x16                 | [1, 16]   | 32x1                     |
+/// | 32x16                 | [2, 8]    | 16x2                     |
+/// | 2x32x16               | [1, 16]   | 2x32x1                   |
+FailureOr<VectorType> getDistributedVectorType(xegpu::SGMapAttr sgMap,
+                                               VectorType originalType) {
+  llvm::SmallVector<int64_t, 2> distributedShape;
+  if (!sgMap)
+    return failure();
+
+  auto wiLayout = sgMap.getWiLayout();
+  assert((originalType.getRank() == 2 || originalType.getRank() == 3) &&
+         "expecting 2D or 3D shape for the original vector type");
+  assert(wiLayout.size() == 2 && "expecting 2D shape for the wi layout");
+  // Original type can be 2D or 3D (array_length > 1); the last two dims are the
+  // block shape.
+  auto blockShape = originalType.getShape().take_back(2);
+  // Check if the block vector shape can be distributed evenly.
+  if (blockShape[0] % wiLayout[0] != 0 || blockShape[1] % wiLayout[1] != 0)
+    return failure();
+
+  if (originalType.getRank() == 3) {
+    distributedShape.push_back(originalType.getShape()[0]);
+  }
+  for (unsigned i = 0; i < 2; ++i) {
+    distributedShape.push_back(blockShape[i] / wiLayout[i]);
+  }
+  auto newVectorType =
+      VectorType::get(distributedShape, originalType.getElementType());
+  return newVectorType;
+}
+
+LogicalResult MoveFuncBodyToWarpExecuteOnLane0::matchAndRewrite(
+    gpu::GPUFuncOp gpuFuncOp, PatternRewriter &rewriter) const {
+  /// If the function already moved inside a warp_execute_on_lane0, skip.
+  if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+        return isa<gpu::WarpExecuteOnLane0Op>(op);
+      }))
+    return failure();
+  /// Create a new function with the same signature.
+  auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
+      gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
+  /// Create a WarpExecuteOnLane0Op with same arguments and results as the
+  /// original gpuFuncOp.
+  rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
+  auto laneId = rewriter.create<gpu::LaneIdOp>(
+      newGpuFunc.getLoc(), rewriter.getIndexType(),
+      /** upperBound = **/ mlir::IntegerAttr());
+  auto gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
+  auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
+      laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
+      newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
+  auto &warpBodyBlock = warpOp.getBodyRegion().front();
+  /// Replace the ReturnOp of the original gpu function with a YieldOp.
+  auto origRetunOp =
+      cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
+  rewriter.setInsertionPointAfter(origRetunOp);
+  rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
+                                origRetunOp.getOperands());
+  rewriter.eraseOp(origRetunOp);
+  /// Move the original function body to the WarpExecuteOnLane0Op body.
+  rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
+                              warpOp.getBodyRegion().begin());
+  rewriter.eraseBlock(&warpBodyBlock);
+  /// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
+  rewriter.setInsertionPointAfter(warpOp);
+  rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
+  rewriter.replaceOp(gpuFuncOp, newGpuFunc);
+  return success();
+}
+
+LogicalResult
+SubgroupOpStoreNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                   PatternRewriter &rewriter) const {
+  auto yield = cast<gpu::YieldOp>(
+      subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
+  Operation *lastNode = yield->getPrevNode();
+  auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
+  if (!storeOp)
+    return failure();
+
+  auto tensorDescTy = storeOp.getTensorDescType();
+  xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
+  if (!sgMap)
+    return rewriter.notifyMatchFailure(
+        storeOp, "the source tensor descriptor lacks sg_map attribute");
+
+  if (storeOp.getTensorDescType().getShape().size() != 2)
+    return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
+
+  auto distributedTypeOrFailure =
+      getDistributedVectorType(sgMap, storeOp.getValueType());
+  if (failed(distributedTypeOrFailure))
+    return rewriter.notifyMatchFailure(storeOp,
+                                       "Failed to distribute the type");
+  VectorType newVectorType = distributedTypeOrFailure.value();
+
+  SmallVector<size_t> newRetIndices;
+  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+      rewriter, subgroupOp,
+      /* new yielded values = */
+      ValueRange{storeOp.getTensorDesc(), storeOp.getValue()},
+      /* new yielded types = */
+      TypeRange{storeOp.getTensorDescType(), newVectorType}, newRetIndices);
+
+  // Create a new store op outside the warp op with the distributed vector type.
+  // Tensor descriptor is not distributed.
+  rewriter.setInsertionPointAfter(newWarpOp);
+  auto newStoreOp =
+      cast<xegpu::StoreNdOp>(rewriter.clone(*storeOp.getOperation()));
+  rewriter.eraseOp(storeOp);
+  newStoreOp.getTensorDescMutable().assign(
+      newWarpOp.getResult(newRetIndices[0]));
+  newStoreOp.getValueMutable().assign(newWarpOp.getResult(newRetIndices[1]));
+
+  return success();
+}
+
+LogicalResult
+SubgroupOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                  PatternRewriter &rewriter) const {
+  OpOperand *operand = getWarpResult(subgroupOp, [](Operation *op) {
+    return isa<xegpu::LoadNdOp>(op) && op->hasOneUse();
+  });
+  if (!operand)
+    return rewriter.notifyMatchFailure(subgroupOp,
+                                       "warp result is not a xegpu::LoadNd op");
+
+  auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+  xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
+  xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
+  if (!sgMap)
+    return rewriter.notifyMatchFailure(
+        loadOp, "the source tensor descriptor lacks sg_map attribute");
+
+  auto tensorDecShape = tensorDescTy.getShape();
+  if (tensorDecShape.size() != 2)
+    return rewriter.notifyMatchFailure(loadOp,
+                                       "unsupported tensor descriptor shape");
+
+  auto distributedTypeOrFailure =
+      getDistributedVectorType(sgMap, loadOp.getType());
+  if (failed(distributedTypeOrFailure))
+    return rewriter.notifyMatchFailure(loadOp, "Failed to distribute the type");
+  VectorType newVectorType = distributedTypeOrFailure.value();
+
+  unsigned operandIdx = operand->getOperandNumber();
+  SmallVector<size_t> newRetIndices;
+  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+      rewriter, subgroupOp, /* new yielded values = */ loadOp.getTensorDesc(),
+      /* new yielded types = */ TypeRange{tensorDescTy}, newRetIndices);
+
+  // Create a new load op outside the warp op with the distributed vector type.
+  rewriter.setInsertionPointAfter(newWarpOp);
+  auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+      loadOp.getLoc(), newVectorType, loadOp.getTensorDesc(),
+      loadOp.getPackedAttr(), loadOp.getTransposeAttr(), loadOp.getL1HintAttr(),
+      loadOp.getL2HintAttr(), loadOp.getL3HintAttr());
+
+  newLoadOp.getTensorDescMutable().assign(
+      newWarpOp.getResult(newRetIndices[0]));
+  Value distributedVal = newWarpOp.getResult(operandIdx);
+  rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
+  return success();
+}
+
+LogicalResult
+SubgroupOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                        PatternRewriter &rewriter) const {
+  OpOperand *operand = getWarpResult(subgroupOp, [](Operation *op) {
+    return isa<xegpu::CreateNdDescOp>(op) && op->hasOneUse();
+  });
+
+  if (!operand)
+    return rewriter.notifyMatchFailure(
+        subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
+  auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+  unsigned operandIdx = operand->getOperandNumber();
+
+  auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
+  if (!srcTypedVal)
+    return rewriter.notifyMatchFailure(
+        descOp, "expecting a memref typed value as the source");
+
+  auto descOffsets = descOp.getMixedOffsets();
+  if (descOffsets.size() != 2)
+    return rewriter.notifyMatchFailure(descOp,
+                                       "offsets size is expected to be 2");
+
+  xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
+  if (!sgMap)
+    return rewriter.notifyMatchFailure(
+        descOp, "the tensor descriptor lacks sg_map attribute");
+
+  SmallVector<size_t> newRetIndices;
+  rewriter.setInsertionPoint(subgroupOp);
+  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+      rewriter, subgroupOp, /* new yielded values = */ descOp.getSource(),
+      /* new yielded types = */ descOp.getSourceType(), newRetIndices);
+
+  rewriter.setInsertionPointAfter(newWarpOp);
+  auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+      newWarpOp.getLoc(), descOp.getType(),
+      dyn_cast<TypedValue<MemRefType>>(newWarpOp.getResult(newRetIndices[0])),
+      descOffsets);
+
+  Value distributedVal = newWarpOp.getResult(operandIdx);
+  rewriter.replaceAllUsesWith(distributedVal, newDescOp);
+  return success();
+}
+
+LogicalResult
+SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const {
+  OpOperand *operand = getWarpResult(subgroupOp, [](Operation *op) {
+    return isa<xegpu::DpasOp>(op) && op->hasOneUse();
+  });
+
+  if (!operand)
+    return rewriter.notifyMatchFailure(subgroupOp,
+                                       "warp result is not a xegpu::Dpas op");
+
+  auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
+  unsigned operandIdx = operand->getOperandNumber();
+  xegpu::SGMapAttr sgMapA =
+      mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_a"));
+  xegpu::SGMapAttr sgMapB =
+      mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_b"));
+  xegpu::SGMapAttr sgMapResult =
+      mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_out"));
+  if (!sgMapA || !sgMapB || !sgMapResult)
+    return rewriter.notifyMatchFailure(
+        dpasOp, "the xegpu::Dpas op lacks sg_map attribute for A, B or result");
+
+  auto distributedLhsTypeOrFailure =
+      getDistributedVectorType(sgMapA, dpasOp.getLhsType());
+  auto distributedRhsTypeOrFailure =
+      getDistributedVectorType(sgMapB, dpasOp.getRhsType());
+  auto distributedResultTypeOrFailure =
+      getDistributedVectorType(sgMapResult, dpasOp.getResultType());
+  if (failed(distributedLhsTypeOrFailure) ||
+      failed(distributedRhsTypeOrFailure) ||
+      failed(distributedResultTypeOrFailure))
+    return rewriter.notifyMatchFailure(
+        dpasOp,
+        "Failed to distribute the A, B or result types in xegpu::Dpas op");
+
+  llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(), dpasOp.getRhs()};
+  llvm::SmallVector<Type, 3> newYieldTypes{distributedLhsTypeOrFailure.value(),
+                                           distributedRhsTypeOrFailure.value()};
+  // Dpas acc operand is optional.
+  if (dpasOp.getAcc()) {
+    newYieldValues.push_back(dpasOp.getAcc());
+    newYieldTypes.push_back(distributedResultTypeOrFailure.value());
+  }
+  // Create a new warp op without the dpas.
+  SmallVector<size_t> newRetIndices;
+  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+      rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
+
+  // Create a new dpas op outside the warp op.
+  rewriter.setInsertionPointAfter(newWarpOp);
+  auto newDpasOp = cast<xegpu::DpasOp>(*dpasOp.clone());
+  newDpasOp.getLhsMutable().assign(newWarpOp.getResult(newRetIndices[0]));
+  newDpasOp.getRhsMutable().assign(newWarpOp.getResult(newRetIndices[1]));
+  if (dpasOp.getAcc())
+    newDpasOp.getAccMutable().assign(newWarpOp.getResult(newRetIndices[2]));
+  newDpasOp->getOpResult(0).setType(distributedResultTypeOrFailure.value());
+  Value disributedVal = newWarpOp.getResult(operandIdx);
+  rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
+
+  return success();
+}
+
 namespace {
 struct XeGPUSubgroupDistributePass final
     : public xegpu::impl::XeGPUSubgroupDistributeBase<
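
For reference, the new distribution patterns (SubgroupOpTensorDescOp, SubgroupOpStoreNd, SubgroupOpLoadNd, SubgroupOpDpas) would be driven the same way the MoveFuncBodyToWarpExecuteOnLane0 pattern is applied in XeGPUSubgroupDistributePass::runOnOperation. A minimal sketch of that wiring, assuming the patterns are registered directly in the pass (a dedicated populate function may be used instead in later revisions of this PR):

// Sketch only: register the SIMT distribution patterns and run them greedily,
// mirroring how MoveFuncBodyToWarpExecuteOnLane0 is applied above.
{
  RewritePatternSet patterns(&getContext());
  patterns.add<SubgroupOpTensorDescOp, SubgroupOpStoreNd, SubgroupOpLoadNd,
               SubgroupOpDpas>(&getContext());
  (void)applyPatternsGreedily(getOperation(), std::move(patterns));
}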

>From 4838b524a635e566175aa087440283b909555402 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 16:54:15 +0000
Subject: [PATCH 07/45] extend sg_map from subgroup to workgroup

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  76 ++++--
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |  63 +++--
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |  16 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 126 ++++------
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 157 ++++++++----
 mlir/test/Dialect/XeGPU/invalid.mlir          | 110 +++++----
 mlir/test/Dialect/XeGPU/ops.mlir              | 230 ++++++++++--------
 7 files changed, 457 insertions(+), 321 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 0136b18ccfa94..7adb9df3c6b25 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -154,33 +154,81 @@ def XeGPU_FenceScopeAttr:
     let assemblyFormat = "$value";
 }
 
-def XeGPU_SGMapAttr : XeGPUAttr<"SGMap", "sg_map"> {
+def XeGPU_ScopeWG:   I32EnumAttrCase<"WG", 0, "wg">;      // workgroup level code
+def XeGPU_ScopeSG:   I32EnumAttrCase<"SG", 1, "sg">;      // subgroup level code
+def XeGPU_ScopeWI:   I32EnumAttrCase<"WI", 2, "wi">;      // simt level code
+
+def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumerate of scope",
+  [XeGPU_ScopeWG,XeGPU_ScopeSG,XeGPU_ScopeWI]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_ScopeAttr
+  : EnumAttr<XeGPU_Dialect,XeGPU_ScopeEnums, "Stage"> {
+    let summary = [{Describe the stage of lowering progress}];
+    let assemblyFormat = "``$value";
+}
+
+def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
   let summary = [{
     Describes the mapping between work item (WI) and the 2D tensor specified by the tensor descriptor.
   }];
   let description = [{
-    To distribute the XeGPU operation to work items, the tensor_desc must be specified with the sg_map
-    attribute at the tensor description creation time.
-    Within the `sg_map`, `wi_layout` specifies the layout of work items,
-    describing the mapping of work items to the tensor.
-    wi_layout[0] x wi_layout[1] must be equal to the total number of work items within a subgroup.
-    `wi_data` specifies the minimum number of data elements assigned to each work item for a single distribution.
-
-    E.g., #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
-    In this example, the subgroup has 16 work items in wi_layout=[1, 16],
-    each accessing 1 element as specified by wi_data=[1, 1].
+    XeGPU operations leverage LayoutAttr to distribute data across work-items. It is specified on tensor_descs
+    at tensor descriptor creation time. LayoutAttr contains the following parameters.
+
+    * scope: specifies the scope of the current code. It can be either wg (workgroup), sg (subgroup) or wi (workitem).
+             It is strictly required for subgroup code, but optional for workgroup and wi code. By default, if a LayoutAttr
+             contains sg_layout and sg_data, it is treated as workgroup code; if it only contains
+             wi_layout and wi_data, it is considered workitem-level code.
+    * sg_layout: [optional] specifies the total number of subgroups and their layout in a workgroup.
+    * sg_data: [optional] specifies the data size accessed per subgroup.
+    * order: [optional] specifies the dimension order used to linearize n-d subgroup ids to 1-d.
+            The first dimension in the order list is the fastest-changing dimension.
+    * wi_layout: [required] specifies the total number of work-items and their layout in a subgroup
+    * wi_data: [required] specifies the data size accessed per work-item for a single distribution.
 
     `wi_data[0] * wi_data[1]` can be greater than 1, meaning that each work item operates on multiple elements,
     which is eventually lowered to "SIMT-flavor" vector, like SPIR-V vector or llvm vector, or packed to a storage data type.
     The multiple elements indicated by `wi_data` can only be from one dimension and must be contiguous in the memory along either dimension.
+
+    E.g., #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
+    In this example, the subgroup has 16 work items in wi_layout=[1, 16], each accessing 1 element as specified by wi_data=[1, 1].
+
+    E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>
+    In this example, the layout represents a workgroup-level work distribution. The workgroup has 8 subgroups organized as a 2x4 layout,
+    and each subgroup accesses a 16x16 block per instruction, which is further distributed to 16 work items as described above.
+
   }];
   let parameters = (ins
-    ArrayRefParameter<"uint32_t">:$wi_layout,
-    ArrayRefParameter<"uint32_t">:$wi_data
+    OptionalParameter<"ScopeAttr">: $scope,
+    OptionalParameter<"DenseI32ArrayAttr">: $sg_layout,
+    OptionalParameter<"DenseI32ArrayAttr">: $sg_data,
+    OptionalParameter<"DenseI32ArrayAttr">: $order,
+    "DenseI32ArrayAttr": $wi_layout,
+    "DenseI32ArrayAttr": $wi_data
   );
 
+  let extraClassDeclaration = [{
+    bool isForWorkgroupLevel() {
+      if (!getScope())
+        return getSgLayout() && getSgData();
+      return getScope() == ScopeAttr::get(getContext(), Scope::WG);
+    }
+
+    bool isForSubgroupLevel() {
+      return getScope() == ScopeAttr::get(getContext(), Scope::SG);
+    }
+
+    bool isForWorkItemLevel() {
+      if (!getScope())
+        return !getSgLayout() && !getSgData() && !getOrder();
+      return getScope() == ScopeAttr::get(getContext(), Scope::WI);
+    }
+  }];
 
-  let hasCustomAssemblyFormat = 1;
+  let assemblyFormat = "`<` struct(params) `>`";
   let genVerifyDecl = 1;
 }
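To make the classification rules above concrete, here is an illustrative (hypothetical, not part of the patch) set of layouts and the level each would be treated as, using the syntax from the updated tests:

```
// Work-item level: explicit scope, or only wi_layout/wi_data present.
#xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>
#xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>

// Workgroup level: sg_layout and sg_data present; scope may be omitted.
#xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>

// Subgroup level: must be requested explicitly through scope.
#xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>
```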
 
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 56b836d707a7d..6b27ae3b2754c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -80,7 +80,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
         information e.g., memref<?x?xf16>, the strides information has to be explicitly
         passed via the "strides" and "const_strides" argument.
 
-    In SIMT mode, tensor descriptor is augmented with `SGMapAttr` which describes the
+    In SIMT mode, tensor descriptor is augmented with `LayoutAttr` which describes the
     mapping of the tensor descriptor to the work items.
 
     Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
@@ -113,7 +113,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 8 : index
     %1 = xegpu.create_nd_tdesc %0[%c0, %c0] : memref<1024x1024xf32>
-          -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+          -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
     ```
   }];
 
@@ -306,7 +306,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
     fp32 or fp64. It implies that vnni and transpose cannot exit at the
     same time.
 
-    In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
+    In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
     which describes the mapping of the tensor to the work items. In this case, result
     vector represents the data to be loaded by each work-item.
 
@@ -323,7 +323,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
       xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint<cached>,
                         l2_hint = #xegpu.cache_hint<uncached>}>
         : !xegpu.tensor_desc<8x16xf32,
-          #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+          #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
     ```
 
 
@@ -364,7 +364,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
     of cache, L1, L2 and L3. If hardware does not have a correspoding cache,
     Corresponding cache hint attribute will be masked.
 
-    In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
+    In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
     which describes the mapping of the tensor to the work items. In this case, input
     vector represents the data to be stored by each work-item.
 
@@ -381,7 +381,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
                              l2_hint = #xegpu.cache_hint<write_back>,
                              l3_hint = #xegpu.cache_hint<write_through>}
                              : vector<8x1xf16>, !xegpu.tensor_desc<8x16xf16,
-                               #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+                               #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
     ```
 
 
@@ -422,7 +422,7 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
   Example 2 (SIMT mode):
   ```
     %2 = xegpu.update_nd_offset %1, [0, 16]:
-      !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+      !xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
   ```
   }];
 
@@ -482,7 +482,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
     the chunk_size if the chunk size is larger than 1.
 
     In SIMT mode, similar to `create_nd_tdesc` the resulting tensor descriptor is augmented
-    with `SGMapAttr` which describes the mapping of the tensor descriptor to the work items.
+    with `LayoutAttr` which describes the mapping of the tensor descriptor to the work items.
     In this case, the first dimension of the tensor descriptor represents the work-items, and
     the second dimension represents the chunk size.
 
@@ -517,7 +517,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
     %off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
     %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
           -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>,
-          #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+          #xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>
     ```
   }];
 
@@ -571,7 +571,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
   let hasVerifier = 1;
 }
 
-def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [MemoryEffects<[MemRead]>]> {
   let summary = "prefetches a set of scattered data points to cache";
 
   let description = [{
@@ -623,7 +623,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
     The mask operand masks out memory access so that it is safe to pass out-of-boundary
     addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
 
-    In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `SGMapAttr`
+    In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `LayoutAttr`
     which describes the mapping of the tensor to the work items. In this case, result vector
     represents the data to be loaded by each work-item. Each work-item recieves a `chunk_size`
     number of elements.
@@ -653,7 +653,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
                             l2_hint = #xegpu.cache_hint<uncached>,
                             l3_hint = #xegpu.cache_hint<uncached>}
           : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>,
-            !xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
+            !xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>>
             vector<16xi1> -> vector<8x1xf32>
   ```
 
@@ -704,7 +704,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
   has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
   introduced on purpose, making sure users are aware of this implicit transformation.
 
-  In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `SGMapAttr`
+  In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `LayoutAttr`
   which describes the mapping of the tensor to the work items. In this case, input vector
   represents the data to be stored by each work-item. Each work-item recieves a `chunk_size`
   number of elements.
@@ -732,7 +732,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
                                  l2_hint = #xegpu.cache_hint<write_back>,
                                  l3_hint = #xegpu.cache_hint<write_through>}
           : vector<8x1xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>,
-            !xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> vector<16xi1>
+            !xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>> vector<16xi1>
   ```
 
   }];
@@ -790,7 +790,7 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
       %off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
       %2 = xegpu.update_offset %1, %off :
               !xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>,
-              #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+              #xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
     ```
   }];
 
@@ -840,9 +840,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
     factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
     can be represented as `B: vector<8x16x2xf16>`.
 
-    In SIMT mode, DpasOp expects attributes `sg_map_a`, `sg_map_b`, and `sg_map_c`
-    which descibes the data fragment owned by each work-item w.r.t. the tensor
-    descriptor these data are loaded from.
+    In SIMT mode, DpasOp expects layout attributes `a_layout`, `b_layout`, and `c_layout` (the latter only if acc is used),
+    which describe the data fragment owned by each work-item w.r.t. the tensor descriptor
+    the data is loaded from.
 
     Note: on PVC, the hardware can perform load with VNNI transformation when data
           element type is 16-bit or lower precision, taking 2 or 4 elements from
@@ -853,9 +853,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
     XeGPU_DpasOpType : $lhs,
     XeGPU_DpasOpType : $rhs,
     Optional<XeGPU_Vector2DType>: $acc,
-    OptionalAttr<XeGPU_SGMapAttr>:$sg_map_a,
-    OptionalAttr<XeGPU_SGMapAttr>:$sg_map_b,
-    OptionalAttr<XeGPU_SGMapAttr>:$sg_map_c);
+    OptionalAttr<XeGPU_LayoutAttr>:$a_layout,
+    OptionalAttr<XeGPU_LayoutAttr>:$b_layout,
+    OptionalAttr<XeGPU_LayoutAttr>:$c_layout);
   let results = (outs XeGPU_Vector2DType: $result);
 
   let extraClassDeclaration = [{
@@ -876,6 +876,10 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
     VectorType getResultType() {
       return getResult().getType();
     }
+
+    bool hasAcc() {
+      return getAcc() != nullptr;
+    }
   }];
 
   let assemblyFormat = [{
@@ -979,4 +983,21 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
   let extraClassDeclaration = extraBaseClassDeclaration;
 }
 
+def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
+    let summary = "Convert the sg layout of the input operand";
+    let description = [{
+        convert_layout remaps the distribution of data across work-items by updating the LayoutAttr from `srcMap` to `resMap`.
+    }];
+    let arguments = (ins XeGPU_Vector2DType: $source,
+                         XeGPU_LayoutAttr: $srcMap,
+                         XeGPU_LayoutAttr: $resMap
+                         );
+    let results = (outs XeGPU_Vector2DType: $result);
+    let assemblyFormat = [{
+        $source attr-dict `:` type($source)
+    }];
+
+    let hasVerifier = 1;
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
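As a sketch of the renamed DpasOp attributes in SIMT mode (a hypothetical example assembled from the 8x1/8x2/8x1 fragment shapes used in the updated invalid.mlir tests, not taken from the patch itself): each work item holds an 8x1 A fragment, an 8x2 packed B fragment, and an 8x1 result fragment, so the expanded shapes become 8x16 (A), 16x16 (B), and 8x16 (C), satisfying the M/K/N checks.

```
%d = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
                        b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
                        c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
       : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
```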
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index ccd91a928e1dd..c92ea42efde3b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -63,7 +63,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     element-type ::= float-type | integer-type | index-type
     dim-list := (static-dim-list `x`)?
     static-dim-list ::= decimal-literal `x` decimal-literal
-    attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? (, sg_map `<` wi_layout = value, wi_data = value `>`)?
+    attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? (, layout `<` wi_layout = value, wi_data = value `>`)?
     ```
 
     Examples:
@@ -78,15 +78,15 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
     xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
 
-    // A TensorDesc with a sg_map
-    xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+    // A TensorDesc with a layout
+    xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
     ```
   }];
 
   let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
                         "mlir::Type": $elementType,
                         OptionalParameter<"mlir::Attribute">: $encoding,
-                        OptionalParameter<"mlir::Attribute">: $sg_map);
+                        OptionalParameter<"mlir::Attribute">: $layout);
 
   let builders = [
     TypeBuilderWithInferredContext<(ins
@@ -95,13 +95,13 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
       CArg<"int", "1">: $array_length,
       CArg<"bool", "true">: $boundary_check,
       CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
-      CArg<"mlir::Attribute", "mlir::Attribute()">:$sg_map)>,
+      CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)>,
     TypeBuilderWithInferredContext<(ins
       "llvm::ArrayRef<int64_t>": $shape,
       "mlir::Type": $elementType,
       CArg<"int", "1">: $chunk_size,
       CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
-      CArg<"mlir::Attribute", "mlir::Attribute()">:$sg_map)>
+      CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)>
   ];
 
   let extraClassDeclaration = [{
@@ -127,8 +127,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
       return llvm::dyn_cast_if_present<ScatterTensorDescAttr>(getEncoding());
     }
 
-    SGMapAttr getSGMapAttr() const {
-      return llvm::dyn_cast_if_present<SGMapAttr>(getSgMap());
+    LayoutAttr getLayoutAttr() const {
+      return llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
     }
 
     xegpu::MemorySpace getMemorySpace() const {
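On the type itself, the renamed parameter looks as follows; the first descriptor mirrors the updated examples, while the second is a hypothetical workgroup-level descriptor (a 32x64 block split over a 2x4 subgroup grid, 16x16 per subgroup) that the tests in this patch do not exercise:

```
// Work-item (SIMT) view of an 8x16 block.
!xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>

// Hypothetical workgroup-level descriptor; sg_layout/sg_data imply wg scope.
!xegpu.tensor_desc<32x64xf32,
  #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
```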
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 78c242571935c..52b9f2c192b3f 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -68,73 +68,39 @@ LogicalResult ScatterTensorDescAttr::verify(
 }
 
 //===----------------------------------------------------------------------===//
-// XeGPU_SGMapAttr
+// XeGPU_LayoutAttr
 //===----------------------------------------------------------------------===//
-namespace {
-template <typename T, unsigned N>
-LogicalResult parseIntArrayField(::mlir::AsmParser &parser,
-                                 llvm::SmallVector<T, N> &result,
-                                 llvm::StringRef fieldName) {
-  if (failed(parser.parseKeyword(fieldName))) {
-    parser.emitError(parser.getCurrentLocation(),
-                     "unexpected field name. Expected " + fieldName + ".");
-    return failure();
+LogicalResult
+LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
+                  ScopeAttr scope,
+                  DenseI32ArrayAttr sg_layout,
+                  DenseI32ArrayAttr sg_data,
+                  DenseI32ArrayAttr order,
+                  DenseI32ArrayAttr wi_layout,
+                  DenseI32ArrayAttr wi_data) {
+
+  if (scope && scope.getValue() != Scope::WG && (sg_layout || sg_data || order)) {
+    return emitError() << "expected sg_layout, sg_data, and order being only used at workgroup level.";
   }
 
-  if (failed(parser.parseEqual())) {
-    parser.emitError(parser.getCurrentLocation(), "expected '=' sign.");
-    return failure();
+  if ((sg_layout != nullptr) ^ (sg_data != nullptr)) {
+    return emitError() << "expected sg_layout and sg_data being both present or both absent";
   }
 
-  auto elemParser = [&]() -> llvm::ParseResult {
-    uint32_t elem = 0;
-    auto res = parser.parseInteger(elem);
-    result.push_back(elem);
-    return res;
-  };
-
-  return parser.parseCommaSeparatedList(AsmParser::Delimiter::Square,
-                                        elemParser, fieldName);
-}
-} // namespace
-
-mlir::Attribute SGMapAttr::parse(::mlir::AsmParser &parser,
-                                 ::mlir::Type attrType) {
-  if (failed(parser.parseLess()))
-    return {};
-
-  llvm::SmallVector<uint32_t, 2> wi_layout, wi_data;
-  if (failed(parseIntArrayField(parser, wi_layout, "wi_layout")))
-    return {};
-
-  if (failed(parser.parseComma()))
-    return {};
-
-  if (failed(parseIntArrayField(parser, wi_data, "wi_data")))
-    return {};
+  if (order) {
+    if (!sg_layout)
+      return emitError() << "expected order being used with sg_layout and sg_data.";
+    if (order.size() != sg_layout.size())
+      return emitError() << "expected order having the same rank as sg_layout and sg_data";
+  }
 
-  return SGMapAttr::getChecked(
-      [&]() { return parser.emitError(parser.getNameLoc()); },
-      parser.getContext(), wi_layout, wi_data);
-}
+  if (sg_layout && (sg_layout.size() != sg_data.size() || sg_layout.size() > 2)) {
+    return emitError() << "expected sg_layout and sg_data having the same rank, which is not larger than 2";
+  }
 
-void SGMapAttr::print(::mlir::AsmPrinter &printer) const {
-  printer << "<";
-  printer.printKeywordOrString("wi_layout");
-  printer << " = [" << getWiLayout() << "], ";
-  printer.printKeywordOrString("wi_data");
-  printer << " = [" << getWiData() << "]";
-  printer << ">";
-}
+  if (wi_layout.size() != wi_data.size() || wi_layout.size() > 2)
+    return emitError() << "expected wi_layout and wi_data having the same rank, which is not larger than 2";
 
-LogicalResult
-SGMapAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
-                  llvm::ArrayRef<uint32_t> wi_layout,
-                  llvm::ArrayRef<uint32_t> wi_data) {
-  if (wi_layout.size() != 2)
-    return emitError() << "expected wi_layout of size 2";
-  if (wi_data.size() != 2)
-    return emitError() << "expected wi_data of size 2";
   return success();
 }
 
@@ -146,7 +112,7 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
   llvm::SmallVector<int64_t> shape;
   mlir::Type elementType;
   mlir::FailureOr<mlir::Attribute> encoding;
-  mlir::FailureOr<mlir::Attribute> sg_map;
+  mlir::FailureOr<mlir::Attribute> layout;
 
   // Parse literal '<'
   if (parser.parseLess())
@@ -169,8 +135,8 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
     mlir::Attribute attr;
     ParseResult res = parser.parseAttribute(attr);
     if (mlir::succeeded(res)) {
-      if (mlir::isa<SGMapAttr>(attr)) {
-        sg_map = attr;
+      if (mlir::isa<LayoutAttr>(attr)) {
+        layout = attr;
         continue;
       }
       if (mlir::isa<BlockTensorDescAttr, ScatterTensorDescAttr>(attr)) {
@@ -188,7 +154,7 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
   return TensorDescType::getChecked(
       [&]() { return parser.emitError(parser.getNameLoc()); },
       parser.getContext(), shape, elementType,
-      encoding.value_or(mlir::Attribute()), sg_map.value_or(mlir::Attribute()));
+      encoding.value_or(mlir::Attribute()), layout.value_or(mlir::Attribute()));
 }
 
 void TensorDescType::print(::mlir::AsmPrinter &printer) const {
@@ -208,8 +174,8 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const {
   if (auto encoding = getEncoding())
     printer << ", " << encoding;
 
-  if (auto sg_map = getSgMap())
-    printer << ", " << sg_map;
+  if (auto layout = getLayout())
+    printer << ", " << layout;
 
   printer << ">";
 }
@@ -218,29 +184,29 @@ TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
                                    mlir::Type elementType, int array_length,
                                    bool boundary_check,
                                    MemorySpace memory_space,
-                                   mlir::Attribute sg_map) {
+                                   mlir::Attribute layout) {
   auto context = elementType.getContext();
   auto attr = BlockTensorDescAttr::get(context, memory_space, array_length,
                                        boundary_check);
-  return Base::get(context, shape, elementType, attr, sg_map);
+  return Base::get(context, shape, elementType, attr, layout);
 }
 
 TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
                                    mlir::Type elementType, int chunk_size,
                                    MemorySpace memory_space,
-                                   mlir::Attribute sg_map) {
+                                   mlir::Attribute layout) {
   auto context = elementType.getContext();
   auto attr = ScatterTensorDescAttr::get(context, memory_space, chunk_size);
-  return Base::get(context, shape, elementType, attr, sg_map);
+  return Base::get(context, shape, elementType, attr, layout);
 }
 
 LogicalResult TensorDescType::verify(
     llvm::function_ref<::mlir::InFlightDiagnostic()> emitError,
     llvm::ArrayRef<int64_t> shape, mlir::Type elementType,
-    mlir::Attribute encoding, mlir::Attribute sg_map) {
+    mlir::Attribute encoding, mlir::Attribute layout) {
   size_t rank = shape.size();
   // Low-pressure types are packed in 32-bit units.
-  unsigned packingFactor = 32 / elementType.getIntOrFloatBitWidth();
+  int32_t packingFactor = 32 / elementType.getIntOrFloatBitWidth();
   if (rank != 1 && rank != 2)
     return emitError() << "expected 1D or 2D tensor";
 
@@ -274,9 +240,9 @@ LogicalResult TensorDescType::verify(
       return emitError() << "SLM is not supported for 2D block tensor";
   }
 
-  if (auto sgMapAttr = llvm::dyn_cast_if_present<SGMapAttr>(sg_map)) {
-    ArrayRef<uint32_t> wiLayout = sgMapAttr.getWiLayout();
-    ArrayRef<uint32_t> wiData = sgMapAttr.getWiData();
+  if (auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout)) {
+    ArrayRef<int32_t> wiLayout = layoutAttr.getWiLayout().asArrayRef();
+    ArrayRef<int32_t> wiData = layoutAttr.getWiData().asArrayRef();
 
     if (rank == 1) {
       if (wiLayout[0] != 1 || wiData[0] != 1)
@@ -318,7 +284,7 @@ LogicalResult TensorDescType::verify(
   return success();
 }
 
-// If tensor descriptor has a sg_map attribute it is used in SIMT mode.
+// If tensor descriptor has a layout attribute it is used in SIMT mode.
 // In this mode, the distributed vector shape is determined as follows:
 // Definitions:
 //        wi_data_size = wi_data[0] × wi_data[1]
@@ -343,13 +309,13 @@ LogicalResult TensorDescType::verify(
 // Distributed vector shape must be:
 //        [n_distribution_units, wi_data_size]
 FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
-  auto sgMap = llvm::dyn_cast_if_present<SGMapAttr>(getSgMap());
-  // If no sg_map is provided, tensor desc is not used in SIMT mode.
-  if (!sgMap)
+  auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
+  // If no layout is provided, tensor desc is not used in SIMT mode.
+  if (!layout || !layout.isForWorkItemLevel())
     return failure();
 
-  SmallVector<int64_t> wiData(sgMap.getWiData());
-  SmallVector<int64_t> wiLayout(sgMap.getWiLayout());
+  SmallVector<int64_t> wiData(layout.getWiData().asArrayRef());
+  SmallVector<int64_t> wiLayout(layout.getWiLayout().asArrayRef());
   auto tdescShape = getShape();
 
   auto wiDataSize = 1, sgSize = 1;
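A worked instance of the distributed-shape rule that is now gated on isForWorkItemLevel, using the 8x16 f32 descriptor from the existing examples: wi_data_size = 1 * 1 = 1, the subgroup has 1 * 16 = 16 lanes, and the 8 * 16 = 128 elements yield 128 / (16 * 1) = 8 distribution units, so the distributed vector shape is [8, 1]. This matches the vector<8x1xf32> results and the "[8, 1]" shapes reported in the error tests. Assuming a previously created descriptor %td, a load would look like:

```
%v = xegpu.load_nd %td
       : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
       -> vector<8x1xf32>
```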
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 3bdf3fb218b45..c7e863256f235 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -78,18 +78,18 @@ static LogicalResult
 isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
                  ArrayRef<int64_t> adjustedTdescShape,
                  function_ref<InFlightDiagnostic()> emitError) {
-  auto sgMap = tdescTy.getSGMapAttr();
+  auto layout = tdescTy.getLayoutAttr();
   auto valueShape = valueTy.getShape();
-  // sg_map not present means IR is in SIMD mode. In this case value shape must
+  // layout not present means IR is in SIMD mode. In this case value shape must
   // match adjusted tensor descriptor shape.
-  if (!sgMap)
+  if (!layout || !layout.isForWorkItemLevel())
     return valueShape == adjustedTdescShape
                ? success()
                : emitError()
                      << "Value shape " << makeString(valueShape)
                      << " is not consistent with tensor descriptor " << tdescTy;
 
-  // sg_map present means IR is in SIMT mode. In this case sg_map determines the
+  // layout present means IR is in SIMT mode. In this case layout determines the
   // value shape.
   auto expectedValueShapeOrFailure = tdescTy.getDistributedVectorType();
   assert(succeeded(expectedValueShapeOrFailure) &&
@@ -105,6 +105,25 @@ isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
                    << " for tensor descriptor " << tdescTy;
 }
 
+static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
+                              xegpu::LayoutAttr attr) {
+  assert(attr && "workgroup map attribute is missing.");
+  llvm::ArrayRef<int32_t> layout, data;
+  if (attr.getSgLayout()) {
+    data = attr.getSgData().asArrayRef();
+    layout = attr.getSgLayout().asArrayRef();
+  } else {
+    data = attr.getWiData().asArrayRef();
+    layout = attr.getWiLayout().asArrayRef();
+  }
+  for (auto [s, d, l] : llvm::zip_equal(shape, data, layout)) {
+    // check s % (d * l) != 0
+    if (s % d != 0 || (s / d) % l != 0)
+      return false;
+  }
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_CreateNdDescOp
 //===----------------------------------------------------------------------===//
@@ -541,7 +560,7 @@ LogicalResult StoreScatterOp::verify() {
                           [&]() { return emitOpError(); });
 }
 
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
 // XeGPU_UpdateOffsetOp
 //===----------------------------------------------------------------------===//
 void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
@@ -569,61 +588,107 @@ void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
 LogicalResult DpasOp::verify() {
   int64_t lhsRank = getLhsType().getRank();
   int64_t rhsRank = getRhsType().getRank();
-  int64_t resultRank = getResultType().getRank();
+  int64_t resRank = getResultType().getRank();
   auto lhsShape = getLhsType().getShape();
   auto rhsShape = getRhsType().getShape();
-  auto resultShape = getResultType().getShape();
-
-  auto sgMapA = getSgMapAAttr();
-  auto sgMapB = getSgMapBAttr();
-  auto sgMapC = getSgMapCAttr();
+  auto resShape = getResultType().getShape();
+
+  auto layoutA = getALayoutAttr();
+  auto layoutB = getBLayoutAttr();
+  auto layoutC = getCLayoutAttr();
+
+  // make sure the layout attribute is either set for every available
+  // operand or simply not set at all. C is special, since ACC is optional.
+  // If they are all set, they also should be in the same scope.
+  auto isValidSet = [&]() {
+    bool result = (layoutA != nullptr) ^ (layoutB != nullptr);
+    if (hasAcc()) {
+      result |= (layoutA != nullptr) ^ (layoutC != nullptr);
+    }
+    result = !result;
 
-  // If sg_maps are not present, then the operation is in SIMD mode.
-  if (!sgMapA && !sgMapB && !sgMapC) {
-    if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resultRank != 2)
+    if (layoutA) {
+      auto scope = layoutA.getScope();
+      result &= layoutB ? scope == layoutB.getScope() : false;
+      if (hasAcc())
+        result &= layoutC ? scope == layoutC.getScope() : false;
+    }
+    return result;
+  };
+
+  if (!isValidSet())
+    return emitOpError("layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code).");
+
+  // query the scope from layoutA (a valid setting).
+  if (layoutA && layoutA.isForWorkItemLevel()) {
+    // In SIMT mode, all data fragments must be 2D.
+    if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
+      return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
+
+    auto wiLayoutA = layoutA.getWiLayout();
+    auto wiLayoutB = layoutB.getWiLayout();
+    auto wiLayoutC = layoutC.getWiLayout();
+    // Obtain the expanded shapes of the operands and result using wi_layout.
+    // NOTE: For B, get rid of the packed dimension for the expanded shape.
+    SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
+                                           lhsShape[1] * wiLayoutA[1]};
+    SmallVector<int64_t> expandedShapeB = {
+        rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
+    SmallVector<int64_t> expandedShapeC = {resShape[0] * wiLayoutC[0],
+                                           resShape[1] * wiLayoutC[1]};
+    auto bK = expandedShapeB[0];
+    if (bK != expandedShapeA[1])
+      return emitOpError("K-dimension mismatch.");
+    if (expandedShapeA[0] != expandedShapeC[0])
+      return emitOpError("M-dimension mismatch.");
+    if (expandedShapeB[1] != expandedShapeC[1])
+      return emitOpError("N-dimension mismatch.");
+  } else { // For other scopes, the operand shapes should follow MxKxN semantics.
+    if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2)
       return emitOpError(
           "expecting lhs and result to be a 2D vector, and rhs to be either "
           "2D or 3D (packed) vector.");
     auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
     if (bK != lhsShape[1])
       return emitOpError("K-dimension mismatch.");
-    if (lhsShape[0] != resultShape[0])
+    if (lhsShape[0] != resShape[0])
       return emitOpError("M-dimension mismatch.");
-    if (rhsShape[1] != resultShape[1])
+    if (rhsShape[1] != resShape[1])
       return emitOpError("N-dimension mismatch.");
-    return success();
   }
-  // Otherwise, in SIMT mode we expect sg_map attributes for all operands and
-  // result of DPAS operation.
-  if (!sgMapA || !sgMapB || !sgMapC)
-    return emitOpError("sg_map attributes for all operands and outputs are "
-                       "expected in SIMT xegpu::Dpas operation");
-
-  // In SIMT mode, All data fragments must be 2D
-  if (lhsRank != 2 || rhsRank != 2 || resultRank != 2)
-    return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
-
-  auto wiLayoutA = sgMapA.getWiLayout();
-  auto wiLayoutB = sgMapB.getWiLayout();
-  auto wiLayoutC = sgMapC.getWiLayout();
-  // Obtain the expanded shapes of the operands and result using wi_layout.
-  // NOTE: For B, get rid of the packed dimension for the expanded shape.
-  SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
-                                         lhsShape[1] * wiLayoutA[1]};
-  SmallVector<int64_t> expandedShapeB = {
-      rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
-  SmallVector<int64_t> expandedShapeC = {resultShape[0] * wiLayoutC[0],
-                                         resultShape[1] * wiLayoutC[1]};
-  auto bK = expandedShapeB[0];
-  if (bK != expandedShapeA[1])
-    return emitOpError("K-dimension mismatch.");
-  if (expandedShapeA[0] != expandedShapeC[0])
-    return emitOpError("M-dimension mismatch.");
-  if (expandedShapeB[1] != expandedShapeC[1])
-    return emitOpError("N-dimension mismatch.");
-
   return success();
 }
+
+//===----------------------------------------------------------------------===//
+// XeGPU_ConvertLayoutOp
+//===----------------------------------------------------------------------===//
+LogicalResult ConvertLayoutOp::verify() {
+  auto srcMap = getSrcMapAttr();
+  auto resMap = getResMapAttr();
+  if (!srcMap)
+    return emitOpError("expected srcMap.");
+  if (!resMap)
+    return emitOpError("expected resMap.");
+
+  if (srcMap.getScope() != resMap.getScope())
+    return emitOpError("expected srcMap and resMap be in the same scope.");
+
+  if (srcMap == resMap)
+    return emitOpError("expected different srcMap and resMap.");
+
+  if (srcMap.isForWorkItemLevel())
+    return emitOpError("doesn't work on SIMT code.");
+
+  auto shape = getSource().getType().getShape();
+  if (!isEvenDistributed(shape, srcMap))
+    return emitOpError("invalid srcMap, data cannot be evenly distributed.");
+
+  if (!isEvenDistributed(shape, resMap))
+    return emitOpError("invalid resMap, data cannot be evenly distributed.");
+
+  return mlir::success();
+}
+
 } // namespace xegpu
 } // namespace mlir
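To complement the negative convert_layout tests below, a hypothetical case the new verifier would accept: both maps are workgroup level (same scope), they differ, neither is work-item level, and a 32x64 vector distributes evenly under both (source map: 32 % 16 = 0 and (32/16) % 2 = 0, 64 % 16 = 0 and (64/16) % 4 = 0; result map: 32 % 8 = 0 and (32/8) % 4 = 0, 64 % 32 = 0 and (64/32) % 2 = 0).

```
%r = xegpu.convert_layout %a
       {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
        resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], wi_layout = [1, 16], wi_data = [1, 1]>}
       : vector<32x64xf16>
```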
 
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 88e9bbf78945b..c4958d920a89f 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -78,25 +78,25 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
 }
 
 // -----
-func.func @test_load_nd_sg_map(%src: memref<24x32xf32>) {
+func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   // expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1]}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
       l2_hint = #xegpu.cache_hint<uncached>}>
-    : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+    : !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
     -> vector<8x2xf32>
   return
 }
 
 // -----
-func.func @test_load_nd_sg_map(%src: memref<24x32xf32>) {
+func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   // expected-error at +1 {{Result shape [8] is not consistent with distributed vector shape [1, 1]}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
       l2_hint = #xegpu.cache_hint<uncached>}>
-    : !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+    : !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
     -> vector<8xf32>
   return
 }
@@ -134,22 +134,22 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
 }
 
 // -----
-func.func @test_store_nd_sg_map(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
+func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
   %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   // expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1] for tensor descriptor}}
   xegpu.store_nd %data, %1
-    : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+    : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   return
 }
 
 // -----
-func.func @test_store_nd_sg_map(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
+func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
   %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   // expected-error at +1 {{Result shape [2] is not consistent with distributed vector shape [1, 1] for tensor descriptor}}
   xegpu.store_nd %data, %1
-    : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+    : vector<2xf32>, !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   return
 }
 
@@ -245,69 +245,69 @@ func.func @test_prefetch_vc_2(%src: ui64) {
 }
 
 // -----
-func.func @test_create_tdesc_sg_map_1(%src: ui64) {
+func.func @test_create_tdesc_layout_1(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>,   #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
   return
 }
 
 // -----
-func.func @test_create_tdesc_sg_map_2(%src: ui64) {
+func.func @test_create_tdesc_layout_2(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error at +1 {{cannot map over non-contiguous scattered row elements}}
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [2, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,   #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [2, 1]>>
   return
 }
 
 // -----
-func.func @test_create_tdesc_sg_map_3(%src: ui64) {
+func.func @test_create_tdesc_layout_3(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error at +1 {{work item data mapping must match the number of contiguous elements}}
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>,   #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
   return
 }
 
 // -----
-func.func @test_load_gather_sg_map_1(%src: ui64) {
+func.func @test_load_gather_layout_1(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,   #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
   // expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
+  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
   return
 }
 
 // -----
-func.func @test_load_gather_sg_map_2(%src: ui64) {
+func.func @test_load_gather_layout_2(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
   // expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
+  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
   return
 }
 
 
 // -----
-func.func @test_store_scatter_sg_map_1(%src: ui64) {
+func.func @test_store_scatter_layout_1(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %val = arith.constant dense<2.9>: vector<1x2xf32>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
   // expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
   return
 }
 
 // -----
-func.func @test_store_scatter_sg_map_2(%src: ui64) {
+func.func @test_store_scatter_layout_2(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %val = arith.constant dense<2.9>: vector<2xf32>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
   // expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
   return
 }
 
@@ -394,18 +394,18 @@ func.func @test_dpas_4(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
 }
 
 // -----
-func.func @test_dpas_sg_map_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
-  // expected-error at +1 {{sg_map attributes for all operands and outputs are expected in SIMT xegpu::Dpas operation}}
-  %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+func.func @test_dpas_layout_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
+  // expected-error at +1 {{layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code)}}
+  %1 = xegpu.dpas %a, %b {a_layout =  #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
   return
 }
 
 // -----
-func.func @test_dpas_sg_map_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
+func.func @test_dpas_layout_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
   // expected-error at +1 {{K-dimension mismatch}}
-  %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
-                          sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
-                          sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>}
+  %1 = xegpu.dpas %a, %b {a_layout =  #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
+                          b_layout =  #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
+                          c_layout =  #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
                           : vector<8x1xf16>, vector<4x2xf16> -> vector<8x1xf32>
   return
 }
@@ -439,7 +439,7 @@ func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) {
 func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
-      !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [2, 16], wi_data = [1, 1]>>
+      !xegpu.tensor_desc<16xf32,  #xegpu.layout<scope = wi, wi_layout = [2, 16], wi_data = [1, 1]>>
   return
 }
 
@@ -447,7 +447,7 @@ func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
 func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
-      !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+      !xegpu.tensor_desc<16xf32,  #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
   return
 }
 
@@ -455,7 +455,7 @@ func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
 func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error at +1 {{cannot distribute 8 over 16 work items with 1 elements each}}
-      !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   return
 }
 
@@ -463,7 +463,7 @@ func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
 func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error at +1 {{cannot distribute 4 over 8 work items with 1 elements each}}
-      !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [8, 2], wi_data = [1, 1]>>
+      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = wi, wi_layout = [8, 2], wi_data = [1, 1]>>
   return
 }
 
@@ -471,7 +471,7 @@ func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
 func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error at +1 {{cannot distribute 4 over 2 work items with 4 elements each}}
-      !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [4, 1]>>
+      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = wi, wi_layout = [2, 8], wi_data = [4, 1]>>
   return
 }
 
@@ -479,7 +479,7 @@ func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
 func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error at +1 {{cannot distribute 4 over 8 work items with 1 elements each}}
-      !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [8, 2], wi_data = [1, 2]>>
+      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = wi, wi_layout = [8, 2], wi_data = [1, 2]>>
   return
 }
 
@@ -490,7 +490,7 @@ func.func @tensor_desc_scatter_invalid_map_data(%src: ui64) {
       // expected-error at +1 {{cannot map over non-contiguous scattered row elements}}
       !xegpu.tensor_desc<4x2xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 2>,
-        #xegpu.sg_map<wi_layout = [1, 1], wi_data = [2, 1]>>
+         #xegpu.layout<scope = wi, wi_layout = [1, 1], wi_data = [2, 1]>>
   return
 }
 
@@ -500,7 +500,7 @@ func.func @tensor_desc_scatter_invalid_map_data_1(%src: ui64, %offsets: vector<1
       // expected-error at +1 {{work item data mapping must match the number of contiguous elements}}
       !xegpu.tensor_desc<16xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 1>,
-        #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 2]>>
+         #xegpu.layout<scope = wi, wi_layout = [1, 8], wi_data = [1, 2]>>
   return
 }
 
@@ -510,7 +510,7 @@ func.func @tensor_desc_scatter_invalid_chunk_size_1D(%src: ui64, %offsets: vecto
       // expected-error at +1 {{expected non-contiguous elements for 1D tensor}}
       !xegpu.tensor_desc<16xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 2>,
-        #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 2]>>
+         #xegpu.layout<scope = wi, wi_layout = [1, 8], wi_data = [1, 2]>>
   return
 }
 
@@ -520,6 +520,22 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
       // expected-error at +1 {{expected chunk blocks for 2D tensor}}
       !xegpu.tensor_desc<16x2xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 1>,
-        #xegpu.sg_map<wi_layout = [8, 1], wi_data = [1, 2]>>
+         #xegpu.layout<scope = wi, wi_layout = [8, 1], wi_data = [1, 2]>>
   return
 }
+
+// -----
+func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) {
+  // expected-error at +1 {{expected different srcMap and resMap}}
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>,
+                                resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+  return
+}
+
+// -----
+func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) {
+  // expected-error at +1 {{expected srcMap and resMap be in the same scope}}
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
+                                resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+  return
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index c32f1905454b6..6a29a73a20612 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -15,9 +15,9 @@ gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_1(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_create_nd_tdesc_simt_1(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -34,8 +34,8 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind
 gpu.func @test_create_nd_tdesc_simt_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
   //CHECK: %[[C:.*]] = arith.constant 1 : index
   %c1 = arith.constant 1 : index
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -48,8 +48,8 @@ gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_create_nd_tdesc_simt_3(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -62,8 +62,8 @@ gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
 gpu.func @test_create_nd_tdesc_simt_4(%src: memref<2x24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -76,8 +76,8 @@ gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
 gpu.func @test_create_nd_tdesc_simt_5(%src: memref<2x24x32xf32, 3>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -90,8 +90,8 @@ gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_6(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_create_nd_tdesc_simt_6(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -106,10 +106,10 @@ gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
 
 // CHECK: gpu.func @test_prefetch_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_prefetch_nd_simt(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -125,11 +125,11 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
 
 // CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) {
 gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
   %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
-       : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+       : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
   gpu.return
 }
 
@@ -144,10 +144,10 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
 
 // CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) {
 gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
   gpu.return
 }
 
@@ -162,11 +162,11 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
 
 // CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
   gpu.return
 }
 
@@ -181,11 +181,11 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
 
 // CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
-    !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
+    !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
   gpu.return
 }
 
@@ -200,11 +200,11 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
 
 // CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
+    !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
   gpu.return
 }
 
@@ -219,11 +219,11 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
 
 // CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
-    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
   gpu.return
 }
 
@@ -238,11 +238,11 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
 
 // CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
-    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
   gpu.return
 }
 
@@ -257,10 +257,10 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
 
 // CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
   gpu.return
 }
 
@@ -279,11 +279,11 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
 gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
    // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48x1xf16>
   %1 = arith.constant dense<1.0>: vector<48x1xf16>
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
-    !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -305,11 +305,11 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
 gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
    // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2x1xf16>
   %1 = arith.constant dense<1.0>: vector<2x1xf16>
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
-    !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -324,10 +324,10 @@ gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
 
 // CHECK: gpu.func @test_update_nd_tdesc_simt(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_update_nd_tdesc_simt(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-  %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -344,8 +344,8 @@ gpu.func @test_create_tdesc_vc(%src: ui64) {
 gpu.func @test_create_tdesc_simt(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -363,8 +363,8 @@ gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
 gpu.func @test_create_tdesc_simt_1(%src: memref<?xf32, 3>) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space =  slm, chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex>  -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space =  slm, chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex>  -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -383,7 +383,7 @@ gpu.func @test_create_tdesc_simt_2(%src: memref<?xf32>) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
-  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>  -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>  -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -401,8 +401,8 @@ gpu.func @test_create_tdesc_vc_3(%src: ui64) {
 gpu.func @test_create_tdesc_simt_3(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
   gpu.return
 }
 
@@ -425,10 +425,10 @@ gpu.func @test_load_simt(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
-  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
-  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
-  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
   gpu.return
 }
 
@@ -451,10 +451,10 @@ gpu.func @test_load_simt_2(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
-  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
-  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
-  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
   gpu.return
 }
 
@@ -477,10 +477,10 @@ gpu.func @test_load_simt_3(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
-  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
-  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
-  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
   gpu.return
 }
 
@@ -509,10 +509,10 @@ gpu.func @test_store_simt(%src: ui64) {
   %1 = arith.constant dense<1>: vector<4xi1>
   //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x1xf32>
   %2 = arith.constant dense<2.9>: vector<2x1xf32>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
-  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
-  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
-  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
   gpu.return
 }
 
@@ -541,10 +541,10 @@ gpu.func @test_store_simt_2(%src: ui64) {
   %1 = arith.constant dense<1>: vector<4xi1>
   //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<1x2xf16>
   %2 = arith.constant dense<2.9>: vector<1x2xf16>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
-  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
-  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
-  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
   gpu.return
 }
 
@@ -572,10 +572,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
   %1 = arith.constant dense<1>: vector<4xi1>
   //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
   %2 = arith.constant dense<2.9>: vector<1xf32>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
-  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
-  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
-  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
+  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
   gpu.return
 }
 
@@ -583,10 +583,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
 gpu.func @test_prefetch_simt(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
-  // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
-  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
   gpu.return
 }
 
@@ -605,13 +605,13 @@ gpu.func @test_prefetch_vc(%src: ui64) {
 // CHECK: gpu.func @test_create_update_tdesc_simt(%[[arg0:.*]]: ui64) {
 gpu.func @test_create_update_tdesc_simt(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
   //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
-  //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+  //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
   %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
-  %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+  %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
   gpu.return
 }
 
@@ -637,12 +637,12 @@ gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
 
 // CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8x1xf16>, %[[arg1:.*]]: vector<8x2xf16>)
 gpu.func @test_dpas_simt(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
-  // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
-  // CHECK: sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
-  // CHECK: sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
-  %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
-                          sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
-                          sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>}
+  // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
+  // CHECK: b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
+  // CHECK: c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+  %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
+                          b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
+                          c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
                           : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
   gpu.return
 }
@@ -704,4 +704,24 @@ gpu.func @fence() {
   gpu.return
 }
 
+// CHECK: gpu.func @test_create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], wi_layout = [1, 16], wi_data = [8, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], wi_layout = [1, 16], wi_data = [8, 1]>>
+  gpu.return
+}
+
+gpu.func @test_convert_layout(%a: vector<32x64xf16>) {
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [2, 1]>,
+                                resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+  gpu.return
+}
+
+gpu.func @test_convert_layout_wg(%a: vector<32x64xf16>) {
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
+                                resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+  gpu.return
+}
+
+
 }
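
For readers skimming the long test diff above, the mechanical change is the rename from sg_map to layout with an explicit work-item scope. A minimal sketch of the new spelling (illustrative only, mirroring the cases above, not an additional test):

```mlir
gpu.module @layout_rename_sketch {
  // Same 8x16 f32 descriptor as in the tests above, now carrying
  // #xegpu.layout with scope = wi instead of the old #xegpu.sg_map.
  gpu.func @example(%src: memref<24x32xf32>) {
    %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
      !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
    gpu.return
  }
}
```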

>From cb2697927bc75b00abd03a39ffb0698ba8b9e0a4 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 17:14:59 +0000
Subject: [PATCH 08/45] format code

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 35 ++++++++++++----------
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp     |  7 +++--
 2 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 52b9f2c192b3f..5e21bb805a6a5 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -72,34 +72,39 @@ LogicalResult ScatterTensorDescAttr::verify(
 //===----------------------------------------------------------------------===//
 LogicalResult
 LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
-                  ScopeAttr scope,
-                  DenseI32ArrayAttr sg_layout,
-                  DenseI32ArrayAttr sg_data,
-                  DenseI32ArrayAttr order,
-                  DenseI32ArrayAttr wi_layout,
-                  DenseI32ArrayAttr wi_data) {
-
-  if (scope && scope.getValue() != Scope::WG && (sg_layout || sg_data || order)) {
-    return emitError() << "expected sg_layout, sg_data, and order being only used at workgroup level.";
+                   ScopeAttr scope, DenseI32ArrayAttr sg_layout,
+                   DenseI32ArrayAttr sg_data, DenseI32ArrayAttr order,
+                   DenseI32ArrayAttr wi_layout, DenseI32ArrayAttr wi_data) {
+
+  if (scope && scope.getValue() != Scope::WG &&
+      (sg_layout || sg_data || order)) {
+    return emitError() << "expected sg_layout, sg_data, and order being only "
+                          "used at workgroup level.";
   }
 
   if ((sg_layout != nullptr) ^ (sg_data != nullptr)) {
-    return emitError() << "expected sg_layout and sg_data being both present or both absent";
+    return emitError() << "expected sg_layout and sg_data being both present "
+                          "or both absent";
   }
 
   if (order) {
     if (!sg_layout)
-      return emitError() << "expected order being used with sg_layout and sg_data.";
+      return emitError()
+             << "expected order being used with sg_layout and sg_data.";
     if (order.size() != sg_layout.size())
-      return emitError() << "expected order having the same rank as sg_layout and sg_data";
+      return emitError()
+             << "expected order having the same rank as sg_layout and sg_data";
   }
 
-  if (sg_layout && (sg_layout.size() != sg_data.size() || sg_layout.size() > 2)) {
-    return emitError() << "expected sg_layout and sg_data having the same rank, which is not larger than 2";
+  if (sg_layout &&
+      (sg_layout.size() != sg_data.size() || sg_layout.size() > 2)) {
+    return emitError() << "expected sg_layout and sg_data having the same "
+                          "rank, which is not larger than 2";
   }
 
   if (wi_layout.size() != wi_data.size() || wi_layout.size() > 2)
-    return emitError() << "expected wi_layout and wi_data having the same rank, which is not larger than 2";
+    return emitError() << "expected wi_layout and wi_data having the same "
+                          "rank, which is not larger than 2";
 
   return success();
 }
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index c7e863256f235..66b5054278c8c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -617,7 +617,9 @@ LogicalResult DpasOp::verify() {
   };
 
   if (!isValidSet())
-    return emitOpError("layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code).");
+    return emitOpError(
+        "layout attributes should be either set for all operands (for SIMT "
+        "code) or not set at all (for SIMD code).");
 
   // query the scope from layoutA (a valid setting).
   if (layoutA && layoutA.isForWorkItemLevel()) {
@@ -643,7 +645,8 @@ LogicalResult DpasOp::verify() {
       return emitOpError("M-dimension mismatch.");
     if (expandedShapeB[1] != expandedShapeC[1])
       return emitOpError("N-dimension mismatch.");
-  } else { // For other scopes, operands' shape should match the mxkxn semantics.
+  } else { // For other scopes, operands' shape should match the mxkxn
+           // semantics.
     if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2)
       return emitOpError(
           "expecting lhs and result to be a 2D vector, and rhs to be either "

>From 273fc408a1c63fe3d4100708cad190f01b6d2523 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 17:19:07 +0000
Subject: [PATCH 09/45] remove changes to prefetch op

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 6b27ae3b2754c..a3ee6e901a775 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -571,7 +571,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
   let hasVerifier = 1;
 }
 
-def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [MemoryEffects<[MemRead]>]> {
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
   let summary = "prefetches a set of scattered data points to cache";
 
   let description = [{

>From 504d2748efb1ad3d29a3187a5e692d58247a3bdd Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 18:06:52 +0000
Subject: [PATCH 10/45] refine the doc for TensorDesc

---
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       | 43 +++++++++++--------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index c92ea42efde3b..82d6a4ec39e6b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -34,27 +34,24 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
         [ShapedTypeInterface], "::mlir::TensorType"> {
   let summary = "TensorDesc describing regions of interested data.";
   let description = [{
-    TensorDesc is a type designed to describe regions of the interested data as well as some
-    features that are unique to Intel hardware. Different with the builtin tensor type in MLIR,
-    it essentially only contains the meta data, and doesn't hold the data by itself. It is designed
-    to mainly support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPU.
-    It encodes the following information:
+    TensorDesc is a type designed to describe regions of interest in data, as well as some features
+    unique to Intel hardware. Unlike the built-in tensor type in MLIR, it essentially contains only
+    metadata and does not hold the data itself. It is primarily designed to support 2D block load/store
+    and DPAS (matrix multiplication instruction) on Intel GPUs. It encodes the following information:
 
     * shape:  the sizes/shape of the intereted data block, e.g., 8x16 means 8 rows
               and each row contains 16 contiguous data element. The rows could be
-              either contiguous or not, depends on whether the encoding attribute
-              is set or not.
-    * element_type: the data type of the data element, e.g., f16, f32.
+              either contiguous or not, depending on the encoding attribute. If the
+              encoding is a BlockTensorDescAttr, rows are contiguous. If the encoding
+              is a ScatterTensorDescAttr, rows are not necessarily contiguous. If the
+              encoding is not set, it defaults to BlockTensorDescAttr.
 
-    Similar to the builtin tensor, it also provides an optinal attribute to encoding
-    the following information via the TensorDescAttr object:
-    * memory_space (xegpu::MemorySpace): [optional] where the data is located,
-                global memory or shared memory. It is default to Global.
-    * array_length (int): [optional] The number of contiguous blocks with size as `shape`,
-               that will be loaded by block load at a time. It is default to 1.
-    * boundary_check (bool): [optional] indicates whether the operation detects the boundary
-                and pads with zero for out-of-boundary access. It is default to do boundary check.
+    * element_type: the data type of the data element, e.g., f16, f32.
 
+    Similar to the built-in tensor, it also provides optional attributes for encoding
+    additional information via either BlockTensorDescAttr or ScatterTensorDescAttr, and
+    for supporting workgroup, subgroup, and workitem (or SIMT) level programming via the
+    Layout attribute. Please check their definitions for details.
 
     Syntax:
 
@@ -63,7 +60,9 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     element-type ::= float-type | integer-type | index-type
     dim-list := (static-dim-list `x`)?
     static-dim-list ::= decimal-literal `x` decimal-literal
-    attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? (, layout `<` wi_layout = value, wi_data = value `>`)?
+    attr-list = (, encoding-attr)? (, layout-attr)?
+    encoding-attr = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)?
+    layout-attr = (, layout `<` (scope = value,)? (sg_layout = value, sg_data = value, order = value)? wi_layout = value, wi_data = value `>`)?
     ```
 
     Examples:
@@ -78,8 +77,14 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
     xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
 
-    // A TensorDesc with a layout
-    xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
+    // A TensorDesc with a layout for workgroup level programming
+    xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
+
+    // A TensorDesc with a layout for subgroup level programming
+    xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>>
+
+    // A TensorDesc with a layout for workitem level programming
+    xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
     ```
   }];
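
As a small illustration of the default-encoding rule described above (an illustrative sketch, not part of the patch; the `#xegpu.tdesc_attr` mnemonic follows the existing slm example, and `global` is assumed to be the spelled-out default memory space):

```
// No encoding attribute: interpreted as the default block descriptor
// (contiguous rows, global memory).
xegpu.tensor_desc<8x16xf32>

// Equivalent to spelling the default memory space out explicitly.
xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = global>>
```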
 

>From 90e070493d85af6fbdd31bb78b0e12b2a726ce49 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 21 Mar 2025 19:41:57 +0000
Subject: [PATCH 11/45] save work

---
 .../Dialect/XeGPU/Transforms/Transforms.h     |  2 +
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 57 ++++++++++++-------
 2 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 63ea26df06937..3e94021c7a1ea 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -16,6 +16,8 @@ namespace xegpu {
 
 /// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`.
 void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
+/// Appends patterns for XeGPU SIMT distribution into `patterns`.
+void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns);
 
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index db8c321487a1c..925ba88a7a1db 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -753,7 +753,7 @@ namespace {
 ///   gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
 ///     ...
 ///     ...
-///     gpu.yield %result: vector<8x16xf32>
+///     gpu.return %result: vector<8x16xf32>
 ///   }
 /// ```
 /// To
@@ -1075,9 +1075,6 @@ SubgroupOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
         descOp, "expecting a memref typed value as the source");
 
   auto descOffsets = descOp.getMixedOffsets();
-  if (descOffsets.size() != 2)
-    return rewriter.notifyMatchFailure(descOp,
-                                       "offsets size is expected to be 2");
 
   xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
   if (!sgMap)
@@ -1085,16 +1082,26 @@ SubgroupOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
         descOp, "the tensor descriptor lacks sg_map attribute");
 
   SmallVector<size_t> newRetIndices;
+  SmallVector<Value> newYieldValues;
+  SmallVector<Type> newYieldTypes;
+
+  for (auto arg : descOp->getOperands()) {
+    newYieldValues.push_back(arg);
+    newYieldTypes.push_back(arg.getType());
+  }
   rewriter.setInsertionPoint(subgroupOp);
   gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-      rewriter, subgroupOp, /* new yieled values = */ descOp.getSource(),
-      /* new yielded types = */ descOp.getSourceType(), newRetIndices);
+      rewriter, subgroupOp, /* new yielded values = */ newYieldValues,
+      /* new yielded types = */ newYieldTypes, newRetIndices);
 
+  SmallVector<Value> newDescOperands;
+  for (auto i : newRetIndices) {
+    newDescOperands.push_back(newWarpOp.getResult(i));
+  }
   rewriter.setInsertionPointAfter(newWarpOp);
   auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
-      newWarpOp.getLoc(), descOp.getType(),
-      dyn_cast<TypedValue<MemRefType>>(newWarpOp.getResult(newRetIndices[0])),
-      descOffsets);
+      newWarpOp.getLoc(), descOp.getType(), newDescOperands,
+      descOp->getAttrs());
 
   Value distributedVal = newWarpOp.getResult(operandIdx);
   rewriter.replaceAllUsesWith(distributedVal, newDescOp);
@@ -1119,7 +1126,7 @@ SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
   xegpu::SGMapAttr sgMapB =
       mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_b"));
   xegpu::SGMapAttr sgMapResult =
-      mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_out"));
+      mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_c"));
   if (!sgMapA || !sgMapB || !sgMapResult)
     return rewriter.notifyMatchFailure(
         dpasOp, "the xegpu::Dpas op lacks sg_map attribute for A, B or result");
@@ -1177,6 +1184,12 @@ struct XeGPUSubgroupDistributePass final
 };
 } // namespace
 
+void xegpu::populateXeGPUSubgroupDistributePatterns(
+    RewritePatternSet &patterns) {
+  patterns.add<SubgroupOpTensorDescOp, SubgroupOpStoreNd, SubgroupOpLoadNd,
+               SubgroupOpDpas>(patterns.getContext());
+}
+
 void XeGPUSubgroupDistributePass::runOnOperation() {
   auto &analyis = getAnalysis<RunSGMapPropagation>();
   // Print the analysis result and exit. (for testing purposes)
@@ -1192,14 +1205,18 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     signalPassFailure();
   /// Move all operations inside a GPU function into
   /// gpu.warp_execute_on_lane0
-  {
-    RewritePatternSet patterns(&getContext());
-    patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
-    /// We want to avoid ops from hoisted out of the gpu.warp_execute_on_lane0
-    /// region.
-    GreedyRewriteConfig config;
-    config.cseConstants = false;
-    config.fold = false;
-    (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
-  }
+
+  RewritePatternSet patterns(&getContext());
+  patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+  /// We want to avoid ops being hoisted out of the gpu.warp_execute_on_lane0
+  /// region.
+  GreedyRewriteConfig config;
+  config.cseConstants = false;
+  config.fold = false;
+  (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
+
+  /// Finally, do the SIMD to SIMT distribution.
+  patterns.clear();
+  xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
+  (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
 }
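
To make the distribution flow above more concrete, here is a rough before/after sketch of what SubgroupOpTensorDescOp does (hypothetical shapes and value names, warp size 16; layout attributes and the distributed result-type plumbing are elided, so this is an approximation rather than the exact IR the pattern produces):

```
// Before: the descriptor is created inside the warp region and escapes it.
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (!xegpu.tensor_desc<8x16xf32>) {
  %td = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
  gpu.yield %td : !xegpu.tensor_desc<8x16xf32>
}

// After: the descriptor's operands are yielded as extra warp results and the
// create_nd_tdesc is rebuilt outside the warp op from those results.
%r:2 = gpu.warp_execute_on_lane_0(%laneid)[16]
    -> (!xegpu.tensor_desc<8x16xf32>, memref<24x32xf32>) {
  %td = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
  gpu.yield %td, %src : !xegpu.tensor_desc<8x16xf32>, memref<24x32xf32>
}
%new_td = xegpu.create_nd_tdesc %r#1[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
```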

>From 3abe7cb1655d3519f54dfde94015cd7f9a40c9be Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 21 Mar 2025 20:25:27 +0000
Subject: [PATCH 12/45] save work

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 925ba88a7a1db..0a7edb441f981 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -311,6 +311,11 @@ class SGMapPropagation : public SparseBackwardDataFlowAnalysis<SGMapLattice> {
   void setToExitState(SGMapLattice *lattice) override {
     (void)lattice->meet(SGMap());
   }
+
+  LogicalResult initialize(Operation *top) override {
+    llvm::errs() << "SGMapPropagation::initialize\n";
+    return success();
+  }
 };
 } // namespace
 
@@ -581,8 +586,8 @@ class RunSGMapPropagation {
 public:
   RunSGMapPropagation(Operation *op) : target(op) {
     SymbolTableCollection symbolTable;
-    solver.load<DeadCodeAnalysis>();
-    solver.load<SparseConstantPropagation>();
+    // solver.load<DeadCodeAnalysis>();
+    // solver.load<SparseConstantPropagation>();
     solver.load<SGMapPropagation>(symbolTable);
     (void)solver.initializeAndRun(op);
   }
@@ -679,6 +684,7 @@ void attachLayoutAttributeToUsers(Value v, Attribute layout) {
 static LogicalResult
 attachLayoutAttributes(Operation *top,
                        llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
+  llvm::errs() << "op name : " << top->getName() << "\n";
   /// Helper to convert SGMap to xegpu::SGMapAttr.
   auto getSGMapForResult = [&](Value r) -> Attribute {
     auto layout = getPropagatedLayout(r);
@@ -694,6 +700,16 @@ attachLayoutAttributes(Operation *top,
   };
   /// Attach the layout attributes to the results of the operations.
   auto walkResult = top->walk([&](Operation *op) {
+    /// For function ops, propagate the argument layout to the users.
+    if (auto func = dyn_cast<FunctionOpInterface>(op)) {
+      for (auto arg : func.getArguments()) {
+        auto sgMapAttr = getSGMapForResult(arg);
+        if (sgMapAttr) {
+          attachLayoutAttributeToUsers(arg, sgMapAttr);
+        }
+      }
+      return WalkResult::advance();
+    }
     /// If no results, move on.
     if (op->getNumResults() == 0)
       return WalkResult::advance();

>From 596c953468e4c4c91f59975563594f52640df070 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 21:27:56 +0000
Subject: [PATCH 13/45] update doc

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 40 ++++++++++--------
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 41 ++++++++++---------
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 28 ++++++-------
 3 files changed, 59 insertions(+), 50 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 7adb9df3c6b25..8eb1b99c9d2c3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -158,30 +158,33 @@ def XeGPU_ScopeWG:   I32EnumAttrCase<"WG", 0, "wg">;      // workgroup level cod
 def XeGPU_ScopeSG:   I32EnumAttrCase<"SG", 1, "sg">;      // subgroup level code
 def XeGPU_ScopeWI:   I32EnumAttrCase<"WI", 2, "wi">;      // simt level code
 
-def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumerate of scope",
+def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumeration of scope",
   [XeGPU_ScopeWG,XeGPU_ScopeSG,XeGPU_ScopeWI]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::xegpu";
 }
 
 def XeGPU_ScopeAttr
-  : EnumAttr<XeGPU_Dialect,XeGPU_ScopeEnums, "Stage"> {
-    let summary = [{Describe the stage of lowering progress}];
+  : EnumAttr<XeGPU_Dialect, XeGPU_ScopeEnums, "Scope"> {
+    let summary = [{Describes the programming scope of the IR}];
     let assemblyFormat = "``$value";
 }
 
 def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
   let summary = [{
-    Describes the mapping between work item (WI) and the 2D tensor specified by the tensor descriptor.
+    Describes the data distribution to subgroups and work-items for a tensor
+    specified by the tensor descriptor.
   }];
   let description = [{
-    XeGPU operations leverages LayoutAttr to distribute data across work-item. It is specified in tensor_descs
-    upon the tensor description creation. LayoutAttr contains the following parameters.
-
-    * scope: specifies the scope of current code. It can be either wg (workgroup), sg (subgroup) or wi (workitem).
-             it is hard required for subgroup, but optional for workgroup and wi. By default, if a LayoutAttr
-             contains sg_layout and sg_data, it will be treated as workgroup code; and if it only contains
-             wi_layout and wi_data, it will be considered as workitem level.
+    XeGPU operations leverage LayoutAttr to distribute data across subgroups and workitems.
+    It is specified on tensor_descs when the tensor descriptor is created. LayoutAttr contains
+    the following parameters.
+
+    * scope: Specifies the scope of the current code, which can be either wg (workgroup), sg (subgroup),
+            or wi (workitem). It is mandatory for subgroup-level programming and optional for workgroup
+            and workitem-level programming. By default, if a LayoutAttr includes sg_layout and sg_data,
+            it will be treated as workgroup level. Similarly, if it only includes wi_layout and wi_data,
+            it will be considered as workitem level.
     * sg_layout: [optional] specifies the total number of subgroups and their layout in a workgroup.
     * sg_data: [optional] specifies the data size accessed per subgroup.
     * order: [optional] specifies the dimension order used to linearize n-d subgroup ids to 1-d.
@@ -189,16 +192,19 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
     * wi_layout: [required] specifies the total number of work-items and their layout in a subgroup
     * wi_data: [required] specifies the data size accessed per work-item for a single distribution.
 
-    `wi_data[0] * wi_data[1]` can be greater than 1, meaning that each work item operates on multiple elements,
-    which is eventually lowered to "SIMT-flavor" vector, like SPIR-V vector or llvm vector, or packed to a storage data type.
-    The multiple elements indicated by `wi_data` can only be from one dimension and must be contiguous in the memory along either dimension.
+    `wi_data[0] * wi_data[1]` can be greater than 1, indicating that each work item operates on multiple
+    elements. These elements are eventually lowered to a "SIMT-flavor" vector, such as a SPIR-V vector or
+    an LLVM vector, or packed into a storage data type. The multiple elements specified by wi_data must
+    come from a single dimension and be contiguous in memory along either dimension.
 
     E.g., #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
-    In this example, the subgroup has 16 work items in wi_layout=[1, 16], each accessing 1 element as specified by wi_data=[1, 1].
+    In this example, the subgroup consists of 16 work items arranged as wi_layout=[1, 16], with
+    each work item accessing a single element as defined by wi_data=[1, 1].
 
     E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>
-    In this example, the layout representing a workgroup work distribution. A workgroup has 8 subgroups organized as 2x4 layout.
-    and each subgroup accesses a 16x16 block per instruction, which is further disbributed to 16 work items as described above.
+    In this example, the layout represents a workgroup distribution. A workgroup consists of 8 subgroups
+    arranged in a 2x4 layout. Each subgroup accesses a 16x16 block per instruction, which is further
+    distributed to 16 work items as described above.
 
   }];
   let parameters = (ins
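
A short illustration of the default scope inference described above (a hedged sketch using hypothetical attribute aliases, not text from the patch):

```
// sg_layout/sg_data present: treated as a workgroup-level layout.
#wg = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>

// Only wi_layout/wi_data present: treated as a workitem (SIMT) level layout.
#wi = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>

// Subgroup level has to be requested explicitly through the scope parameter.
#sg = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>
```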
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 5e21bb805a6a5..9557a06e8e2a4 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -76,35 +76,38 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
                    DenseI32ArrayAttr sg_data, DenseI32ArrayAttr order,
                    DenseI32ArrayAttr wi_layout, DenseI32ArrayAttr wi_data) {
 
-  if (scope && scope.getValue() != Scope::WG &&
-      (sg_layout || sg_data || order)) {
-    return emitError() << "expected sg_layout, sg_data, and order being only "
-                          "used at workgroup level.";
-  }
-
-  if ((sg_layout != nullptr) ^ (sg_data != nullptr)) {
-    return emitError() << "expected sg_layout and sg_data being both present "
-                          "or both absent";
+  if (sg_data) {
+    if (!sg_layout)
+      return emitError() << "expected sg_layout being used with sg_data.";
+    if (sg_data.size() != sg_layout.size())
+      return emitError() << "expected sg_data having the same rank as sg_layout";
   }
 
   if (order) {
     if (!sg_layout)
-      return emitError()
-             << "expected order being used with sg_layout and sg_data.";
+      return emitError() << "expected order being used with sg_layout.";
     if (order.size() != sg_layout.size())
-      return emitError()
-             << "expected order having the same rank as sg_layout and sg_data";
+      return emitError() << "expected order having the same rank as sg_layout";
+  }
+
+  if (sg_layout && sg_layout.size() > 2) {
+    return emitError() << "expected the rank of the layout to be at most 2";
   }
 
-  if (sg_layout &&
-      (sg_layout.size() != sg_data.size() || sg_layout.size() > 2)) {
-    return emitError() << "expected sg_layout and sg_data having the same "
-                          "rank, which is not larger than 2";
+  if (scope && scope.getValue() != Scope::WG &&
+      (sg_layout || sg_data || order)) {
+    return emitError() << "expected sg_layout, sg_data, or order being only "
+                          "used at workgroup level.";
   }
 
-  if (wi_layout.size() != wi_data.size() || wi_layout.size() > 2)
+  if (scope && scope.getValue() == Scope::WG && !sg_layout) {
+    return emitError() << "expected sg_layout for workgroup level layout";
+  }
+
+  if (wi_layout.size() != wi_data.size() || wi_layout.size() > 2) {
     return emitError() << "expected wi_layout and wi_data having the same "
-                          "rank, which is not larger than 2";
+                          "rank, with a maximum rank of 2";
+  }
 
   return success();
 }
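
For reference, a few hedged examples of layouts that the reorganized checks above reject (shown as comments only; the diagnostics are paraphrased, not quoted from the verifier):

```
// Rejected: sg_data without sg_layout.
//   #xegpu.layout<sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>
// Rejected: order rank differs from sg_layout rank.
//   #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], order = [0], wi_layout = [1, 16], wi_data = [1, 1]>
// Rejected: explicit workgroup scope without sg_layout.
//   #xegpu.layout<scope = wg, wi_layout = [1, 16], wi_data = [1, 1]>
// Rejected: wi_layout and wi_data rank mismatch.
//   #xegpu.layout<wi_layout = [1, 16], wi_data = [1]>
```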
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 66b5054278c8c..59faa1d31454d 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -593,25 +593,25 @@ LogicalResult DpasOp::verify() {
   auto rhsShape = getRhsType().getShape();
   auto resShape = getResultType().getShape();
 
-  auto layoutA = getALayoutAttr();
-  auto layoutB = getBLayoutAttr();
-  auto layoutC = getCLayoutAttr();
+  auto aLayout = getALayoutAttr();
+  auto bLayout = getBLayoutAttr();
+  auto cLayout = getCLayoutAttr();
 
   // make sure the layout attribute is either set for every available
   // operand or simply not set at all. C is special, since ACC is optional.
   // If they are all set, they also should be in the same scope.
   auto isValidSet = [&]() {
-    bool result = (layoutA != nullptr) ^ (layoutB != nullptr);
+    bool result = (aLayout != nullptr) ^ (bLayout != nullptr);
     if (hasAcc()) {
-      result |= (layoutA != nullptr) ^ (layoutC != nullptr);
+      result |= (aLayout != nullptr) ^ (cLayout != nullptr);
     }
     result = !result;
 
-    if (layoutA) {
-      auto scope = layoutA.getScope();
-      result &= layoutB ? scope == layoutB.getScope() : false;
+    if (aLayout) {
+      auto scope = aLayout.getScope();
+      result &= bLayout ? scope == bLayout.getScope() : false;
       if (hasAcc())
-        result &= layoutC ? scope == layoutC.getScope() : false;
+        result &= cLayout ? scope == cLayout.getScope() : false;
     }
     return result;
   };
@@ -621,15 +621,15 @@ LogicalResult DpasOp::verify() {
         "layout attributes should be either set for all operands (for SIMT "
         "code) or not set at all (for SIMD code).");
 
-  // query the scope from layoutA (a valid setting).
-  if (layoutA && layoutA.isForWorkItemLevel()) {
+  // query the scope from aLayout (a valid setting).
+  if (aLayout && aLayout.isForWorkItemLevel()) {
     // In SIMT mode, All data fragments must be 2D
     if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
       return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
 
-    auto wiLayoutA = layoutA.getWiLayout();
-    auto wiLayoutB = layoutB.getWiLayout();
-    auto wiLayoutC = layoutC.getWiLayout();
+    auto wiLayoutA = aLayout.getWiLayout();
+    auto wiLayoutB = bLayout.getWiLayout();
+    auto wiLayoutC = cLayout.getWiLayout();
     // Obtain the expanded shapes of the operands and result using wi_layout.
     // NOTE: For B, get rid of the packed dimension for the expanded shape.
     SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],

>From 2065764a5b76f76ad543a4c92a4d0112e38691a4 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 21 Mar 2025 21:42:11 +0000
Subject: [PATCH 14/45] save work

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 51 ++++++++-----------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 0a7edb441f981..3ed3f462aa530 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -33,6 +33,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -311,11 +312,6 @@ class SGMapPropagation : public SparseBackwardDataFlowAnalysis<SGMapLattice> {
   void setToExitState(SGMapLattice *lattice) override {
     (void)lattice->meet(SGMap());
   }
-
-  LogicalResult initialize(Operation *top) override {
-    llvm::errs() << "SGMapPropagation::initialize\n";
-    return success();
-  }
 };
 } // namespace
 
@@ -586,8 +582,8 @@ class RunSGMapPropagation {
 public:
   RunSGMapPropagation(Operation *op) : target(op) {
     SymbolTableCollection symbolTable;
-    // solver.load<DeadCodeAnalysis>();
-    // solver.load<SparseConstantPropagation>();
+    solver.load<DeadCodeAnalysis>();
+    solver.load<SparseConstantPropagation>();
     solver.load<SGMapPropagation>(symbolTable);
     (void)solver.initializeAndRun(op);
   }
@@ -660,7 +656,7 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
   }
 }
 
-void attachLayoutAttributeToUsers(Value v, Attribute layout) {
+void attachLayoutAttributeToUsers(Value v, xegpu::SGMapAttr layout) {
   for (OpOperand &user : v.getUses()) {
     Operation *owner = user.getOwner();
     unsigned operandNumber = user.getOperandNumber();
@@ -668,11 +664,11 @@ void attachLayoutAttributeToUsers(Value v, Attribute layout) {
     /// attribute.
     if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
       if (operandNumber == 0)
-        dpasOp->setAttr("sg_map_a", layout);
+        dpasOp.setSgMapAAttr(layout);
       else if (operandNumber == 1)
-        dpasOp->setAttr("sg_map_b", layout);
+        dpasOp.setSgMapBAttr(layout);
       else if (operandNumber == 2)
-        dpasOp->setAttr("sg_map_c", layout);
+        dpasOp.setSgMapCAttr(layout);
       continue;
     }
     /// For every other user, use a generic attribute name.
@@ -686,7 +682,7 @@ attachLayoutAttributes(Operation *top,
                        llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
   llvm::errs() << "op name : " << top->getName() << "\n";
   /// Helper to convert SGMap to xegpu::SGMapAttr.
-  auto getSGMapForResult = [&](Value r) -> Attribute {
+  auto getSGMapForResult = [&](Value r) -> xegpu::SGMapAttr {
     auto layout = getPropagatedLayout(r);
     if (!layout.isAssigned())
       return {};
@@ -1137,28 +1133,25 @@ SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
 
   auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
   unsigned operandIdx = operand->getOperandNumber();
-  xegpu::SGMapAttr sgMapA =
-      mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_a"));
-  xegpu::SGMapAttr sgMapB =
-      mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_b"));
-  xegpu::SGMapAttr sgMapResult =
-      mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_c"));
-  if (!sgMapA || !sgMapB || !sgMapResult)
+  xegpu::SGMapAttr sgMapA = dpasOp.getSgMapAAttr();
+  xegpu::SGMapAttr sgMapB = dpasOp.getSgMapBAttr();
+  xegpu::SGMapAttr sgMapOut = dpasOp->getAttrOfType<xegpu::SGMapAttr>("r0");
+  if (!sgMapA || !sgMapB || !sgMapOut)
     return rewriter.notifyMatchFailure(
-        dpasOp, "the xegpu::Dpas op lacks sg_map attribute for A, B or result");
+        dpasOp, "the xegpu::Dpas op lacks sg_map attribute for A, B or output");
 
   auto distributedLhsTypeOrFailure =
       getDistributedVectorType(sgMapA, dpasOp.getLhsType());
   auto distributedRhsTypeOrFailure =
       getDistributedVectorType(sgMapB, dpasOp.getRhsType());
   auto distributedResultTypeOrFailure =
-      getDistributedVectorType(sgMapResult, dpasOp.getResultType());
+      getDistributedVectorType(sgMapOut, dpasOp.getResultType());
   if (failed(distributedLhsTypeOrFailure) ||
       failed(distributedRhsTypeOrFailure) ||
       failed(distributedResultTypeOrFailure))
     return rewriter.notifyMatchFailure(
         dpasOp,
-        "Failed to distribute the A, B or result types in xegpu::Dpas op");
+        "Failed to distribute the A, B or output types in xegpu::Dpas op");
 
   llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(), dpasOp.getRhs()};
   llvm::SmallVector<Type, 3> newYieldTypes{distributedLhsTypeOrFailure.value(),
@@ -1175,15 +1168,15 @@ SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
 
   // Create a new dpas op outside the warp op.
   rewriter.setInsertionPointAfter(newWarpOp);
-  auto newDpasOp = cast<xegpu::DpasOp>(*dpasOp.clone());
-  newDpasOp.getLhsMutable().assign(newWarpOp.getResult(newRetIndices[0]));
-  newDpasOp.getRhsMutable().assign(newWarpOp.getResult(newRetIndices[1]));
-  if (dpasOp.getAcc())
-    newDpasOp.getAccMutable().assign(newWarpOp.getResult(newRetIndices[2]));
-  newDpasOp->getOpResult(0).setType(distributedResultTypeOrFailure.value());
+  SmallVector<Value> newDpasOperands;
+  for (auto i : newRetIndices) {
+    newDpasOperands.push_back(newWarpOp.getResult(i));
+  }
+  auto newDpasOp = rewriter.create<xegpu::DpasOp>(
+      newWarpOp->getLoc(), distributedResultTypeOrFailure.value(),
+      newDpasOperands, dpasOp->getAttrs());
   Value disributedVal = newWarpOp.getResult(operandIdx);
   rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
-
   return success();
 }
 

>From 899439bdb4a827b9248c9e163fadf1df312d28b5 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 24 Mar 2025 15:49:43 +0000
Subject: [PATCH 15/45] refine docs

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  34 +--
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |  16 +-
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |   8 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  72 +++---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        |  22 +-
 mlir/test/Dialect/XeGPU/invalid.mlir          |  74 +++---
 mlir/test/Dialect/XeGPU/ops.mlir              | 222 +++++++++---------
 7 files changed, 224 insertions(+), 224 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 8eb1b99c9d2c3..2f9aa0106b1bc 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -154,12 +154,12 @@ def XeGPU_FenceScopeAttr:
     let assemblyFormat = "$value";
 }
 
-def XeGPU_ScopeWG:   I32EnumAttrCase<"WG", 0, "wg">;      // workgroup level code
-def XeGPU_ScopeSG:   I32EnumAttrCase<"SG", 1, "sg">;      // subgroup level code
-def XeGPU_ScopeWI:   I32EnumAttrCase<"WI", 2, "wi">;      // simt level code
+def XeGPU_ScopeWG:     I32EnumAttrCase<"WG", 0, "wg">;        // workgroup level code
+def XeGPU_ScopeSG:     I32EnumAttrCase<"SG", 1, "sg">;        // subgroup level code
+def XeGPU_ScopeLane:   I32EnumAttrCase<"Lane", 2, "lane">;    // simt level code
 
 def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumeration of scope",
-  [XeGPU_ScopeWG,XeGPU_ScopeSG,XeGPU_ScopeWI]> {
+  [XeGPU_ScopeWG, XeGPU_ScopeSG, XeGPU_ScopeLane]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::xegpu";
 }
@@ -181,27 +181,27 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
     the following parameters.
 
     * scope: Specifies the scope of the current code, which can be either wg (workgroup), sg (subgroup),
-            or wi (workitem). It is mandatory for subgroup-level programming and optional for workgroup
+            or lane (workitem). It is mandatory for subgroup-level programming and optional for workgroup
             and workitem-level programming. By default, if a LayoutAttr includes sg_layout and sg_data,
-            it will be treated as workgroup level. Similarly, if it only includes wi_layout and wi_data,
+            it will be treated as workgroup level. Similarly, if it only includes lane_layout and lane_data,
             it will be considered as workitem level.
     * sg_layout: [optional] specifies the total number of subgroups and their layout in a workgroup.
     * sg_data: [optional] specifies the data size accessed per subgroup.
     * order: [optional] specifies the dimension order used to linearize n-d subgroup ids to 1-d.
             The first dimension in the order list is the fastest-changing dimension.
-    * wi_layout: [required] specifies the total number of work-items and their layout in a subgroup
-    * wi_data: [required] specifies the data size accessed per work-item for a single distribution.
+    * lane_layout: [required] specifies the total number of work-items and their layout in a subgroup
+    * lane_data: [required] specifies the data size accessed per work-item for a single distribution.
 
-    `wi_data[0] * wi_data[1]` can be greater than 1, indicating that each work item operates on multiple
+    `lane_data[0] * lane_data[1]` can be greater than 1, indicating that each work item operates on multiple
     elements. These elements are eventually lowered to a "SIMT-flavor" vector, such as a SPIR-V vector or
-    an LLVM vector, or packed into a storage data type. The multiple elements specified by wi_data must
+    an LLVM vector, or packed into a storage data type. The multiple elements specified by lane_data must
     come from a single dimension and be contiguous in memory along either dimension.
 
-    E.g., #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
-    In this example, the subgroup consists of 16 work items arranged as wi_layout=[1, 16], with
-    each work item accessing a single element as defined by wi_data=[1, 1].
+    E.g., #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    In this example, the subgroup consists of 16 work items arranged as lane_layout=[1, 16], with
+    each work item accessing a single element as defined by lane_data=[1, 1].
 
-    E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>
+    E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>
     In this example, the layout represents a workgroup distribution. A workgroup consists of 8 subgroups
     arranged in a 2x4 layout. Each subgroup accesses a 16x16 block per instruction, which is further
     distributed to 16 work items as described above.
@@ -212,8 +212,8 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
     OptionalParameter<"DenseI32ArrayAttr">: $sg_layout,
     OptionalParameter<"DenseI32ArrayAttr">: $sg_data,
     OptionalParameter<"DenseI32ArrayAttr">: $order,
-    "DenseI32ArrayAttr": $wi_layout,
-    "DenseI32ArrayAttr": $wi_data
+    "DenseI32ArrayAttr": $lane_layout,
+    "DenseI32ArrayAttr": $lane_data
   );
 
   let extraClassDeclaration = [{
@@ -230,7 +230,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
     bool isForWorkItemLevel() {
       if (!getScope())
         return !getSgLayout() && !getSgData() && !getOrder();
-      return getScope() == ScopeAttr::get(getContext(), Scope::WI);
+      return getScope() == ScopeAttr::get(getContext(), Scope::Lane);
     }
   }];
 
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index a3ee6e901a775..7188f74815943 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -113,7 +113,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 8 : index
     %1 = xegpu.create_nd_tdesc %0[%c0, %c0] : memref<1024x1024xf32>
-          -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
+          -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     ```
   }];
 
@@ -323,7 +323,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
       xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint<cached>,
                         l2_hint = #xegpu.cache_hint<uncached>}>
         : !xegpu.tensor_desc<8x16xf32,
-          #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+          #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
     ```
 
 
@@ -381,7 +381,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
                              l2_hint = #xegpu.cache_hint<write_back>,
                              l3_hint = #xegpu.cache_hint<write_through>}
                              : vector<8x1xf16>, !xegpu.tensor_desc<8x16xf16,
-                               #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
+                               #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     ```
 
 
@@ -422,7 +422,7 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
   Example 2 (SIMT mode):
   ```
     %2 = xegpu.update_nd_offset %1, [0, 16]:
-      !xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
+      !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   ```
   }];
 
@@ -517,7 +517,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
     %off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
     %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
           -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>,
-          #xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>
+          #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
     ```
   }];
 
@@ -653,7 +653,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
                             l2_hint = #xegpu.cache_hint<uncached>,
                             l3_hint = #xegpu.cache_hint<uncached>}
           : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>,
-            !xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>>
+            !xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
             vector<16xi1> -> vector<8x1xf32>
   ```
 
@@ -732,7 +732,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
                                  l2_hint = #xegpu.cache_hint<write_back>,
                                  l3_hint = #xegpu.cache_hint<write_through>}
           : vector<8x1xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>,
-            !xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>> vector<16xi1>
+            !xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> vector<16xi1>
   ```
 
   }];
@@ -790,7 +790,7 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
       %off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
       %2 = xegpu.update_offset %1, %off :
               !xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>,
-              #xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+              #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
     ```
   }];
 
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 82d6a4ec39e6b..8559f4beb2c03 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -62,7 +62,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     static-dim-list ::= decimal-literal `x` decimal-literal
     attr-list = (, encoding-attr)? (, layout-attr)?
     encoding-attr = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)?
-    layout-attr = (, layout `<` (scope = value,)? (sg_layout = value, sg_data = value, order = value)? wi_layout = value, wi_data = value `>`)?
+    layout-attr = (, layout `<` (scope = value,)? (sg_layout = value, sg_data = value, order = value)? lane_layout = value, lane_data = value `>`)?
     ```
 
     Examples:
@@ -78,13 +78,13 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
 
     // A TensorDesc with a layout for workgroup level programming
-    xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
+    xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
 
     // A TensorDesc with a layout for subgroup level programming
-    xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>>
+    xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>>
 
     // A TensorDesc with a layout for workitem level programming
-    xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+    xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
     ```
   }];
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 9557a06e8e2a4..0da86f1af33e4 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -74,7 +74,7 @@ LogicalResult
 LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
                    ScopeAttr scope, DenseI32ArrayAttr sg_layout,
                    DenseI32ArrayAttr sg_data, DenseI32ArrayAttr order,
-                   DenseI32ArrayAttr wi_layout, DenseI32ArrayAttr wi_data) {
+                   DenseI32ArrayAttr lane_layout, DenseI32ArrayAttr lane_data) {
 
   if (sg_data) {
     if (!sg_layout)
@@ -104,8 +104,8 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
     return emitError() << "expected sg_layout for workgroup level layout";
   }
 
-  if (wi_layout.size() != wi_data.size() || wi_layout.size() > 2) {
-    return emitError() << "expected wi_layout and wi_data having the same "
+  if (lane_layout.size() != lane_data.size() || lane_layout.size() > 2) {
+    return emitError() << "expected lane_layout and lane_data having the same "
                           "rank, with a maximum rank of 2";
   }
 
@@ -249,11 +249,11 @@ LogicalResult TensorDescType::verify(
   }
 
   if (auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout)) {
-    ArrayRef<int32_t> wiLayout = layoutAttr.getWiLayout().asArrayRef();
-    ArrayRef<int32_t> wiData = layoutAttr.getWiData().asArrayRef();
+    ArrayRef<int32_t> laneLayout = layoutAttr.getLaneLayout().asArrayRef();
+    ArrayRef<int32_t> laneData = layoutAttr.getLaneData().asArrayRef();
 
     if (rank == 1) {
-      if (wiLayout[0] != 1 || wiData[0] != 1)
+      if (laneLayout[0] != 1 || laneData[0] != 1)
         return emitError()
                << "outer layout distribution and data mapping must be 1 "
                   "for 1D tensor";
@@ -265,10 +265,10 @@ LogicalResult TensorDescType::verify(
       // [sg_size, chunk_size] will be [1] or [1, 32/element_ty_bit_width]
       // respectively, the mapping should reflect that. This is because each
       // work item access data in 32 bit granularity.
-      if (wiData[0] != 1)
+      if (laneData[0] != 1)
         return emitError()
                << "cannot map over non-contiguous scattered row elements";
-      if (wiData[1] != packingFactor)
+      if (laneData[1] != packingFactor)
         return emitError() << "work item data mapping must match the number of "
                               "contiguous elements";
     }
@@ -281,10 +281,10 @@ LogicalResult TensorDescType::verify(
 
     size_t dims = tensorShape.size();
     for (size_t i = 0; i < dims; ++i) {
-      uint32_t numElemPerWi = wiLayout[i] * wiData[i];
+      uint32_t numElemPerWi = laneLayout[i] * laneData[i];
       if (tensorShape[i] < numElemPerWi || tensorShape[i] % numElemPerWi != 0)
         return emitError() << "cannot distribute " << tensorShape[i] << " over "
-                           << wiLayout[i] << " work items with " << wiData[i]
+                           << laneLayout[i] << " work items with " << laneData[i]
                            << " elements each";
     }
   }
@@ -295,16 +295,16 @@ LogicalResult TensorDescType::verify(
 // If tensor descriptor has a layout attribute it is used in SIMT mode.
 // In this mode, the distributed vector shape is determined as follows:
 // Definitions:
-//        wi_data_size = wi_data[0] × wi_data[1]
-//        subgroup_size = wi_layout[0] × wi_layout[1]
-//        distribution_unit_size = subgroup_size × wi_data_size
+//        lane_data_size = lane_data[0] × lane_data[1]
+//        subgroup_size = lane_layout[0] × lane_layout[1]
+//        distribution_unit_size = subgroup_size × lane_data_size
 // ---------------------------------------------------------------------
 // Case 1: Regular loads/stores.
 // ---------------------------------------------------------------------
 // Distributed vector shape must be:
-//        [chunk_size / wi_data_size, wi_data_size]
+//        [chunk_size / lane_data_size, lane_data_size]
 // If the tensor descriptor shape is 1D, first dimension is ignored (set to 1).
-//        [wi_data_size]
+//        [lane_data_size]
 // ---------------------------------------------------------------------
 // Case 2: Block loads/stores
 // ---------------------------------------------------------------------
@@ -312,23 +312,23 @@ LogicalResult TensorDescType::verify(
 //        tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length
 //        n_distribution_units = tensor_size / distribution_unit_size
 // Given above definitions, the following conditions must be met:
-//        * tensor_desc[0] % (wi_layout[0] × wi_data[0]) == 0
-//        * tensor_desc[1] % (wi_layout[1] × wi_data[1]) == 0
+//        * tensor_desc[0] % (lane_layout[0] × lane_data[0]) == 0
+//        * tensor_desc[1] % (lane_layout[1] × lane_data[1]) == 0
 // Distributed vector shape must be:
-//        [n_distribution_units, wi_data_size]
+//        [n_distribution_units, lane_data_size]
 FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
   auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
   // If no layout is provided, tensor desc is not used in SIMT mode.
   if (!layout || !layout.isForWorkItemLevel())
     return failure();
 
-  SmallVector<int64_t> wiData(layout.getWiData().asArrayRef());
-  SmallVector<int64_t> wiLayout(layout.getWiLayout().asArrayRef());
+  SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
+  SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
   auto tdescShape = getShape();
 
-  auto wiDataSize = 1, sgSize = 1;
-  for (auto [wiDim, wiDataDim] : llvm::zip_equal(wiLayout, wiData)) {
-    wiDataSize *= wiDataDim;
+  auto laneDataSize = 1, sgSize = 1;
+  for (auto [wiDim, laneDataDim] : llvm::zip_equal(laneLayout, laneData)) {
+    laneDataSize *= laneDataDim;
     sgSize *= wiDim;
   }
 
@@ -338,35 +338,35 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
     auto chunkSize = scatterAttr.getChunkSize().getInt();
     // Verify if the first dimension of the tensor descriptor shape is
     // distributable.
-    assert(tdescShape[0] % (wiLayout[0]) == 0 &&
+    assert(tdescShape[0] % (laneLayout[0]) == 0 &&
            "tensor descriptor shape is not distributable");
     if (chunkSize > 1)
-      return VectorType::get({chunkSize / wiDataSize, wiDataSize},
+      return VectorType::get({chunkSize / laneDataSize, laneDataSize},
                              getElementType());
-    return VectorType::get({wiDataSize}, getElementType());
+    return VectorType::get({laneDataSize}, getElementType());
   }
 
   // Case 2: block loads/stores
-  // Tensor descriptor shape can be 1D. For the 1D case, outer dims of wiData
-  // and wiLayout must be 1.
+  // Tensor descriptor shape can be 1D. For the 1D case, outer dims of laneData
+  // and laneLayout must be 1.
   if (tdescShape.size() == 1) {
-    assert((wiData[0] == 1 && wiLayout[0] == 1) &&
-           "wi_data[0] and wi_layout[0] must be 1 for 1D tensor descriptor");
-    wiData = {wiData[1]};
-    wiLayout = {wiLayout[1]};
+    assert((laneData[0] == 1 && laneLayout[0] == 1) &&
+           "lane_data[0] and lane_layout[0] must be 1 for 1D tensor descriptor");
+    laneData = {laneData[1]};
+    laneLayout = {laneLayout[1]};
   }
   // Check if the tensor descriptor shape is distributable.
   int64_t tensorSize = 1;
-  for (auto [tdescDim, wiDim, wiDataDim] :
-       llvm::zip_equal(tdescShape, wiLayout, wiData)) {
-    assert((tdescDim % (wiDim * wiDataDim) == 0) &&
+  for (auto [tdescDim, wiDim, laneDataDim] :
+       llvm::zip_equal(tdescShape, laneLayout, laneData)) {
+    assert((tdescDim % (wiDim * laneDataDim) == 0) &&
            "tensor descriptor shape is not distributable");
     tensorSize *= tdescDim;
   }
   // tensorSize must be adjusted for array_length.
   tensorSize *= getArrayLength();
 
-  return VectorType::get({tensorSize / (sgSize * wiDataSize), wiDataSize},
+  return VectorType::get({tensorSize / (sgSize * laneDataSize), laneDataSize},
                          getElementType());
 }
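
A worked instance of the block load/store case above (a hedged example; the arithmetic follows directly from the definitions in the comment and matches the SIMT load_nd examples used elsewhere in the patch):

```
// !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
//   lane_data_size         = 1 * 1      = 1
//   subgroup_size          = 1 * 16     = 16
//   distribution_unit_size = 16 * 1     = 16
//   tensor_size            = 8 * 16 * 1 = 128   (array_length = 1)
//   n_distribution_units   = 128 / 16   = 8
// so getDistributedVectorType() yields vector<8x1xf32>.
%0 = xegpu.load_nd %tdesc
  : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
  -> vector<8x1xf32>
```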
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 59faa1d31454d..e2ccc59d39371 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -113,8 +113,8 @@ static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
     data = attr.getSgData().asArrayRef();
     layout = attr.getSgLayout().asArrayRef();
   } else {
-    data = attr.getWiData().asArrayRef();
-    layout = attr.getWiLayout().asArrayRef();
+    data = attr.getLaneData().asArrayRef();
+    layout = attr.getLaneLayout().asArrayRef();
   }
   for (auto [s, d, l] : llvm::zip_equal(shape, data, layout)) {
     // check s % (d * l) != 0
@@ -627,17 +627,17 @@ LogicalResult DpasOp::verify() {
     if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
       return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
 
-    auto wiLayoutA = aLayout.getWiLayout();
-    auto wiLayoutB = bLayout.getWiLayout();
-    auto wiLayoutC = cLayout.getWiLayout();
-    // Obtain the expanded shapes of the operands and result using wi_layout.
+    auto laneLayoutA = aLayout.getLaneLayout();
+    auto laneLayoutB = bLayout.getLaneLayout();
+    auto laneLayoutC = cLayout.getLaneLayout();
+    // Obtain the expanded shapes of the operands and result using lane_layout.
     // NOTE: For B, get rid of the packed dimension for the expanded shape.
-    SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
-                                           lhsShape[1] * wiLayoutA[1]};
+    SmallVector<int64_t> expandedShapeA = {lhsShape[0] * laneLayoutA[0],
+                                           lhsShape[1] * laneLayoutA[1]};
     SmallVector<int64_t> expandedShapeB = {
-        rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
-    SmallVector<int64_t> expandedShapeC = {resShape[0] * wiLayoutC[0],
-                                           resShape[1] * wiLayoutC[1]};
+        rhsShape[0] * rhsShape[1] * laneLayoutB[0], 1 * laneLayoutB[1]};
+    SmallVector<int64_t> expandedShapeC = {resShape[0] * laneLayoutC[0],
+                                           resShape[1] * laneLayoutC[1]};
     auto bK = expandedShapeB[0];
     if (bK != expandedShapeA[1])
       return emitOpError("K-dimension mismatch.");
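
A hedged worked example for the SIMT shape check above, using the fragment shapes exercised by the tests below and assuming lane_layout = [1, 16] for A, B, and C:

```
//   A: vector<8x1xf16>  -> expandedShapeA = {8 * 1, 1 * 16}     = {8, 16}
//   B: vector<8x2xf16>  -> expandedShapeB = {8 * 2 * 1, 1 * 16} = {16, 16}
//   C: vector<8x1xf32>  -> expandedShapeC = {8 * 1, 1 * 16}     = {8, 16}
// bK = expandedShapeB[0] = 16 equals expandedShapeA[1] = 16, so the
// K-dimension check passes for this 8x16x16 DPAS fragment set.
```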
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index c4958d920a89f..17e4f60638905 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -80,11 +80,11 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
 // -----
 func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   // expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1]}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
       l2_hint = #xegpu.cache_hint<uncached>}>
-    : !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+    : !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
     -> vector<8x2xf32>
   return
 }
@@ -92,11 +92,11 @@ func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
 // -----
 func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   // expected-error at +1 {{Result shape [8] is not consistent with distributed vector shape [1, 1]}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
       l2_hint = #xegpu.cache_hint<uncached>}>
-    : !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+    : !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
     -> vector<8xf32>
   return
 }
@@ -136,20 +136,20 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
 // -----
 func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
   %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   // expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1] for tensor descriptor}}
   xegpu.store_nd %data, %1
-    : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+    : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   return
 }
 
 // -----
 func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
   %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   // expected-error at +1 {{Result shape [2] is not consistent with distributed vector shape [1, 1] for tensor descriptor}}
   xegpu.store_nd %data, %1
-    : vector<2xf32>, !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+    : vector<2xf32>, !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   return
 }
 
@@ -248,7 +248,7 @@ func.func @test_prefetch_vc_2(%src: ui64) {
 func.func @test_create_tdesc_layout_1(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>,   #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>,   #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
   return
 }
 
@@ -256,7 +256,7 @@ func.func @test_create_tdesc_layout_1(%src: ui64) {
 func.func @test_create_tdesc_layout_2(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error at +1 {{cannot map over non-contiguous scattered row elements}}
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,   #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [2, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,   #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [2, 1]>>
   return
 }
 
@@ -264,7 +264,7 @@ func.func @test_create_tdesc_layout_2(%src: ui64) {
 func.func @test_create_tdesc_layout_3(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error at +1 {{work item data mapping must match the number of contiguous elements}}
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>,   #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>,   #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
   return
 }
 
@@ -272,9 +272,9 @@ func.func @test_create_tdesc_layout_3(%src: ui64) {
 func.func @test_load_gather_layout_1(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,   #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,   #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
   // expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
+  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
   return
 }
 
@@ -282,9 +282,9 @@ func.func @test_load_gather_layout_1(%src: ui64) {
 func.func @test_load_gather_layout_2(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
   // expected-error@+1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
+  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
   return
 }
 
@@ -294,9 +294,9 @@ func.func @test_store_scatter_layout_1(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %val = arith.constant dense<2.9>: vector<1x2xf32>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
   // expected-error@+1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
   return
 }
 
@@ -305,9 +305,9 @@ func.func @test_store_scatter_layout_2(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %val = arith.constant dense<2.9>: vector<2xf32>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
   // expected-error@+1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
   return
 }
 
@@ -396,16 +396,16 @@ func.func @test_dpas_4(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
 // -----
 func.func @test_dpas_layout_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
   // expected-error@+1 {{layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code)}}
-  %1 = xegpu.dpas %a, %b {a_layout =  #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+  %1 = xegpu.dpas %a, %b {a_layout =  #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
   return
 }
 
 // -----
 func.func @test_dpas_layout_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
   // expected-error@+1 {{K-dimension mismatch}}
-  %1 = xegpu.dpas %a, %b {a_layout =  #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
-                          b_layout =  #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
-                          c_layout =  #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
+  %1 = xegpu.dpas %a, %b {a_layout =  #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>,
+                          b_layout =  #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>,
+                          c_layout =  #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>}
                           : vector<8x1xf16>, vector<4x2xf16> -> vector<8x1xf32>
   return
 }
@@ -439,7 +439,7 @@ func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) {
 func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error@+1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
-      !xegpu.tensor_desc<16xf32,  #xegpu.layout<scope = wi, wi_layout = [2, 16], wi_data = [1, 1]>>
+      !xegpu.tensor_desc<16xf32,  #xegpu.layout<scope = lane, lane_layout = [2, 16], lane_data = [1, 1]>>
   return
 }
 
@@ -447,7 +447,7 @@ func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
 func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error@+1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
-      !xegpu.tensor_desc<16xf32,  #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+      !xegpu.tensor_desc<16xf32,  #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
   return
 }
 
@@ -455,7 +455,7 @@ func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
 func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error@+1 {{cannot distribute 8 over 16 work items with 1 elements each}}
-      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   return
 }
 
@@ -463,7 +463,7 @@ func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
 func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error@+1 {{cannot distribute 4 over 8 work items with 1 elements each}}
-      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = wi, wi_layout = [8, 2], wi_data = [1, 1]>>
+      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = lane, lane_layout = [8, 2], lane_data = [1, 1]>>
   return
 }
 
@@ -471,7 +471,7 @@ func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
 func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error@+1 {{cannot distribute 4 over 2 work items with 4 elements each}}
-      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = wi, wi_layout = [2, 8], wi_data = [4, 1]>>
+      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = lane, lane_layout = [2, 8], lane_data = [4, 1]>>
   return
 }
 
@@ -479,7 +479,7 @@ func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
 func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error@+1 {{cannot distribute 4 over 8 work items with 1 elements each}}
-      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = wi, wi_layout = [8, 2], wi_data = [1, 2]>>
+      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = lane, lane_layout = [8, 2], lane_data = [1, 2]>>
   return
 }
 
@@ -490,7 +490,7 @@ func.func @tensor_desc_scatter_invalid_map_data(%src: ui64) {
       // expected-error@+1 {{cannot map over non-contiguous scattered row elements}}
       !xegpu.tensor_desc<4x2xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 2>,
-         #xegpu.layout<scope = wi, wi_layout = [1, 1], wi_data = [2, 1]>>
+         #xegpu.layout<scope = lane, lane_layout = [1, 1], lane_data = [2, 1]>>
   return
 }
 
@@ -500,7 +500,7 @@ func.func @tensor_desc_scatter_invalid_map_data_1(%src: ui64, %offsets: vector<1
       // expected-error@+1 {{work item data mapping must match the number of contiguous elements}}
       !xegpu.tensor_desc<16xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 1>,
-         #xegpu.layout<scope = wi, wi_layout = [1, 8], wi_data = [1, 2]>>
+         #xegpu.layout<scope = lane, lane_layout = [1, 8], lane_data = [1, 2]>>
   return
 }
 
@@ -510,7 +510,7 @@ func.func @tensor_desc_scatter_invalid_chunk_size_1D(%src: ui64, %offsets: vecto
       // expected-error@+1 {{expected non-contiguous elements for 1D tensor}}
       !xegpu.tensor_desc<16xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 2>,
-         #xegpu.layout<scope = wi, wi_layout = [1, 8], wi_data = [1, 2]>>
+         #xegpu.layout<scope = lane, lane_layout = [1, 8], lane_data = [1, 2]>>
   return
 }
 
@@ -520,22 +520,22 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
       // expected-error@+1 {{expected chunk blocks for 2D tensor}}
       !xegpu.tensor_desc<16x2xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 1>,
-         #xegpu.layout<scope = wi, wi_layout = [8, 1], wi_data = [1, 2]>>
+         #xegpu.layout<scope = lane, lane_layout = [8, 1], lane_data = [1, 2]>>
   return
 }
 
 // -----
 func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) {
   // expected-error@+1 {{expected different srcMap and resMap}}
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>,
-                                resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>,
+                                resMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
   gpu.return
 }
 
 // -----
 func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) {
   // expected-error@+1 {{expected srcMap and resMap be in the same scope}}
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
-                                resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+                                resMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
   gpu.return
 }
\ No newline at end of file
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 6a29a73a20612..e52562a2f453d 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -15,9 +15,9 @@ gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_1(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_create_nd_tdesc_simt_1(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -34,8 +34,8 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind
 gpu.func @test_create_nd_tdesc_simt_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
   //CHECK: %[[C:.*]] = arith.constant 1 : index
   %c1 = arith.constant 1 : index
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -48,8 +48,8 @@ gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_create_nd_tdesc_simt_3(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -62,8 +62,8 @@ gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
 gpu.func @test_create_nd_tdesc_simt_4(%src: memref<2x24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -76,8 +76,8 @@ gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
 gpu.func @test_create_nd_tdesc_simt_5(%src: memref<2x24x32xf32, 3>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -90,8 +90,8 @@ gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_6(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_create_nd_tdesc_simt_6(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -106,10 +106,10 @@ gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
 
 // CHECK: gpu.func @test_prefetch_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_prefetch_nd_simt(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -125,11 +125,11 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
 
 // CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) {
 gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
   %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
-       : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+       : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
   gpu.return
 }
 
@@ -144,10 +144,10 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
 
 // CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) {
 gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<1x1xf16>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<1x1xf16>
   gpu.return
 }
 
@@ -162,11 +162,11 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
 
 // CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
   gpu.return
 }
 
@@ -181,11 +181,11 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
 
 // CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
-    !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
+    !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
   gpu.return
 }
 
@@ -200,11 +200,11 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
 
 // CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
+    !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x1xf32>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x1xf32>
   gpu.return
 }
 
@@ -219,11 +219,11 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
 
 // CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
-    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
   gpu.return
 }
 
@@ -238,11 +238,11 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
 
 // CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
-    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
   gpu.return
 }
 
@@ -257,10 +257,10 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
 
 // CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
   gpu.return
 }
 
@@ -279,11 +279,11 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
 gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
    // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48x1xf16>
   %1 = arith.constant dense<1.0>: vector<48x1xf16>
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
-    !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -305,11 +305,11 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
 gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
    // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2x1xf16>
   %1 = arith.constant dense<1.0>: vector<2x1xf16>
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
-    !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+    !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -324,10 +324,10 @@ gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
 
 // CHECK: gpu.func @test_update_nd_tdesc_simt(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_update_nd_tdesc_simt(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-  %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -344,8 +344,8 @@ gpu.func @test_create_tdesc_vc(%src: ui64) {
 gpu.func @test_create_tdesc_simt(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -363,8 +363,8 @@ gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
 gpu.func @test_create_tdesc_simt_1(%src: memref<?xf32, 3>) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space =  slm, chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex>  -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space =  slm, chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex>  -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -383,7 +383,7 @@ gpu.func @test_create_tdesc_simt_2(%src: memref<?xf32>) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
-  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>  -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>  -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -401,8 +401,8 @@ gpu.func @test_create_tdesc_vc_3(%src: ui64) {
 gpu.func @test_create_tdesc_simt_3(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
   gpu.return
 }
 
@@ -425,10 +425,10 @@ gpu.func @test_load_simt(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
-  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
-  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
-  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
   gpu.return
 }
 
@@ -451,10 +451,10 @@ gpu.func @test_load_simt_2(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
-  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
-  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
-  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
+  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
+  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
   gpu.return
 }
 
@@ -477,10 +477,10 @@ gpu.func @test_load_simt_3(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
-  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
-  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
-  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
   gpu.return
 }
 
@@ -509,10 +509,10 @@ gpu.func @test_store_simt(%src: ui64) {
   %1 = arith.constant dense<1>: vector<4xi1>
   //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x1xf32>
   %2 = arith.constant dense<2.9>: vector<2x1xf32>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
-  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
-  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
-  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
   gpu.return
 }
 
@@ -541,10 +541,10 @@ gpu.func @test_store_simt_2(%src: ui64) {
   %1 = arith.constant dense<1>: vector<4xi1>
   //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<1x2xf16>
   %2 = arith.constant dense<2.9>: vector<1x2xf16>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
-  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
-  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
-  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
+  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
   gpu.return
 }
 
@@ -572,10 +572,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
   %1 = arith.constant dense<1>: vector<4xi1>
   //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
   %2 = arith.constant dense<2.9>: vector<1xf32>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
-  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
-  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
-  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
+  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
+  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1>
+  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1>
   gpu.return
 }
 
@@ -583,10 +583,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
 gpu.func @test_prefetch_simt(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
-  // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
-  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -605,13 +605,13 @@ gpu.func @test_prefetch_vc(%src: ui64) {
 // CHECK: gpu.func @test_create_update_tdesc_simt(%[[arg0:.*]]: ui64) {
 gpu.func @test_create_update_tdesc_simt(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
   //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
-  //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+  //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
   %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
-  %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+  %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
   gpu.return
 }
 
@@ -637,12 +637,12 @@ gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
 
 // CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8x1xf16>, %[[arg1:.*]]: vector<8x2xf16>)
 gpu.func @test_dpas_simt(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
-  // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
-  // CHECK: b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
-  // CHECK: c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
-  %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
-                          b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
-                          c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
+  // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>,
+  // CHECK: b_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>,
+  // CHECK: c_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+  %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>,
+                          b_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>,
+                          c_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>}
                           : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
   gpu.return
 }
@@ -706,20 +706,20 @@ gpu.func @fence() {
 
 // CHECK: gpu.func @test_create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], wi_layout = [1, 16], wi_data = [8, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], wi_layout = [1, 16], wi_data = [8, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
   gpu.return
 }
 
 gpu.func @test_convert_layout(%a: vector<32x64xf16>) {
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [2, 1]>,
-                                resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [2, 1]>,
+                                resMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
   gpu.return
 }
 
 gpu.func @test_convert_layout_wg(%a: vector<32x64xf16>) {
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
-                                resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+                                resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
   gpu.return
 }
 

>From 8636d1562fc1ffc7a7d4365847a3a3dfa18782aa Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 24 Mar 2025 16:21:26 +0000
Subject: [PATCH 16/45] refine docs

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 70 ++++++++++++-------
 1 file changed, 45 insertions(+), 25 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 2f9aa0106b1bc..7bb59796af36e 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -166,7 +166,11 @@ def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumeration of scope",
 
 def XeGPU_ScopeAttr
   : EnumAttr<XeGPU_Dialect, XeGPU_ScopeEnums, "Scope"> {
-    let summary = [{Describe the programming scope of the IR}];
+    let summary = [{Defines the programming scope of the IR,
+                    where WG represents the workgroup level,
+                    SG represents the subgroup level, and
+                    Lane represents the work-item level}];
+
     let assemblyFormat = "``$value";
 }
 
@@ -176,37 +180,53 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
     specified by the tensor descriptor.
   }];
   let description = [{
-    XeGPU operations leverages LayoutAttr to distribute data across subgroups and workitems.
-    It is specified in tensor_descs upon the tensor description creation. LayoutAttr contains
-    the following parameters.
-
-    * scope: Specifies the scope of the current code, which can be either wg (workgroup), sg (subgroup),
-            or lane (workitem). It is mandatory for subgroup-level programming and optional for workgroup
-            and workitem-level programming. By default, if a LayoutAttr includes sg_layout and sg_data,
-            it will be treated as workgroup level. Similarly, if it only includes lane_layout and lane_data,
-            it will be considered as workitem level.
-    * sg_layout: [optional] specifies the total number of subgroups and their layout in a workgroup.
-    * sg_data: [optional] specifies the data size accessed per subgroup.
-    * order: [optional] specifies the dimension order used to linearize n-d sbugroup ids to 1-d.
-            The first dimension in the order list is the fastest-changing dimension.
-    * lane_layout: [required] specifies the total number of work-items and their layout in a subgroup
-    * lane_data: [required] specifies the data size accessed per work-item for a single distribution.
+    XeGPU operations use `LayoutAttr` to define how data is distributed across subgroups and work-items.
+    This attribute is specified on a tensor descriptor when the descriptor is created. `LayoutAttr`
+    includes the following parameters, categorized into three groups:
+
+    ### Group 1:
+    * scope: Defines the scope of the code, which can be `wg` (workgroup), `sg` (subgroup),
+      or `lane` (work-item). It is mandatory for subgroup-level programming but optional
+      for workgroup and work-item levels. By default:
+        - If `sg_layout` is included, the layout is treated as workgroup level.
+        - If only `lane_layout` and `lane_data` are included, it is considered work-item level.
+
+    ### Group 2:
+    * sg_layout (optional): Specifies the total number of subgroups and their layout within a workgroup.
+      It is mandatory for workgroup-level programming. Its presence implies workgroup-level code, and
+      the scope must be empty or set to `wg`.
+    * sg_data (optional): Defines the data size accessed per subgroup. It must be used together with
+      `sg_layout` or left empty, in which case it can be derived from `lane_layout` and `lane_data` using the formula:
+      `sg_data[i] = lane_layout[i] * lane_data[i]`.
+    * order (optional): Specifies the dimension order used to linearize n-dimensional subgroup IDs to
+      1-dimensional IDs. The first dimension in the order list is the fastest-changing dimension.
+
+    ### Group 3:
+    * lane_layout (required): Specifies the total number of work-items and their layout within a subgroup.
+    * lane_data (required): Specifies the data size accessed per work-item for a single distribution.
 
     `lane_data[0] * lane_data[1]` can be greater than 1, indicating that each work item operates on multiple
     elements. These elements are eventually lowered to a "SIMT-flavor" vector, such as a SPIR-V vector or
     an LLVM vector, or packed into a storage data type. The multiple elements specified by lane_data must
     come from a single dimension and be contiguous in memory along either dimension.
 
-    E.g., #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    In this example, the subgroup consists of 16 work items arranged as lane_layout=[1, 16], with
-    each work item accessing a single element as defined by lane_data=[1, 1].
-
-    E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>
-    In this example, the layout represents a workgroup distribution. A workgroup consists of 8 subgroups
-    arranged in a 2x4 layout. Each subgroup accesses a 16x16 block per instruction, which is further
-    distributed to 16 work items as described above.
-
+    ### Examples:
+      1. Work-item level layout:
+      ```mlir
+      #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+      ```
+      In this example, the subgroup consists of 16 work items arranged as lane_layout=[1, 16], with
+      each work item accessing a single element as defined by lane_data=[1, 1].
+
+      2. Workgroup level layout:
+      ```mlir
+      #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>
+      ```
+      In this example, the layout represents a workgroup distribution. A workgroup consists of 8 subgroups
+      arranged in a 2x4 layout. Each subgroup accesses a 16x16 block per instruction, which is further
+      distributed to 16 work items as described above.
   }];
+
   let parameters = (ins
     OptionalParameter<"ScopeAttr">: $scope,
     OptionalParameter<"DenseI32ArrayAttr">: $sg_layout,
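As a quick sanity check of the arithmetic in the refined `LayoutAttr` description above, the following standalone C++ sketch (illustrative only, not part of the patch) derives `sg_data` from `lane_layout`/`lane_data` and computes the per-lane fragment for the documented workgroup-level example:

```c++
// Illustrative only: arithmetic implied by the LayoutAttr description above.
#include <array>
#include <cstdio>

int main() {
  std::array<int, 2> laneLayout = {1, 16}; // work-items per subgroup, per dim
  std::array<int, 2> laneData = {1, 1};    // elements per work-item, per dim

  // When sg_data is omitted: sg_data[i] = lane_layout[i] * lane_data[i].
  std::array<int, 2> sgData;
  for (int i = 0; i < 2; ++i)
    sgData[i] = laneLayout[i] * laneData[i];
  std::printf("derived sg_data = [%d, %d]\n", sgData[0], sgData[1]); // [1, 16]

  // With the explicit sg_data = [16, 16] from the workgroup-level example,
  // each work-item owns sg_data[i] / lane_layout[i] elements along dim i.
  std::array<int, 2> explicitSgData = {16, 16};
  std::printf("per-lane fragment = [%d, %d]\n",
              explicitSgData[0] / laneLayout[0],  // 16
              explicitSgData[1] / laneLayout[1]); // 1
  return 0;
}
```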

>From 0190418212529ec164a52f52582c1f77ecbd5c09 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 24 Mar 2025 16:27:02 +0000
Subject: [PATCH 17/45] refine util

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 7bb59796af36e..4afeef1427e8b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -239,7 +239,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
   let extraClassDeclaration = [{
     bool isForWorkgroupLevel() {
       if (!getScope())
-        return getSgLayout() && getSgData();
+        return getSgLayout() != nullptr;
       return getScope() == ScopeAttr::get(getContext(), Scope::WG);
     }
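In plain terms, the refined check treats any layout that carries `sg_layout` as workgroup level when no explicit scope is given, matching the doc change above where `sg_data` may be omitted and derived. A minimal standalone sketch of that rule (simplified types, illustrative only, not the actual MLIR API):

```c++
#include <optional>

enum class Scope { WG, SG, Lane };

// Mirrors the refined rule: an explicit scope wins; otherwise the mere
// presence of sg_layout marks workgroup-level code (sg_data may be derived).
bool isForWorkgroupLevel(std::optional<Scope> scope, bool hasSgLayout) {
  if (!scope)
    return hasSgLayout;
  return *scope == Scope::WG;
}
```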
 

>From 32f9272752c48ded0fa51c362fe2ed138614937b Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 24 Mar 2025 17:06:03 +0000
Subject: [PATCH 18/45] refine convert_layout docs

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 7188f74815943..41911ee1aa323 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -984,9 +984,12 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
 }
 
 def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
-    let summary = "Convert the sg layout of the input operand";
+    let summary = "Convert the layout of the input operand";
     let description = [{
-        convert_layout remaps the distribution of data across workitems by updating the LayoutAttr.
+      `convert_layout` adjusts the data distribution across subgroups and/or work-items by modifying
+      the `LayoutAttr`. Both `srcMap` and `resMap` must correspond to the same programming scope, such
+      as workgroup-level (wg) or subgroup-level (sg) code. This operation is not supported for
+      work-item-level code.
     }];
     let arguments = (ins XeGPU_Vector2DType: $source,
                          XeGPU_LayoutAttr: $srcMap,
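The legality rule stated in the new description (matching scopes, no work-item-level conversion) can be summarized with a small standalone sketch; the enum and function below are illustrative only and not the op's actual verifier:

```c++
enum class Scope { WG, SG, Lane };

// convert_layout remaps data across subgroups and/or work-items, so both
// layouts must describe the same scope, and lane-level code is rejected.
bool isLegalConvertLayout(Scope srcScope, Scope resScope) {
  if (srcScope != resScope)
    return false;                 // srcMap and resMap scopes must match
  return srcScope != Scope::Lane; // unsupported for work-item-level code
}
```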

>From fe11c7987a8822afec39f905cf4421496fef7b55 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 24 Mar 2025 18:25:59 +0000
Subject: [PATCH 19/45] save work

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 30 +++++++++++--------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 3ed3f462aa530..49e0935f88705 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -934,6 +934,16 @@ FailureOr<VectorType> getDistributedVectorType(xegpu::SGMapAttr sgMap,
   return newVectorType;
 }
 
+/// An operation can be sinked out of WarpExecuteOnLane0 if all ops in its
+/// use-def chain are already sinked.
+static bool canBeSinked(Operation *op) {
+  DenseSet<Operation *> visited;
+  visited.insert(op);
+  while (!visited.empty()) {
+  }
+  return true;
+}
+
 LogicalResult MoveFuncBodyToWarpExecuteOnLane0::matchAndRewrite(
     gpu::GPUFuncOp gpuFuncOp, PatternRewriter &rewriter) const {
   /// If the function already moved inside a warp_execute_on_lane0, skip.
@@ -1052,17 +1062,13 @@ SubgroupOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
   SmallVector<size_t> newRetIndices;
   gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
       rewriter, subgroupOp, /* new yielded values = */ loadOp.getTensorDesc(),
-      /* new yielded types = */ TypeRange{tensorDescTy}, newRetIndices);
+      /* new yielded types = */ tensorDescTy, newRetIndices);
 
   // Create a new load op outside the warp op with the distributed vector type.
   rewriter.setInsertionPointAfter(newWarpOp);
   auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
-      loadOp.getLoc(), newVectorType, loadOp.getTensorDesc(),
-      loadOp.getPackedAttr(), loadOp.getTransposeAttr(), loadOp.getL1HintAttr(),
-      loadOp.getL2HintAttr(), loadOp.getL3HintAttr());
-
-  newLoadOp.getTensorDescMutable().assign(
-      newWarpOp.getResult(newRetIndices[0]));
+      newWarpOp.getLoc(), newVectorType, newWarpOp->getResults()[0],
+      loadOp->getAttrs());
   Value distributedVal = newWarpOp.getResult(operandIdx);
   rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
   return success();
@@ -1219,13 +1225,13 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
   patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
   /// We want to avoid ops from hoisted out of the gpu.warp_execute_on_lane0
   /// region.
-  GreedyRewriteConfig config;
-  config.cseConstants = false;
-  config.fold = false;
-  (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
+  // GreedyRewriteConfig config;
+  // config.cseConstants = false;
+  // config.fold = false;
+  (void)applyPatternsGreedily(getOperation(), std::move(patterns));
 
   /// Finally, do the SIMD to SIMT distribution.
   patterns.clear();
   xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
-  (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
+  (void)applyPatternsGreedily(getOperation(), std::move(patterns));
 }
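The `canBeSinked` helper added above is still a placeholder (its worklist loop has no body yet). For readers following the intent of its comment, here is one possible shape of such a check; `canBeSunk` and `alreadySunk` are illustrative names, not part of the patch, and the sketch assumes the MLIR/LLVM headers already included in this file:

```c++
/// Illustrative only: "all producers in the use-def chain are already sunk",
/// phrased as an explicit worklist walk over defining ops.
static bool canBeSunk(Operation *op, const DenseSet<Operation *> &alreadySunk) {
  SmallVector<Operation *> worklist{op};
  DenseSet<Operation *> visited{op};
  while (!worklist.empty()) {
    Operation *current = worklist.pop_back_val();
    for (Value operand : current->getOperands()) {
      Operation *def = operand.getDefiningOp();
      if (!def)
        continue; // block arguments have nothing to sink
      if (!alreadySunk.contains(def))
        return false; // a producer has not been sunk out of the warp op yet
      if (visited.insert(def).second)
        worklist.push_back(def); // keep walking the use-def chain
    }
  }
  return true;
}
```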

>From 6e1ef3ea8324bc07b31f148325426541031604d2 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 24 Mar 2025 21:44:48 +0000
Subject: [PATCH 20/45] save work

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 52 ++++++++++++++-----
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 49e0935f88705..04ff165bb5313 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -13,9 +13,11 @@
 #include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
@@ -1220,18 +1222,44 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     signalPassFailure();
   /// Move all operations inside a GPU functions inside
   /// gpu.warp_execute_on_lane0
-
-  RewritePatternSet patterns(&getContext());
-  patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
-  /// We want to avoid ops from hoisted out of the gpu.warp_execute_on_lane0
-  /// region.
-  // GreedyRewriteConfig config;
-  // config.cseConstants = false;
-  // config.fold = false;
-  (void)applyPatternsGreedily(getOperation(), std::move(patterns));
-
+  {
+    RewritePatternSet patterns(&getContext());
+    patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+    /// We want to avoid ops being hoisted out of the gpu.warp_execute_on_lane0
+    /// region.
+    GreedyRewriteConfig config;
+    config.cseConstants = false;
+    config.fold = false;
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
+  }
   /// Finally, do the SIMD to SIMT distribution.
-  patterns.clear();
+  RewritePatternSet patterns(&getContext());
   xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
-  (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+  auto distributionFn = [](Value val) {
+    // Create an identity dim map of the same rank as the vector.
+    VectorType vecType = dyn_cast<VectorType>(val.getType());
+    int64_t vecRank = vecType ? vecType.getRank() : 0;
+    OpBuilder builder(val.getContext());
+    if (vecRank == 0)
+      return AffineMap::get(val.getContext());
+    return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
+  };
+  auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
+                      int64_t warpSz) {
+    assert((val.getType().isF32() || val.getType().isInteger(32)) &&
+           "unsupported shuffle type");
+    Type i32Type = builder.getIntegerType(32);
+    Value srcIdxI32 = builder.create<arith::IndexCastOp>(loc, i32Type, srcIdx);
+    Value warpSzI32 = builder.create<arith::ConstantOp>(
+        loc, builder.getIntegerAttr(i32Type, warpSz));
+    Value result = builder
+                       .create<gpu::ShuffleOp>(loc, val, srcIdxI32, warpSzI32,
+                                               gpu::ShuffleMode::IDX)
+                       .getResult(0);
+    return result;
+  };
+  vector::populatePropagateWarpVectorDistributionPatterns(
+      patterns, distributionFn, shuffleFn);
+  llvm::errs() << AffineMap::getMultiDimIdentityMap(2, &getContext()) << "\n";
+  // (void)applyPatternsGreedily(getOperation(), std::move(patterns));
 }

>From 55c272c367ad296631db90740ed736e1eb7ea1e4 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 25 Mar 2025 02:16:03 +0000
Subject: [PATCH 21/45] save work

---
 .../Vector/Transforms/VectorDistribution.h       |  4 ++++
 .../Vector/Transforms/VectorDistribute.cpp       | 16 +++++++++++-----
 .../lib/Dialect/Vector/TestVectorTransforms.cpp  |  2 ++
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
index dda45219b2acc..082d990cee8a4 100644
--- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
@@ -98,6 +98,10 @@ void populatePropagateWarpVectorDistributionPatterns(
     const WarpShuffleFromIdxFn &warpShuffleFromIdxFn,
     PatternBenefit benefit = 1, PatternBenefit readBenefit = 0);
 
+/// Patterns for simplification of WarpExecuteOnLane0Op during distribution.
+void populateWarpSimplificationPatterns(RewritePatternSet &pattern,
+                                        PatternBenefit benefit = 1);
+
 /// Lambda signature to compute a reduction of a distributed value for the given
 /// reduction kind and size.
 using DistributedReductionFn =
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index e214257de2cdf..f0d771142e307 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -1761,17 +1761,23 @@ void mlir::vector::populatePropagateWarpVectorDistributionPatterns(
     const WarpShuffleFromIdxFn &warpShuffleFromIdxFn, PatternBenefit benefit,
     PatternBenefit readBenefit) {
   patterns.add<WarpOpTransferRead>(patterns.getContext(), readBenefit);
-  patterns.add<WarpOpElementwise, WarpOpDeadResult, WarpOpBroadcast,
-               WarpOpShapeCast, WarpOpExtract, WarpOpForwardOperand,
-               WarpOpConstant, WarpOpExtractElement, WarpOpInsertElement,
-               WarpOpInsertScalar, WarpOpInsert, WarpOpCreateMask>(
-      patterns.getContext(), benefit);
+  patterns
+      .add<WarpOpElementwise, WarpOpBroadcast, WarpOpShapeCast, WarpOpExtract,
+           WarpOpConstant, WarpOpExtractElement, WarpOpInsertElement,
+           WarpOpInsertScalar, WarpOpInsert, WarpOpCreateMask>(
+          patterns.getContext(), benefit);
   patterns.add<WarpOpExtractScalar>(patterns.getContext(), warpShuffleFromIdxFn,
                                     benefit);
   patterns.add<WarpOpScfForOp>(patterns.getContext(), distributionMapFn,
                                benefit);
 }
 
+void mlir::vector::populateWarpSimplificationPatterns(
+    RewritePatternSet &patterns, PatternBenefit benefit) {
+  patterns.add<WarpOpDeadResult, WarpOpForwardOperand>(patterns.getContext(),
+                                                       benefit);
+}
+
 void mlir::vector::populateDistributeReduction(
     RewritePatternSet &patterns,
     const DistributedReductionFn &distributedReductionFn,
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index a54ae816570a8..feec10e6492f7 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -660,6 +660,7 @@ struct TestVectorDistribution
       vector::populatePropagateWarpVectorDistributionPatterns(
           patterns, distributionFn, shuffleFn, /*benefit=*/1,
           /*readBenefit=*/0);
+      vector::populateWarpSimplificationPatterns(patterns);
       vector::populateDistributeReduction(patterns, warpReduction, 1);
       populateDistributeTransferWriteOpPatterns(patterns, distributionFn, 2);
       (void)applyPatternsGreedily(getOperation(), std::move(patterns));
@@ -672,6 +673,7 @@ struct TestVectorDistribution
       RewritePatternSet patterns(ctx);
       vector::populatePropagateWarpVectorDistributionPatterns(
           patterns, distributionFn, shuffleFn);
+      vector::populateWarpSimplificationPatterns(patterns);
       vector::populateDistributeReduction(patterns, warpReduction);
       (void)applyPatternsGreedily(getOperation(), std::move(patterns));
     }
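With the simplification patterns split out, a client pass now opts into the cleanup explicitly. The fragment below mirrors the TestVectorDistribution change in this patch and is illustrative only; it assumes the surrounding pass context (`ctx`, `distributionFn`, `shuffleFn`, `warpReduction`) from that test:

```c++
RewritePatternSet patterns(ctx);
vector::populatePropagateWarpVectorDistributionPatterns(
    patterns, distributionFn, shuffleFn, /*benefit=*/1, /*readBenefit=*/0);
// Dead-result and forwarded-operand cleanup is now a separate, opt-in set.
vector::populateWarpSimplificationPatterns(patterns);
vector::populateDistributeReduction(patterns, warpReduction, /*benefit=*/1);
(void)applyPatternsGreedily(getOperation(), std::move(patterns));
```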

>From 1ffe5c8e7e988185e5089d05f22fe40d9f267914 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 26 Mar 2025 00:09:27 +0000
Subject: [PATCH 22/45] save work

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 207 ++++++++++--------
 1 file changed, 113 insertions(+), 94 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 04ff165bb5313..9252b0ca226ae 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -21,6 +21,7 @@
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeRange.h"
@@ -908,8 +909,9 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
 /// | 32x16                 | [1, 16]   | 32x1                     |
 /// | 32x16                 | [2, 8]    | 16x2                     |
 /// | 2x32x16               | [1, 16]   | 2x32x1                   |
-FailureOr<VectorType> getDistributedVectorType(xegpu::SGMapAttr sgMap,
-                                               VectorType originalType) {
+FailureOr<VectorType>
+getDistributedVecTypeBasedOnWiLayout(xegpu::SGMapAttr sgMap,
+                                     VectorType originalType) {
   llvm::SmallVector<int64_t, 2> distributedShape;
   if (!sgMap)
     return failure();
@@ -936,14 +938,30 @@ FailureOr<VectorType> getDistributedVectorType(xegpu::SGMapAttr sgMap,
   return newVectorType;
 }
 
-/// An operation can be sinked out of WarpExecuteOnLane0 if all ops in its
-/// use-def chain are already sinked.
-static bool canBeSinked(Operation *op) {
-  DenseSet<Operation *> visited;
-  visited.insert(op);
-  while (!visited.empty()) {
-  }
-  return true;
+static VectorType getDistributedVectorType(xegpu::SGMapAttr sgMap,
+                                           VectorType originalType) {
+  auto shape = originalType.getShape();
+  auto distVecTyOrFailure =
+      xegpu::TensorDescType::get(shape, originalType.getElementType(),
+                                 /*array_length=*/1, /*boundary_check=*/true,
+                                 /*memory_space=*/xegpu::MemorySpace::Global,
+                                 sgMap)
+          .getDistributedVectorType();
+  assert(llvm::succeeded(distVecTyOrFailure) &&
+         "Failed to compute distributed vector type for the given vector type");
+  return distVecTyOrFailure.value();
+}
+
+static Value reconcileDistribtedVecType(Value orig, VectorType expected,
+                                        PatternRewriter &rewriter) {
+  assert(isa<VectorType>(orig.getType()) && "expecting vector type");
+  auto origVecType = cast<VectorType>(orig.getType());
+  /// No need to reconcile if the types are the same.
+  if (origVecType == expected)
+    return orig;
+  auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
+                                                            expected, orig);
+  return castOp.getResult(0);
 }
 
 LogicalResult MoveFuncBodyToWarpExecuteOnLane0::matchAndRewrite(
@@ -1004,40 +1022,51 @@ SubgroupOpStoreNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
   if (storeOp.getTensorDescType().getShape().size() != 2)
     return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
 
-  auto distributedTypeOrFailure =
-      getDistributedVectorType(sgMap, storeOp.getValueType());
-  if (failed(distributedTypeOrFailure))
+  auto distriburtedTypeByWarpOp =
+      getDistributedVecTypeBasedOnWiLayout(sgMap, storeOp.getValueType());
+  if (failed(distriburtedTypeByWarpOp))
     return rewriter.notifyMatchFailure(storeOp,
                                        "Failed to distribute the type");
-  VectorType newVectorType = distributedTypeOrFailure.value();
+  VectorType distributedTypeByWarpOp = distriburtedTypeByWarpOp.value();
 
   SmallVector<size_t> newRetIndices;
   gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
       rewriter, subgroupOp,
       /* new yielded values = */
-      ValueRange{storeOp.getTensorDesc(), storeOp.getValue()},
+      ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
       /* new yielded types = */
-      TypeRange{storeOp.getTensorDescType(), newVectorType}, newRetIndices);
-
-  // Create a new store op outside the warp op with the distributed vector type.
-  // Tensor descriptor is not distributed.
+      TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
+      newRetIndices);
+  /// Create a new store op outside the warp op with the distributed vector
+  /// type. Tensor descriptor is not distributed.
   rewriter.setInsertionPointAfter(newWarpOp);
-  auto newStoreOp =
-      cast<xegpu::StoreNdOp>(rewriter.clone(*storeOp.getOperation()));
+  SmallVector<Value> newStoreOperands;
+
+  /// For the value operand, there can be a conflict between the vector type
+  /// distributed by the warp op and (xegpu-specific) distributed type supported
+  /// by the store op. We reconcile these mismatches by inserting a cast. These
+  /// get cancelled out later.
+  auto storeNdDistributedValueTyOrFailure =
+      storeOp.getTensorDescType().getDistributedVectorType();
+  if (failed(storeNdDistributedValueTyOrFailure))
+    return rewriter.notifyMatchFailure(
+        storeOp, "Failed to get distributed vector type for the store op");
+  newStoreOperands.push_back(reconcileDistribtedVecType(
+      newWarpOp.getResult(newRetIndices[0]),
+      storeNdDistributedValueTyOrFailure.value(), rewriter));
+  newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[1]));
+
+  rewriter.create<xegpu::StoreNdOp>(newWarpOp.getLoc(), TypeRange{},
+                                    newStoreOperands, storeOp->getAttrs());
   rewriter.eraseOp(storeOp);
-  newStoreOp.getTensorDescMutable().assign(
-      newWarpOp.getResult(newRetIndices[0]));
-  newStoreOp.getValueMutable().assign(newWarpOp.getResult(newRetIndices[1]));
-
   return success();
 }
 
 LogicalResult
 SubgroupOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
                                   PatternRewriter &rewriter) const {
-  OpOperand *operand = getWarpResult(subgroupOp, [](Operation *op) {
-    return isa<xegpu::LoadNdOp>(op) && op->hasOneUse();
-  });
+  OpOperand *operand =
+      getWarpResult(subgroupOp, llvm::IsaPred<xegpu::LoadNdOp>);
   if (!operand)
     return rewriter.notifyMatchFailure(subgroupOp,
                                        "warp result is not a xegpu::LoadNd op");
@@ -1049,29 +1078,31 @@ SubgroupOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
     return rewriter.notifyMatchFailure(
         loadOp, "the source tensor descriptor lacks sg_map attribute");
 
-  auto tensorDecShape = tensorDescTy.getShape();
-  if (tensorDecShape.size() != 2)
-    return rewriter.notifyMatchFailure(loadOp,
-                                       "unsupported tensor descriptor shape");
-
-  auto distributedTypeOrFailure =
-      getDistributedVectorType(sgMap, loadOp.getType());
-  if (failed(distributedTypeOrFailure))
-    return rewriter.notifyMatchFailure(loadOp, "Failed to distribute the type");
-  VectorType newVectorType = distributedTypeOrFailure.value();
-
   unsigned operandIdx = operand->getOperandNumber();
+  VectorType distributedTypeByWarpOp =
+      cast<VectorType>(subgroupOp.getResult(operandIdx).getType());
+
   SmallVector<size_t> newRetIndices;
   gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
       rewriter, subgroupOp, /* new yielded values = */ loadOp.getTensorDesc(),
       /* new yielded types = */ tensorDescTy, newRetIndices);
 
-  // Create a new load op outside the warp op with the distributed vector type.
+  /// Create a new load op outside the warp op with the distributed vector type.
   rewriter.setInsertionPointAfter(newWarpOp);
-  auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
-      newWarpOp.getLoc(), newVectorType, newWarpOp->getResults()[0],
-      loadOp->getAttrs());
+  auto loadNdDistValueTyOrFailure =
+      loadOp.getTensorDescType().getDistributedVectorType();
+  if (failed(loadNdDistValueTyOrFailure))
+    return rewriter.notifyMatchFailure(
+        loadOp, "Failed to get distributed vector type for the load op");
+  Value newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+      newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
+      newWarpOp->getResult(newRetIndices[0]), loadOp->getAttrs());
   Value distributedVal = newWarpOp.getResult(operandIdx);
+  /// There can be a conflict between the vector type distributed by the warp op
+  /// and (xegpu-specific) distributed type supported by the load op. We
+  /// reconcile these mismatches by inserting a cast.
+  newLoadOp =
+      reconcileDistribtedVecType(newLoadOp, distributedTypeByWarpOp, rewriter);
   rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
   return success();
 }
@@ -1079,10 +1110,8 @@ SubgroupOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
 LogicalResult
 SubgroupOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
                                         PatternRewriter &rewriter) const {
-  OpOperand *operand = getWarpResult(subgroupOp, [](Operation *op) {
-    return isa<xegpu::CreateNdDescOp>(op) && op->hasOneUse();
-  });
-
+  OpOperand *operand =
+      getWarpResult(subgroupOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
   if (!operand)
     return rewriter.notifyMatchFailure(
         subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
@@ -1131,10 +1160,7 @@ SubgroupOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
 LogicalResult
 SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
                                 PatternRewriter &rewriter) const {
-  OpOperand *operand = getWarpResult(subgroupOp, [](Operation *op) {
-    return isa<xegpu::DpasOp>(op) && op->hasOneUse();
-  });
-
+  OpOperand *operand = getWarpResult(subgroupOp, llvm::IsaPred<xegpu::DpasOp>);
   if (!operand)
     return rewriter.notifyMatchFailure(subgroupOp,
                                        "warp result is not a xegpu::Dpas op");
@@ -1148,28 +1174,29 @@ SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
     return rewriter.notifyMatchFailure(
         dpasOp, "the xegpu::Dpas op lacks sg_map attribute for A, B or output");
 
-  auto distributedLhsTypeOrFailure =
-      getDistributedVectorType(sgMapA, dpasOp.getLhsType());
-  auto distributedRhsTypeOrFailure =
-      getDistributedVectorType(sgMapB, dpasOp.getRhsType());
-  auto distributedResultTypeOrFailure =
-      getDistributedVectorType(sgMapOut, dpasOp.getResultType());
-  if (failed(distributedLhsTypeOrFailure) ||
-      failed(distributedRhsTypeOrFailure) ||
-      failed(distributedResultTypeOrFailure))
+  auto distLhsTypeByWarpOpOrFailure =
+      getDistributedVecTypeBasedOnWiLayout(sgMapA, dpasOp.getLhsType());
+  auto distRhsTypeByWarpOpOrFailure =
+      getDistributedVecTypeBasedOnWiLayout(sgMapB, dpasOp.getRhsType());
+  auto distResultTypeByWarpOpOrFailure =
+      getDistributedVecTypeBasedOnWiLayout(sgMapOut, dpasOp.getResultType());
+  if (failed(distLhsTypeByWarpOpOrFailure) ||
+      failed(distRhsTypeByWarpOpOrFailure) ||
+      failed(distResultTypeByWarpOpOrFailure))
     return rewriter.notifyMatchFailure(
         dpasOp,
         "Failed to distribute the A, B or output types in xegpu::Dpas op");
 
   llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(), dpasOp.getRhs()};
-  llvm::SmallVector<Type, 3> newYieldTypes{distributedLhsTypeOrFailure.value(),
-                                           distributedRhsTypeOrFailure.value()};
-  // Dpas acc operand is optional.
+  llvm::SmallVector<Type, 3> newYieldTypes{
+      distLhsTypeByWarpOpOrFailure.value(),
+      distRhsTypeByWarpOpOrFailure.value()};
+  /// Dpas acc operand is optional.
   if (dpasOp.getAcc()) {
     newYieldValues.push_back(dpasOp.getAcc());
-    newYieldTypes.push_back(distributedResultTypeOrFailure.value());
+    newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
   }
-  // Create a new warp op without the dpas.
+  /// Create a new warp op without the dpas.
   SmallVector<size_t> newRetIndices;
   gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
       rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
@@ -1177,13 +1204,30 @@ SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
   // Create a new dpas op outside the warp op.
   rewriter.setInsertionPointAfter(newWarpOp);
   SmallVector<Value> newDpasOperands;
+  SmallVector<VectorType> newDpasOperandExpectedTypes;
+  /// Reconcile the distributed types with the original types.
+  newDpasOperandExpectedTypes.push_back(
+      getDistributedVectorType(sgMapA, dpasOp.getLhsType()));
+  newDpasOperandExpectedTypes.push_back(
+      getDistributedVectorType(sgMapB, dpasOp.getRhsType()));
+  if (dpasOp.getAcc()) {
+    newDpasOperandExpectedTypes.push_back(
+        getDistributedVectorType(sgMapOut, dpasOp.getResultType()));
+  }
+
   for (auto i : newRetIndices) {
-    newDpasOperands.push_back(newWarpOp.getResult(i));
+    newDpasOperands.push_back(reconcileDistribtedVecType(
+        newWarpOp.getResult(i),
+        newDpasOperandExpectedTypes[newDpasOperands.size()], rewriter));
   }
   auto newDpasOp = rewriter.create<xegpu::DpasOp>(
-      newWarpOp->getLoc(), distributedResultTypeOrFailure.value(),
+      newWarpOp->getLoc(), distResultTypeByWarpOpOrFailure.value(),
       newDpasOperands, dpasOp->getAttrs());
   Value disributedVal = newWarpOp.getResult(operandIdx);
+  /// Reconcile the output type.
+  disributedVal = reconcileDistribtedVecType(
+      disributedVal, getDistributedVectorType(sgMapOut, dpasOp.getResultType()),
+      rewriter);
   rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
   return success();
 }
@@ -1235,31 +1279,6 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
   /// Finally, do the SIMD to SIMT distribution.
   RewritePatternSet patterns(&getContext());
   xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
-  auto distributionFn = [](Value val) {
-    // Create an identity dim map of the same rank as the vector.
-    VectorType vecType = dyn_cast<VectorType>(val.getType());
-    int64_t vecRank = vecType ? vecType.getRank() : 0;
-    OpBuilder builder(val.getContext());
-    if (vecRank == 0)
-      return AffineMap::get(val.getContext());
-    return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
-  };
-  auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
-                      int64_t warpSz) {
-    assert((val.getType().isF32() || val.getType().isInteger(32)) &&
-           "unsupported shuffle type");
-    Type i32Type = builder.getIntegerType(32);
-    Value srcIdxI32 = builder.create<arith::IndexCastOp>(loc, i32Type, srcIdx);
-    Value warpSzI32 = builder.create<arith::ConstantOp>(
-        loc, builder.getIntegerAttr(i32Type, warpSz));
-    Value result = builder
-                       .create<gpu::ShuffleOp>(loc, val, srcIdxI32, warpSzI32,
-                                               gpu::ShuffleMode::IDX)
-                       .getResult(0);
-    return result;
-  };
-  vector::populatePropagateWarpVectorDistributionPatterns(
-      patterns, distributionFn, shuffleFn);
-  llvm::errs() << AffineMap::getMultiDimIdentityMap(2, &getContext()) << "\n";
-  // (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+  vector::populateWarpSimplificationPatterns(patterns);
+  (void)applyPatternsGreedily(getOperation(), std::move(patterns));
 }
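The reconcile casts introduced in this patch rely on the usual property of back-to-back `unrealized_conversion_cast` ops: a cast from the xegpu-native distributed type to the warp-distributed type that immediately feeds the inverse cast is a no-op, which is what the "cancelled out later" comments refer to. A sketch of that folding condition (illustrative only, not the pass's actual cleanup; it assumes the MLIR headers already included in this file):

```c++
/// Illustrative only: recognize cast(cast(x : A -> B) : B -> A) and return x.
static Value foldReconcileCastPair(Value v) {
  auto outer = v.getDefiningOp<UnrealizedConversionCastOp>();
  if (!outer || outer->getNumOperands() != 1 || outer->getNumResults() != 1)
    return v;
  auto inner =
      outer->getOperand(0).getDefiningOp<UnrealizedConversionCastOp>();
  if (!inner || inner->getNumOperands() != 1 || inner->getNumResults() != 1)
    return v;
  // The inner cast's source type must match the outer cast's result type.
  if (inner->getOperand(0).getType() == outer->getResult(0).getType())
    return inner->getOperand(0);
  return v;
}
```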

>From e5521f93b89ea344a659e8d294ba45023cc34227 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 27 Mar 2025 19:28:51 +0000
Subject: [PATCH 23/45] save work before merging with Chao's PR

---
 .../Vector/Transforms/VectorDistribution.h    |   4 -
 .../Vector/Transforms/VectorDistribute.cpp    |  16 +-
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 690 +++++++++---------
 .../Dialect/Vector/TestVectorTransforms.cpp   |   2 -
 4 files changed, 348 insertions(+), 364 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
index 082d990cee8a4..dda45219b2acc 100644
--- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
@@ -98,10 +98,6 @@ void populatePropagateWarpVectorDistributionPatterns(
     const WarpShuffleFromIdxFn &warpShuffleFromIdxFn,
     PatternBenefit benefit = 1, PatternBenefit readBenefit = 0);
 
-/// Patterns for simplification of WarpExecuteOnLane0Op during distribution.
-void populateWarpSimplificationPatterns(RewritePatternSet &pattern,
-                                        PatternBenefit benefit = 1);
-
 /// Lambda signature to compute a reduction of a distributed value for the given
 /// reduction kind and size.
 using DistributedReductionFn =
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index f0d771142e307..e214257de2cdf 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -1761,23 +1761,17 @@ void mlir::vector::populatePropagateWarpVectorDistributionPatterns(
     const WarpShuffleFromIdxFn &warpShuffleFromIdxFn, PatternBenefit benefit,
     PatternBenefit readBenefit) {
   patterns.add<WarpOpTransferRead>(patterns.getContext(), readBenefit);
-  patterns
-      .add<WarpOpElementwise, WarpOpBroadcast, WarpOpShapeCast, WarpOpExtract,
-           WarpOpConstant, WarpOpExtractElement, WarpOpInsertElement,
-           WarpOpInsertScalar, WarpOpInsert, WarpOpCreateMask>(
-          patterns.getContext(), benefit);
+  patterns.add<WarpOpElementwise, WarpOpDeadResult, WarpOpBroadcast,
+               WarpOpShapeCast, WarpOpExtract, WarpOpForwardOperand,
+               WarpOpConstant, WarpOpExtractElement, WarpOpInsertElement,
+               WarpOpInsertScalar, WarpOpInsert, WarpOpCreateMask>(
+      patterns.getContext(), benefit);
   patterns.add<WarpOpExtractScalar>(patterns.getContext(), warpShuffleFromIdxFn,
                                     benefit);
   patterns.add<WarpOpScfForOp>(patterns.getContext(), distributionMapFn,
                                benefit);
 }
 
-void mlir::vector::populateWarpSimplificationPatterns(
-    RewritePatternSet &patterns, PatternBenefit benefit) {
-  patterns.add<WarpOpDeadResult, WarpOpForwardOperand>(patterns.getContext(),
-                                                       benefit);
-}
-
 void mlir::vector::populateDistributeReduction(
     RewritePatternSet &patterns,
     const DistributedReductionFn &distributedReductionFn,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 9252b0ca226ae..38d9fe6c88800 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -683,7 +683,6 @@ void attachLayoutAttributeToUsers(Value v, xegpu::SGMapAttr layout) {
 static LogicalResult
 attachLayoutAttributes(Operation *top,
                        llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
-  llvm::errs() << "op name : " << top->getName() << "\n";
   /// Helper to convert SGMap to xegpu::SGMapAttr.
   auto getSGMapForResult = [&](Value r) -> xegpu::SGMapAttr {
     auto layout = getPropagatedLayout(r);
@@ -759,6 +758,71 @@ namespace {
 /// SIMT Distribution Patterns
 ///===----------------------------------------------------------------------===///
 
+/// Returns the distributed vector type for a source vector type according to
+/// the wi_layout. We simply divide each dimension of tensor descriptor shape by
+/// corresponding wi_layout dimension. If array_length > 1, that is appended to
+/// the front of the distributed shape.
+/// Examples:
+/// | original vector shape | wi_layout | distributed vector shape |
+/// |-----------------------|-----------|--------------------------|
+/// | 32x16                 | [1, 16]   | 32x1                     |
+/// | 32x16                 | [2, 8]    | 16x2                     |
+/// | 2x32x16               | [1, 16]   | 2x32x1                   |
+FailureOr<VectorType>
+getDistributedVecTypeBasedOnWiLayout(xegpu::SGMapAttr sgMap,
+                                     VectorType originalType) {
+  llvm::SmallVector<int64_t, 2> distributedShape;
+  if (!sgMap)
+    return failure();
+
+  auto wiLayout = sgMap.getWiLayout();
+  assert((originalType.getRank() == 2 || originalType.getRank() == 3) &&
+         "expecting 2D or 3D shape for the original vector type");
+  assert(wiLayout.size() == 2 && "expecting 2D shape for the wi layout");
+  // Original type can be 2D or 3D (array_length > 1), the last two dims are the
+  // block shape.
+  auto blockShape = originalType.getShape().take_back(2);
+  // Check if the block vector shape can be distributed evenly.
+  if (blockShape[0] % wiLayout[0] != 0 || blockShape[1] % wiLayout[1] != 0)
+    return failure();
+
+  if (originalType.getRank() == 3) {
+    distributedShape.push_back(originalType.getShape()[0]);
+  }
+  for (unsigned i = 0; i < 2; ++i) {
+    distributedShape.push_back(blockShape[i] / wiLayout[i]);
+  }
+  auto newVectorType =
+      VectorType::get(distributedShape, originalType.getElementType());
+  return newVectorType;
+}
+
+static VectorType getDistributedVectorType(xegpu::SGMapAttr sgMap,
+                                           VectorType originalType) {
+  auto shape = originalType.getShape();
+  auto distVecTyOrFailure =
+      xegpu::TensorDescType::get(shape, originalType.getElementType(),
+                                 /*array_length=*/1, /*boundary_check=*/true,
+                                 /*memory_space=*/xegpu::MemorySpace::Global,
+                                 sgMap)
+          .getDistributedVectorType();
+  assert(llvm::succeeded(distVecTyOrFailure) &&
+         "Failed to compute distributed vector type for the given vector type");
+  return distVecTyOrFailure.value();
+}
+
+static Value reconcileDistribtedVecType(Value orig, VectorType expected,
+                                        PatternRewriter &rewriter) {
+  assert(isa<VectorType>(orig.getType()) && "expecting vector type");
+  auto origVecType = cast<VectorType>(orig.getType());
+  /// No need to reconcile if the types are the same.
+  if (origVecType == expected)
+    return orig;
+  auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
+                                                            expected, orig);
+  return castOp.getResult(0);
+}
+
 /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
 /// of the original GPUFuncOp to the new GPUFuncOp such that entire body is
 /// contained within a WarpExecuteOnLane0Op.
@@ -786,7 +850,48 @@ struct MoveFuncBodyToWarpExecuteOnLane0
     : public OpRewritePattern<gpu::GPUFuncOp> {
   using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
   LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
-                                PatternRewriter &rewriter) const override;
+                                PatternRewriter &rewriter) const override {
+    /// If the function only contains a single void return, skip.
+    if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+          return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
+        }))
+      return failure();
+    /// If the function already moved inside a warp_execute_on_lane0, skip.
+    if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+          return isa<gpu::WarpExecuteOnLane0Op>(op);
+        }))
+      return failure();
+    /// Create a new function with the same signature.
+    auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
+        gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
+    /// Create a WarpExecuteOnLane0Op with same arguments and results as the
+    /// original gpuFuncOp.
+    rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
+    auto laneId = rewriter.create<gpu::LaneIdOp>(
+        newGpuFunc.getLoc(), rewriter.getIndexType(),
+        /** upperBound = **/ mlir::IntegerAttr());
+    auto gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
+    auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
+        laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
+        newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
+    auto &warpBodyBlock = warpOp.getBodyRegion().front();
+    /// Replace the ReturnOp of the original gpu function with a YieldOp.
+    auto origRetunOp =
+        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
+    rewriter.setInsertionPointAfter(origRetunOp);
+    rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
+                                  origRetunOp.getOperands());
+    rewriter.eraseOp(origRetunOp);
+    /// Move the original function body to the WarpExecuteOnLane0Op body.
+    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
+                                warpOp.getBodyRegion().begin());
+    rewriter.eraseBlock(&warpBodyBlock);
+    /// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
+    rewriter.setInsertionPointAfter(warpOp);
+    rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
+    rewriter.replaceOp(gpuFuncOp, newGpuFunc);
+    return success();
+  }
 };
 
 /// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
@@ -823,7 +928,53 @@ struct MoveFuncBodyToWarpExecuteOnLane0
 struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
-                                PatternRewriter &rewriter) const override;
+                                PatternRewriter &rewriter) const override {
+    OpOperand *operand =
+        getWarpResult(subgroupOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
+    if (!operand)
+      return rewriter.notifyMatchFailure(
+          subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
+    auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+    unsigned operandIdx = operand->getOperandNumber();
+
+    auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
+    if (!srcTypedVal)
+      return rewriter.notifyMatchFailure(
+          descOp, "expecting a memref typed value as the source");
+
+    auto descOffsets = descOp.getMixedOffsets();
+
+    xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
+    if (!sgMap)
+      return rewriter.notifyMatchFailure(
+          descOp, "the tensor descriptor lacks sg_map attribute");
+
+    SmallVector<size_t> newRetIndices;
+    SmallVector<Value> newYieldValues;
+    SmallVector<Type> newYieldTypes;
+
+    for (auto arg : descOp->getOperands()) {
+      newYieldValues.push_back(arg);
+      newYieldTypes.push_back(arg.getType());
+    }
+    rewriter.setInsertionPoint(subgroupOp);
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp, /* new yielded values = */ newYieldValues,
+        /* new yielded types = */ newYieldTypes, newRetIndices);
+
+    SmallVector<Value> newDescOperands;
+    for (auto i : newRetIndices) {
+      newDescOperands.push_back(newWarpOp.getResult(i));
+    }
+    rewriter.setInsertionPointAfter(newWarpOp);
+    auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+        newWarpOp.getLoc(), descOp.getType(), newDescOperands,
+        descOp->getAttrs());
+
+    Value distributedVal = newWarpOp.getResult(operandIdx);
+    rewriter.replaceAllUsesWith(distributedVal, newDescOp);
+    return success();
+  }
 };
 
 /// Sink a store_nd op at the end of enclosing `gpu.warp_execute_on_lane_0`. In
@@ -853,7 +1004,62 @@ struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
 struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
-                                PatternRewriter &rewriter) const override;
+                                PatternRewriter &rewriter) const override {
+    auto yield = cast<gpu::YieldOp>(
+        subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
+    Operation *lastNode = yield->getPrevNode();
+    auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
+    if (!storeOp)
+      return failure();
+
+    auto tensorDescTy = storeOp.getTensorDescType();
+    xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
+    if (!sgMap)
+      return rewriter.notifyMatchFailure(
+          storeOp, "the source tensor descriptor lacks sg_map attribute");
+
+    if (storeOp.getTensorDescType().getShape().size() != 2)
+      return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
+
+    auto distriburtedTypeByWarpOp =
+        getDistributedVecTypeBasedOnWiLayout(sgMap, storeOp.getValueType());
+    if (failed(distriburtedTypeByWarpOp))
+      return rewriter.notifyMatchFailure(storeOp,
+                                         "Failed to distribute the type");
+    VectorType distributedTypeByWarpOp = distriburtedTypeByWarpOp.value();
+
+    SmallVector<size_t> newRetIndices;
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp,
+        /* new yielded values = */
+        ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
+        /* new yielded types = */
+        TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
+        newRetIndices);
+    /// Create a new store op outside the warp op with the distributed vector
+    /// type. Tensor descriptor is not distributed.
+    rewriter.setInsertionPointAfter(newWarpOp);
+    SmallVector<Value> newStoreOperands;
+
+    /// For the value operand, there can be a conflict between the vector type
+    /// distributed by the warp op and the (xegpu-specific) distributed type
+    /// supported by the store op. We reconcile these mismatches by inserting a
+    /// cast. These casts get cancelled out later.
+    auto storeNdDistributedValueTyOrFailure =
+        storeOp.getTensorDescType().getDistributedVectorType();
+    if (failed(storeNdDistributedValueTyOrFailure))
+      return rewriter.notifyMatchFailure(
+          storeOp, "Failed to get distributed vector type for the store op");
+    newStoreOperands.push_back(reconcileDistribtedVecType(
+        newWarpOp.getResult(newRetIndices[0]),
+        storeNdDistributedValueTyOrFailure.value(), rewriter));
+    newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[1]));
+
+    rewriter.create<xegpu::StoreNdOp>(newWarpOp.getLoc(), TypeRange{},
+                                      newStoreOperands, storeOp->getAttrs());
+    rewriter.eraseOp(storeOp);
+    return success();
+  }
 };
 
 /// Clone a load_nd feeding into vector.yield op for the enclosing
@@ -888,349 +1094,132 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
 struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
-                                PatternRewriter &rewriter) const override;
+                                PatternRewriter &rewriter) const override {
+    OpOperand *operand =
+        getWarpResult(subgroupOp, llvm::IsaPred<xegpu::LoadNdOp>);
+    if (!operand)
+      return rewriter.notifyMatchFailure(
+          subgroupOp, "warp result is not a xegpu::LoadNd op");
+
+    auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+    xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
+    xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
+    if (!sgMap)
+      return rewriter.notifyMatchFailure(
+          loadOp, "the source tensor descriptor lacks sg_map attribute");
+
+    unsigned operandIdx = operand->getOperandNumber();
+    VectorType distributedTypeByWarpOp =
+        cast<VectorType>(subgroupOp.getResult(operandIdx).getType());
+
+    SmallVector<size_t> newRetIndices;
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp, /* new yielded values = */ loadOp.getTensorDesc(),
+        /* new yielded types = */ tensorDescTy, newRetIndices);
+
+    /// Create a new load op outside the warp op with the distributed vector
+    /// type.
+    rewriter.setInsertionPointAfter(newWarpOp);
+    auto loadNdDistValueTyOrFailure =
+        loadOp.getTensorDescType().getDistributedVectorType();
+    if (failed(loadNdDistValueTyOrFailure))
+      return rewriter.notifyMatchFailure(
+          loadOp, "Failed to get distributed vector type for the load op");
+    Value newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+        newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
+        newWarpOp->getResult(newRetIndices[0]), loadOp->getAttrs());
+    Value distributedVal = newWarpOp.getResult(operandIdx);
+    /// There can be a conflict between the vector type distributed by the warp
+    /// op and the (xegpu-specific) distributed type supported by the load op.
+    /// We reconcile these mismatches by inserting a cast.
+    newLoadOp = reconcileDistribtedVecType(newLoadOp, distributedTypeByWarpOp,
+                                           rewriter);
+    rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
+    return success();
+  }
 };
 
 struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
-                                PatternRewriter &rewriter) const override;
-};
-
-} // namespace
-
-/// Returns the distributed vector type for a source vector type according to
-/// the wi_layout. We simply divide each dimension of tensor descriptor shape by
-/// corresponding wi_layout dimension. If array_length > 1, that is appended to
-/// the front of the disributed shape.
-/// Examples:
-/// | original vector shape | wi_layout | distributed vector shape |
-/// |-----------------------|-----------|--------------------------|
-/// | 32x16                 | [1, 16]   | 32x1                     |
-/// | 32x16                 | [2, 8]    | 16x2                     |
-/// | 2x32x16               | [1, 16]   | 2x32x1                   |
-FailureOr<VectorType>
-getDistributedVecTypeBasedOnWiLayout(xegpu::SGMapAttr sgMap,
-                                     VectorType originalType) {
-  llvm::SmallVector<int64_t, 2> distributedShape;
-  if (!sgMap)
-    return failure();
-
-  auto wiLayout = sgMap.getWiLayout();
-  assert((originalType.getRank() == 2 || originalType.getRank() == 3) &&
-         "expecting 2D or 3D shape for the original vector type");
-  assert(wiLayout.size() == 2 && "expecting 2D shape for the wi layout");
-  // Original type can be 2D or 3D (array_length > 1), the last two dims are the
-  // block shape.
-  auto blockShape = originalType.getShape().take_back(2);
-  // Check if the block vector shape can be distributed evenly.
-  if (blockShape[0] % wiLayout[0] != 0 || blockShape[1] % wiLayout[1] != 0)
-    return failure();
-
-  if (originalType.getRank() == 3) {
-    distributedShape.push_back(originalType.getShape()[0]);
-  }
-  for (unsigned i = 0; i < 2; ++i) {
-    distributedShape.push_back(blockShape[i] / wiLayout[i]);
-  }
-  auto newVectorType =
-      VectorType::get(distributedShape, originalType.getElementType());
-  return newVectorType;
-}
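
To make the table above concrete, here is a small standalone C++ sketch (plain C++ only, no MLIR types) that mirrors the same shape arithmetic; it assumes, as described above, that the last two dimensions form the block shape and a leading array_length dimension is carried through unchanged, and it asserts instead of returning failure for shapes that do not divide evenly.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Divide the trailing block shape by the lane (wi) layout; a leading
// array_length dimension (rank-3 case) is kept in front unchanged.
static std::vector<int64_t> distributeShape(std::vector<int64_t> shape,
                                            std::vector<int64_t> wiLayout) {
  assert((shape.size() == 2 || shape.size() == 3) && wiLayout.size() == 2);
  std::vector<int64_t> result;
  if (shape.size() == 3)
    result.push_back(shape[0]); // array_length stays in front.
  size_t offset = shape.size() - 2;
  for (size_t i = 0; i < 2; ++i) {
    assert(shape[offset + i] % wiLayout[i] == 0 && "not evenly distributable");
    result.push_back(shape[offset + i] / wiLayout[i]);
  }
  return result;
}

int main() {
  for (int64_t d : distributeShape({32, 16}, {1, 16}))
    std::cout << d << ' '; // 32 1
  std::cout << '\n';
  for (int64_t d : distributeShape({32, 16}, {2, 8}))
    std::cout << d << ' '; // 16 2
  std::cout << '\n';
  for (int64_t d : distributeShape({2, 32, 16}, {1, 16}))
    std::cout << d << ' '; // 2 32 1
  std::cout << '\n';
  return 0;
}
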
-
-static VectorType getDistributedVectorType(xegpu::SGMapAttr sgMap,
-                                           VectorType originalType) {
-  auto shape = originalType.getShape();
-  auto distVecTyOrFailure =
-      xegpu::TensorDescType::get(shape, originalType.getElementType(),
-                                 /*array_length=*/1, /*boundary_check=*/true,
-                                 /*memory_space=*/xegpu::MemorySpace::Global,
-                                 sgMap)
-          .getDistributedVectorType();
-  assert(llvm::succeeded(distVecTyOrFailure) &&
-         "Failed to compute distributed vector type for the given vector type");
-  return distVecTyOrFailure.value();
-}
-
-static Value reconcileDistribtedVecType(Value orig, VectorType expected,
-                                        PatternRewriter &rewriter) {
-  assert(isa<VectorType>(orig.getType()) && "expecting vector type");
-  auto origVecType = cast<VectorType>(orig.getType());
-  /// No need to reconcile if the types are the same.
-  if (origVecType == expected)
-    return orig;
-  auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
-                                                            expected, orig);
-  return castOp.getResult(0);
-}
-
-LogicalResult MoveFuncBodyToWarpExecuteOnLane0::matchAndRewrite(
-    gpu::GPUFuncOp gpuFuncOp, PatternRewriter &rewriter) const {
-  /// If the function already moved inside a warp_execute_on_lane0, skip.
-  if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
-        return isa<gpu::WarpExecuteOnLane0Op>(op);
-      }))
-    return failure();
-  /// Create a new function with the same signature.
-  auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
-      gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
-  /// Create a WarpExecuteOnLane0Op with same arguments and results as the
-  /// original gpuFuncOp.
-  rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
-  auto laneId = rewriter.create<gpu::LaneIdOp>(
-      newGpuFunc.getLoc(), rewriter.getIndexType(),
-      /** upperBound = **/ mlir::IntegerAttr());
-  auto gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
-  auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
-      laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
-      newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
-  auto &warpBodyBlock = warpOp.getBodyRegion().front();
-  /// Replace the ReturnOp of the original gpu function with a YieldOp.
-  auto origRetunOp =
-      cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
-  rewriter.setInsertionPointAfter(origRetunOp);
-  rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
-                                origRetunOp.getOperands());
-  rewriter.eraseOp(origRetunOp);
-  /// Move the original function body to the WarpExecuteOnLane0Op body.
-  rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
-                              warpOp.getBodyRegion().begin());
-  rewriter.eraseBlock(&warpBodyBlock);
-  /// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
-  rewriter.setInsertionPointAfter(warpOp);
-  rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
-  rewriter.replaceOp(gpuFuncOp, newGpuFunc);
-  return success();
-}
-
-LogicalResult
-SubgroupOpStoreNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
-                                   PatternRewriter &rewriter) const {
-  auto yield = cast<gpu::YieldOp>(
-      subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
-  Operation *lastNode = yield->getPrevNode();
-  auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
-  if (!storeOp)
-    return failure();
-
-  auto tensorDescTy = storeOp.getTensorDescType();
-  xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
-  if (!sgMap)
-    return rewriter.notifyMatchFailure(
-        storeOp, "the source tensor descriptor lacks sg_map attribute");
-
-  if (storeOp.getTensorDescType().getShape().size() != 2)
-    return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
-
-  auto distriburtedTypeByWarpOp =
-      getDistributedVecTypeBasedOnWiLayout(sgMap, storeOp.getValueType());
-  if (failed(distriburtedTypeByWarpOp))
-    return rewriter.notifyMatchFailure(storeOp,
-                                       "Failed to distribute the type");
-  VectorType distributedTypeByWarpOp = distriburtedTypeByWarpOp.value();
-
-  SmallVector<size_t> newRetIndices;
-  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-      rewriter, subgroupOp,
-      /* new yielded values = */
-      ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
-      /* new yielded types = */
-      TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
-      newRetIndices);
-  /// Create a new store op outside the warp op with the distributed vector
-  /// type. Tensor descriptor is not distributed.
-  rewriter.setInsertionPointAfter(newWarpOp);
-  SmallVector<Value> newStoreOperands;
-
-  /// For the value operand, there can be a conflict between the vector type
-  /// distributed by the warp op and (xegpu-specific) distributed type supported
-  /// by the store op. We reconcile these mismatches by inserting a cast. These
-  /// gets cancelled out later.
-  auto storeNdDistributedValueTyOrFailure =
-      storeOp.getTensorDescType().getDistributedVectorType();
-  if (failed(storeNdDistributedValueTyOrFailure))
-    return rewriter.notifyMatchFailure(
-        storeOp, "Failed to get distributed vector type for the store op");
-  newStoreOperands.push_back(reconcileDistribtedVecType(
-      newWarpOp.getResult(newRetIndices[0]),
-      storeNdDistributedValueTyOrFailure.value(), rewriter));
-  newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[1]));
-
-  rewriter.create<xegpu::StoreNdOp>(newWarpOp.getLoc(), TypeRange{},
-                                    newStoreOperands, storeOp->getAttrs());
-  rewriter.eraseOp(storeOp);
-  return success();
-}
-
-LogicalResult
-SubgroupOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
-                                  PatternRewriter &rewriter) const {
-  OpOperand *operand =
-      getWarpResult(subgroupOp, llvm::IsaPred<xegpu::LoadNdOp>);
-  if (!operand)
-    return rewriter.notifyMatchFailure(subgroupOp,
-                                       "warp result is not a xegpu::LoadNd op");
-
-  auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
-  xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
-  xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
-  if (!sgMap)
-    return rewriter.notifyMatchFailure(
-        loadOp, "the source tensor descriptor lacks sg_map attribute");
-
-  unsigned operandIdx = operand->getOperandNumber();
-  VectorType distributedTypeByWarpOp =
-      cast<VectorType>(subgroupOp.getResult(operandIdx).getType());
-
-  SmallVector<size_t> newRetIndices;
-  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-      rewriter, subgroupOp, /* new yielded values = */ loadOp.getTensorDesc(),
-      /* new yielded types = */ tensorDescTy, newRetIndices);
-
-  /// Create a new load op outside the warp op with the distributed vector type.
-  rewriter.setInsertionPointAfter(newWarpOp);
-  auto loadNdDistValueTyOrFailure =
-      loadOp.getTensorDescType().getDistributedVectorType();
-  if (failed(loadNdDistValueTyOrFailure))
-    return rewriter.notifyMatchFailure(
-        loadOp, "Failed to get distributed vector type for the load op");
-  Value newLoadOp = rewriter.create<xegpu::LoadNdOp>(
-      newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
-      newWarpOp->getResult(newRetIndices[0]), loadOp->getAttrs());
-  Value distributedVal = newWarpOp.getResult(operandIdx);
-  /// There can be a conflict between the vector type distributed by the warp op
-  /// and (xegpu-specific) distributed type supported by the load op. We
-  /// reconcile these mismatches by inserting a cast.
-  newLoadOp =
-      reconcileDistribtedVecType(newLoadOp, distributedTypeByWarpOp, rewriter);
-  rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
-  return success();
-}
-
-LogicalResult
-SubgroupOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
-                                        PatternRewriter &rewriter) const {
-  OpOperand *operand =
-      getWarpResult(subgroupOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
-  if (!operand)
-    return rewriter.notifyMatchFailure(
-        subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
-  auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
-  unsigned operandIdx = operand->getOperandNumber();
-
-  auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
-  if (!srcTypedVal)
-    return rewriter.notifyMatchFailure(
-        descOp, "expecting a memref typed value as the source");
-
-  auto descOffsets = descOp.getMixedOffsets();
-
-  xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
-  if (!sgMap)
-    return rewriter.notifyMatchFailure(
-        descOp, "the tensor descriptor lacks sg_map attribute");
-
-  SmallVector<size_t> newRetIndices;
-  SmallVector<Value> newYieldValues;
-  SmallVector<Type> newYieldTypes;
-
-  for (auto arg : descOp->getOperands()) {
-    newYieldValues.push_back(arg);
-    newYieldTypes.push_back(arg.getType());
-  }
-  rewriter.setInsertionPoint(subgroupOp);
-  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-      rewriter, subgroupOp, /* new yieled values = */ newYieldValues,
-      /* new yielded types = */ newYieldTypes, newRetIndices);
-
-  SmallVector<Value> newDescOperands;
-  for (auto i : newRetIndices) {
-    newDescOperands.push_back(newWarpOp.getResult(i));
-  }
-  rewriter.setInsertionPointAfter(newWarpOp);
-  auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
-      newWarpOp.getLoc(), descOp.getType(), newDescOperands,
-      descOp->getAttrs());
-
-  Value distributedVal = newWarpOp.getResult(operandIdx);
-  rewriter.replaceAllUsesWith(distributedVal, newDescOp);
-  return success();
-}
-
-LogicalResult
-SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
-                                PatternRewriter &rewriter) const {
-  OpOperand *operand = getWarpResult(subgroupOp, llvm::IsaPred<xegpu::DpasOp>);
-  if (!operand)
-    return rewriter.notifyMatchFailure(subgroupOp,
-                                       "warp result is not a xegpu::Dpas op");
-
-  auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
-  unsigned operandIdx = operand->getOperandNumber();
-  xegpu::SGMapAttr sgMapA = dpasOp.getSgMapAAttr();
-  xegpu::SGMapAttr sgMapB = dpasOp.getSgMapBAttr();
-  xegpu::SGMapAttr sgMapOut = dpasOp->getAttrOfType<xegpu::SGMapAttr>("r0");
-  if (!sgMapA || !sgMapB || !sgMapOut)
-    return rewriter.notifyMatchFailure(
-        dpasOp, "the xegpu::Dpas op lacks sg_map attribute for A, B or output");
-
-  auto distLhsTypeByWarpOpOrFailure =
-      getDistributedVecTypeBasedOnWiLayout(sgMapA, dpasOp.getLhsType());
-  auto distRhsTypeByWarpOpOrFailure =
-      getDistributedVecTypeBasedOnWiLayout(sgMapB, dpasOp.getRhsType());
-  auto distResultTypeByWarpOpOrFailure =
-      getDistributedVecTypeBasedOnWiLayout(sgMapOut, dpasOp.getResultType());
-  if (failed(distLhsTypeByWarpOpOrFailure) ||
-      failed(distRhsTypeByWarpOpOrFailure) ||
-      failed(distResultTypeByWarpOpOrFailure))
-    return rewriter.notifyMatchFailure(
-        dpasOp,
-        "Failed to distribute the A, B or output types in xegpu::Dpas op");
-
-  llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(), dpasOp.getRhs()};
-  llvm::SmallVector<Type, 3> newYieldTypes{
-      distLhsTypeByWarpOpOrFailure.value(),
-      distRhsTypeByWarpOpOrFailure.value()};
-  /// Dpas acc operand is optional.
-  if (dpasOp.getAcc()) {
-    newYieldValues.push_back(dpasOp.getAcc());
-    newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
-  }
-  /// Create a new warp op without the dpas.
-  SmallVector<size_t> newRetIndices;
-  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-      rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
-
-  // Create a new dpas op outside the warp op.
-  rewriter.setInsertionPointAfter(newWarpOp);
-  SmallVector<Value> newDpasOperands;
-  SmallVector<VectorType> newDpasOperandExpectedTypes;
-  /// Reconcile the distributed types with the original types.
-  newDpasOperandExpectedTypes.push_back(
-      getDistributedVectorType(sgMapA, dpasOp.getLhsType()));
-  newDpasOperandExpectedTypes.push_back(
-      getDistributedVectorType(sgMapB, dpasOp.getRhsType()));
-  if (dpasOp.getAcc()) {
+                                PatternRewriter &rewriter) const override {
+    OpOperand *operand =
+        getWarpResult(subgroupOp, llvm::IsaPred<xegpu::DpasOp>);
+    if (!operand)
+      return rewriter.notifyMatchFailure(subgroupOp,
+                                         "warp result is not a xegpu::Dpas op");
+
+    auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
+    unsigned operandIdx = operand->getOperandNumber();
+    xegpu::SGMapAttr sgMapA = dpasOp.getSgMapAAttr();
+    xegpu::SGMapAttr sgMapB = dpasOp.getSgMapBAttr();
+    xegpu::SGMapAttr sgMapOut = dpasOp->getAttrOfType<xegpu::SGMapAttr>("r0");
+    if (!sgMapA || !sgMapB || !sgMapOut)
+      return rewriter.notifyMatchFailure(
+          dpasOp,
+          "the xegpu::Dpas op lacks sg_map attribute for A, B or output");
+
+    auto distLhsTypeByWarpOpOrFailure =
+        getDistributedVecTypeBasedOnWiLayout(sgMapA, dpasOp.getLhsType());
+    auto distRhsTypeByWarpOpOrFailure =
+        getDistributedVecTypeBasedOnWiLayout(sgMapB, dpasOp.getRhsType());
+    auto distResultTypeByWarpOpOrFailure =
+        getDistributedVecTypeBasedOnWiLayout(sgMapOut, dpasOp.getResultType());
+    if (failed(distLhsTypeByWarpOpOrFailure) ||
+        failed(distRhsTypeByWarpOpOrFailure) ||
+        failed(distResultTypeByWarpOpOrFailure))
+      return rewriter.notifyMatchFailure(
+          dpasOp,
+          "Failed to distribute the A, B or output types in xegpu::Dpas op");
+
+    llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
+                                               dpasOp.getRhs()};
+    llvm::SmallVector<Type, 3> newYieldTypes{
+        distLhsTypeByWarpOpOrFailure.value(),
+        distRhsTypeByWarpOpOrFailure.value()};
+    /// Dpas acc operand is optional.
+    if (dpasOp.getAcc()) {
+      newYieldValues.push_back(dpasOp.getAcc());
+      newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
+    }
+    /// Create a new warp op without the dpas.
+    SmallVector<size_t> newRetIndices;
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
+
+    // Create a new dpas op outside the warp op.
+    rewriter.setInsertionPointAfter(newWarpOp);
+    SmallVector<Value> newDpasOperands;
+    SmallVector<VectorType> newDpasOperandExpectedTypes;
+    /// Reconcile the distributed types with the original types.
     newDpasOperandExpectedTypes.push_back(
-        getDistributedVectorType(sgMapOut, dpasOp.getResultType()));
-  }
+        getDistributedVectorType(sgMapA, dpasOp.getLhsType()));
+    newDpasOperandExpectedTypes.push_back(
+        getDistributedVectorType(sgMapB, dpasOp.getRhsType()));
+    if (dpasOp.getAcc()) {
+      newDpasOperandExpectedTypes.push_back(
+          getDistributedVectorType(sgMapOut, dpasOp.getResultType()));
+    }
 
-  for (auto i : newRetIndices) {
-    newDpasOperands.push_back(reconcileDistribtedVecType(
-        newWarpOp.getResult(i),
-        newDpasOperandExpectedTypes[newDpasOperands.size()], rewriter));
+    for (auto i : newRetIndices) {
+      newDpasOperands.push_back(reconcileDistribtedVecType(
+          newWarpOp.getResult(i),
+          newDpasOperandExpectedTypes[newDpasOperands.size()], rewriter));
+    }
+    auto newDpasOp = rewriter.create<xegpu::DpasOp>(
+        newWarpOp->getLoc(), distResultTypeByWarpOpOrFailure.value(),
+        newDpasOperands, dpasOp->getAttrs());
+    Value distributedVal = newWarpOp.getResult(operandIdx);
+    /// Reconcile the output type.
+    distributedVal = reconcileDistribtedVecType(
+        distributedVal,
+        getDistributedVectorType(sgMapOut, dpasOp.getResultType()), rewriter);
+    rewriter.replaceAllUsesWith(distributedVal, newDpasOp);
+    return success();
   }
-  auto newDpasOp = rewriter.create<xegpu::DpasOp>(
-      newWarpOp->getLoc(), distResultTypeByWarpOpOrFailure.value(),
-      newDpasOperands, dpasOp->getAttrs());
-  Value disributedVal = newWarpOp.getResult(operandIdx);
-  /// Reconile the output type.
-  disributedVal = reconcileDistribtedVecType(
-      disributedVal, getDistributedVectorType(sgMapOut, dpasOp.getResultType()),
-      rewriter);
-  rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
-  return success();
-}
+};
+
+} // namespace
 
 namespace {
 struct XeGPUSubgroupDistributePass final
@@ -1265,20 +1254,27 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
   if (failed(resolveLayoutConflicts(getOperation())))
     signalPassFailure();
   /// Move all operations inside GPU functions into
-  /// gpu.warp_execute_on_lane0
+  /// gpu.warp_execute_on_lane0.
+  /// We want to prevent ops from being hoisted out of the
+  /// gpu.warp_execute_on_lane0 region.
+  // GreedyRewriteConfig config;
+  // config.cseConstants = false;
+  // config.fold = false;
+  // config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled;
   {
     RewritePatternSet patterns(&getContext());
     patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
-    /// We want to avoid ops from hoisted out of the gpu.warp_execute_on_lane0
-    /// region.
-    GreedyRewriteConfig config;
-    config.cseConstants = false;
-    config.fold = false;
-    (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
+
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
   }
   /// Finally, do the SIMD to SIMT distribution.
   RewritePatternSet patterns(&getContext());
   xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
-  vector::populateWarpSimplificationPatterns(patterns);
+  /// TODO: distributionFn and shuffleFn are not used at this point.
+  auto distributionFn = [](Value val) { return AffineMap(); };
+  auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
+                      int64_t warpSz) { return Value(); };
+  vector::populatePropagateWarpVectorDistributionPatterns(
+      patterns, distributionFn, shuffleFn);
   (void)applyPatternsGreedily(getOperation(), std::move(patterns));
 }
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index feec10e6492f7..a54ae816570a8 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -660,7 +660,6 @@ struct TestVectorDistribution
       vector::populatePropagateWarpVectorDistributionPatterns(
           patterns, distributionFn, shuffleFn, /*benefit=*/1,
           /*readBenefit=*/0);
-      vector::populateWarpSimplificationPatterns(patterns);
       vector::populateDistributeReduction(patterns, warpReduction, 1);
       populateDistributeTransferWriteOpPatterns(patterns, distributionFn, 2);
       (void)applyPatternsGreedily(getOperation(), std::move(patterns));
@@ -673,7 +672,6 @@ struct TestVectorDistribution
       RewritePatternSet patterns(ctx);
       vector::populatePropagateWarpVectorDistributionPatterns(
           patterns, distributionFn, shuffleFn);
-      vector::populateWarpSimplificationPatterns(patterns);
       vector::populateDistributeReduction(patterns, warpReduction);
       (void)applyPatternsGreedily(getOperation(), std::move(patterns));
     }

>From 5700c8149354b94e05d3570de0dcea32d51039c4 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 29 Mar 2025 03:43:26 +0000
Subject: [PATCH 24/45] merge xegpu changes

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  7 ++
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 11 ++++
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 65 ++++++++++---------
 3 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 4afeef1427e8b..2baf34550dc38 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -254,6 +254,13 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
     }
   }];
 
+  let builders = [
+    AttrBuilder<(ins
+      "ArrayRef<int>": $lane_layout,
+      "ArrayRef<int>": $lane_data
+    )>
+  ];
+
   let assemblyFormat = "`<` struct(params) `>`";
   let genVerifyDecl = 1;
 }
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 6ef1a2deebcab..946a3961aa5c1 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -8,7 +8,9 @@
 
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/TypeSwitch.h"
 
 namespace mlir {
@@ -113,6 +115,15 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
   return success();
 }
 
+LayoutAttr LayoutAttr::get(mlir::MLIRContext *context, ArrayRef<int> laneLayout,
+                           ArrayRef<int> laneData) {
+  return Base::get(context, ScopeAttr::get(context, Scope::Lane),
+                   DenseI32ArrayAttr(), DenseI32ArrayAttr(),
+                   DenseI32ArrayAttr(),
+                   DenseI32ArrayAttr::get(context, laneLayout),
+                   DenseI32ArrayAttr::get(context, laneData));
+}
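+
A minimal usage sketch for the new builder (hypothetical call site; `ctx` is assumed to be a valid `MLIRContext *` in scope):

// Builds a lane-level layout of [1, 16] lanes with one element per lane.
// Per the builder above, the scope is set to Lane and the remaining array
// parameters are left as empty DenseI32ArrayAttrs.
xegpu::LayoutAttr layout =
    xegpu::LayoutAttr::get(ctx, /*laneLayout=*/{1, 16}, /*laneData=*/{1, 1});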
+
 //===----------------------------------------------------------------------===//
 // XeGPU_TensorDescType
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 38d9fe6c88800..e2d8b6b06c513 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -659,7 +659,7 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
   }
 }
 
-void attachLayoutAttributeToUsers(Value v, xegpu::SGMapAttr layout) {
+void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
   for (OpOperand &user : v.getUses()) {
     Operation *owner = user.getOwner();
     unsigned operandNumber = user.getOperandNumber();
@@ -667,11 +667,11 @@ void attachLayoutAttributeToUsers(Value v, xegpu::SGMapAttr layout) {
     /// attribute.
     if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
       if (operandNumber == 0)
-        dpasOp.setSgMapAAttr(layout);
+        dpasOp.setALayoutAttr(layout);
       else if (operandNumber == 1)
-        dpasOp.setSgMapBAttr(layout);
+        dpasOp.setBLayoutAttr(layout);
       else if (operandNumber == 2)
-        dpasOp.setSgMapCAttr(layout);
+        dpasOp.setCLayoutAttr(layout);
       continue;
     }
     /// For every other user, use a generic attribute name.
@@ -684,17 +684,17 @@ static LogicalResult
 attachLayoutAttributes(Operation *top,
                        llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
   /// Helper to convert SGMap to xegpu::SGMapAttr.
-  auto getSGMapForResult = [&](Value r) -> xegpu::SGMapAttr {
+  auto getSGMapForResult = [&](Value r) -> xegpu::LayoutAttr {
     auto layout = getPropagatedLayout(r);
     if (!layout.isAssigned())
       return {};
-    SmallVector<uint32_t, 2> wiLayout, wiData;
+    SmallVector<int, 2> wiLayout, wiData;
     for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
                                                layout.getDataAsArrayRef())) {
-      wiLayout.push_back(static_cast<uint32_t>(layout));
-      wiData.push_back(static_cast<uint32_t>(data));
+      wiLayout.push_back(static_cast<int>(layout));
+      wiData.push_back(static_cast<int>(data));
     }
-    return xegpu::SGMapAttr::get(top->getContext(), wiLayout, wiData);
+    return xegpu::LayoutAttr::get(r.getContext(), wiLayout, wiData);
   };
   /// Attach the layout attributes to the results of the operations.
   auto walkResult = top->walk([&](Operation *op) {
@@ -769,13 +769,13 @@ namespace {
 /// | 32x16                 | [2, 8]    | 16x2                     |
 /// | 2x32x16               | [1, 16]   | 2x32x1                   |
 FailureOr<VectorType>
-getDistributedVecTypeBasedOnWiLayout(xegpu::SGMapAttr sgMap,
+getDistributedVecTypeBasedOnWiLayout(xegpu::LayoutAttr layout,
                                      VectorType originalType) {
   llvm::SmallVector<int64_t, 2> distributedShape;
-  if (!sgMap)
+  if (!layout)
     return failure();
 
-  auto wiLayout = sgMap.getWiLayout();
+  auto wiLayout = layout.getLaneLayout();
   assert((originalType.getRank() == 2 || originalType.getRank() == 3) &&
          "expecting 2D or 3D shape for the original vector type");
   assert(wiLayout.size() == 2 && "expecting 2D shape for the wi layout");
@@ -797,14 +797,14 @@ getDistributedVecTypeBasedOnWiLayout(xegpu::SGMapAttr sgMap,
   return newVectorType;
 }
 
-static VectorType getDistributedVectorType(xegpu::SGMapAttr sgMap,
+static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
                                            VectorType originalType) {
   auto shape = originalType.getShape();
   auto distVecTyOrFailure =
       xegpu::TensorDescType::get(shape, originalType.getElementType(),
                                  /*array_length=*/1, /*boundary_check=*/true,
                                  /*memory_space=*/xegpu::MemorySpace::Global,
-                                 sgMap)
+                                 layout)
           .getDistributedVectorType();
   assert(llvm::succeeded(distVecTyOrFailure) &&
          "Failed to compute distributed vector type for the given vector type");
@@ -944,8 +944,8 @@ struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
 
     auto descOffsets = descOp.getMixedOffsets();
 
-    xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
-    if (!sgMap)
+    xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
+    if (!layout)
       return rewriter.notifyMatchFailure(
           descOp, "the tensor descriptor lacks sg_map attribute");
 
@@ -1013,8 +1013,8 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
       return failure();
 
     auto tensorDescTy = storeOp.getTensorDescType();
-    xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
-    if (!sgMap)
+    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+    if (!layout)
       return rewriter.notifyMatchFailure(
           storeOp, "the source tensor descriptor lacks sg_map attribute");
 
@@ -1022,7 +1022,7 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
       return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
 
     auto distriburtedTypeByWarpOp =
-        getDistributedVecTypeBasedOnWiLayout(sgMap, storeOp.getValueType());
+        getDistributedVecTypeBasedOnWiLayout(layout, storeOp.getValueType());
     if (failed(distriburtedTypeByWarpOp))
       return rewriter.notifyMatchFailure(storeOp,
                                          "Failed to distribute the type");
@@ -1103,8 +1103,8 @@ struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
 
     auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
     xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
-    xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
-    if (!sgMap)
+    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+    if (!layout)
       return rewriter.notifyMatchFailure(
           loadOp, "the source tensor descriptor lacks sg_map attribute");
 
@@ -1151,20 +1151,21 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
 
     auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
     unsigned operandIdx = operand->getOperandNumber();
-    xegpu::SGMapAttr sgMapA = dpasOp.getSgMapAAttr();
-    xegpu::SGMapAttr sgMapB = dpasOp.getSgMapBAttr();
-    xegpu::SGMapAttr sgMapOut = dpasOp->getAttrOfType<xegpu::SGMapAttr>("r0");
-    if (!sgMapA || !sgMapB || !sgMapOut)
+    xegpu::LayoutAttr layoutA = dpasOp.getALayoutAttr();
+    xegpu::LayoutAttr layoutB = dpasOp.getBLayoutAttr();
+    xegpu::LayoutAttr layoutOut =
+        dpasOp->getAttrOfType<xegpu::LayoutAttr>("r0");
+    if (!layoutA || !layoutB || !layoutOut)
       return rewriter.notifyMatchFailure(
           dpasOp,
           "the xegpu::Dpas op lacks sg_map attribute for A, B or output");
 
     auto distLhsTypeByWarpOpOrFailure =
-        getDistributedVecTypeBasedOnWiLayout(sgMapA, dpasOp.getLhsType());
+        getDistributedVecTypeBasedOnWiLayout(layoutA, dpasOp.getLhsType());
     auto distRhsTypeByWarpOpOrFailure =
-        getDistributedVecTypeBasedOnWiLayout(sgMapB, dpasOp.getRhsType());
+        getDistributedVecTypeBasedOnWiLayout(layoutB, dpasOp.getRhsType());
     auto distResultTypeByWarpOpOrFailure =
-        getDistributedVecTypeBasedOnWiLayout(sgMapOut, dpasOp.getResultType());
+        getDistributedVecTypeBasedOnWiLayout(layoutOut, dpasOp.getResultType());
     if (failed(distLhsTypeByWarpOpOrFailure) ||
         failed(distRhsTypeByWarpOpOrFailure) ||
         failed(distResultTypeByWarpOpOrFailure))
@@ -1193,12 +1194,12 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
     SmallVector<VectorType> newDpasOperandExpectedTypes;
     /// Reconcile the distributed types with the original types.
     newDpasOperandExpectedTypes.push_back(
-        getDistributedVectorType(sgMapA, dpasOp.getLhsType()));
+        getDistributedVectorType(layoutA, dpasOp.getLhsType()));
     newDpasOperandExpectedTypes.push_back(
-        getDistributedVectorType(sgMapB, dpasOp.getRhsType()));
+        getDistributedVectorType(layoutB, dpasOp.getRhsType()));
     if (dpasOp.getAcc()) {
       newDpasOperandExpectedTypes.push_back(
-          getDistributedVectorType(sgMapOut, dpasOp.getResultType()));
+          getDistributedVectorType(layoutOut, dpasOp.getResultType()));
     }
 
     for (auto i : newRetIndices) {
@@ -1213,7 +1214,7 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
     /// Reconcile the output type.
     distributedVal = reconcileDistribtedVecType(
         distributedVal,
-        getDistributedVectorType(sgMapOut, dpasOp.getResultType()), rewriter);
+        getDistributedVectorType(layoutOut, dpasOp.getResultType()), rewriter);
     rewriter.replaceAllUsesWith(distributedVal, newDpasOp);
     return success();
   }

>From 2334a9780b5ce1b129053f0080f1d37e4ae4a6a7 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 31 Mar 2025 19:26:21 +0000
Subject: [PATCH 25/45] refactor names

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 434 +++++++++---------
 1 file changed, 223 insertions(+), 211 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index e2d8b6b06c513..8e1e846c94d3e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -55,7 +55,7 @@ using namespace mlir::dataflow;
 
 /// HW dependent constants.
 /// TODO: These constants should be queried from the target information.
-constexpr unsigned subgroupSize = 16; // How many work items in a subgroup.
+constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup.
 /// If DPAS A or B operands have low precision element types they must be packed
 /// according to the following sizes.
 constexpr unsigned packedSizeInBitsForDefault =
@@ -69,8 +69,8 @@ namespace {
 /// Layout
 ///===----------------------------------------------------------------------===///
 
-/// Helper class to store the ND layout of work items within a subgroup and data
-/// owned by each work item.
+/// Helper class to store the ND layout of lanes within a subgroup and data
+/// owned by each lane.
 struct Layout {
   SmallVector<int64_t, 3> layout;
   Layout() = default;
@@ -91,123 +91,125 @@ int64_t Layout::operator[](size_t idx) const {
   return layout[idx];
 }
 
-/// WiLayout represents the layout of work items within a subgroup when it
-/// accesses some value. WiData represents the layout of data owned by each work
-/// item.
-using WiLayout = Layout;
-using WiData = Layout;
+/// LaneLayout represents the logical layout of lanes within a subgroup when it
+/// accesses some value. LaneData represents the logical layout of data owned by
+/// each lane.
+using LaneLayout = Layout;
+using LaneData = Layout;
 
 ///===----------------------------------------------------------------------===///
-/// SGMap
+/// LayoutInfo
 ///===----------------------------------------------------------------------===///
 
 /// Helper class for tracking the analysis state of a value. For SGPropagation,
-/// the analysis state is simply the wi_layout and wi_data of each value.
+/// the analysis state is simply the lane_layout and lane_data of each value.
 /// Purpose of this analysis to propagate some unique layout for each value in
 /// the program starting from some known values (like DPAS, StoreNd, etc.).
 ///
-/// Given this, SGMap satisifies the following properties:
-///  1) SGMap is a lattice with two states - assigned and not assigned.
-///  2) Two SGMap values are equal if they are both assigned or both not
+/// Given this, LayoutInfo satisfies the following properties:
+///  1) LayoutInfo is a lattice with two states - assigned and not assigned.
+///  2) Two LayoutInfo values are equal if they are both assigned or both not
 ///  assigned. The concrete value of assigned state does not matter.
 ///  3) The meet operator works as follows:
 ///     - If current state is assigned, return the current state. (already
 ///     a unique layout is assigned. don't change it)
 ///     - Otherwise, return the other state.
 
-struct SGMap {
+struct LayoutInfo {
 private:
-  WiLayout wiLayout;
-  WiData wiData;
+  LaneLayout laneLayout;
+  LaneData laneData;
 
 public:
-  SGMap() = default;
-  SGMap(const WiLayout &layout, const WiData &data)
-      : wiLayout(layout), wiData(data) {}
+  LayoutInfo() = default;
+  LayoutInfo(const LaneLayout &layout, const LaneData &data)
+      : laneLayout(layout), laneData(data) {}
 
   /// Two lattice values are equal if they have `some` layout. The actual
   /// content of the layout does not matter.
-  bool operator==(const SGMap &other) const {
+  bool operator==(const LayoutInfo &other) const {
     return this->isAssigned() == other.isAssigned();
   }
 
-  static SGMap meet(const SGMap &lhs, const SGMap &rhs);
+  static LayoutInfo meet(const LayoutInfo &lhs, const LayoutInfo &rhs);
 
-  static SGMap join(const SGMap &lhs, const SGMap &rhs);
+  static LayoutInfo join(const LayoutInfo &lhs, const LayoutInfo &rhs);
 
   void print(raw_ostream &os) const;
 
-  bool isAssigned() const { return wiLayout.size() > 0 && wiData.size() > 0; }
+  bool isAssigned() const {
+    return laneLayout.size() > 0 && laneData.size() > 0;
+  }
 
-  SGMap getTransposedLayout(ArrayRef<int64_t> permutation) const;
+  LayoutInfo getTransposedLayout(ArrayRef<int64_t> permutation) const;
 
-  const WiLayout &getLayout() const { return wiLayout; }
-  const WiData &getData() const { return wiData; }
-  ArrayRef<int64_t> getLayoutAsArrayRef() const { return wiLayout.layout; }
-  ArrayRef<int64_t> getDataAsArrayRef() const { return wiData.layout; }
+  const LaneLayout &getLayout() const { return laneLayout; }
+  const LaneData &getData() const { return laneData; }
+  ArrayRef<int64_t> getLayoutAsArrayRef() const { return laneLayout.layout; }
+  ArrayRef<int64_t> getDataAsArrayRef() const { return laneData.layout; }
 };
 
-void SGMap::print(raw_ostream &os) const {
+void LayoutInfo::print(raw_ostream &os) const {
   if (isAssigned()) {
-    os << "wi_layout: ";
-    wiLayout.print(os);
-    os << ", wi_data: ";
-    wiData.print(os);
+    os << "lane_layout: ";
+    laneLayout.print(os);
+    os << ", lane_data: ";
+    laneData.print(os);
   } else
     os << "Not assigned.";
 }
 
-SGMap SGMap::meet(const SGMap &lhs, const SGMap &rhs) {
+LayoutInfo LayoutInfo::meet(const LayoutInfo &lhs, const LayoutInfo &rhs) {
   if (!lhs.isAssigned())
     return rhs;
   return lhs;
 }
 
 /// Since this is a backward analysis, join method is not used.
-SGMap SGMap::join(const SGMap &lhs, const SGMap &rhs) {
+LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) {
   llvm_unreachable("Join should not be triggered by SGMapPropagation.");
 }
 
 /// Get the transposed layout according to the given permutation.
-SGMap SGMap::getTransposedLayout(ArrayRef<int64_t> permutation) const {
+LayoutInfo
+LayoutInfo::getTransposedLayout(ArrayRef<int64_t> permutation) const {
   if (!isAssigned())
     return {};
-  WiLayout newLayout;
-  WiData newData;
+  LaneLayout newLayout;
+  LaneData newData;
   for (auto idx : permutation) {
-    newLayout.layout.push_back(wiLayout.layout[idx]);
-    newData.layout.push_back(wiData.layout[idx]);
+    newLayout.layout.push_back(laneLayout.layout[idx]);
+    newData.layout.push_back(laneData.layout[idx]);
   }
-  return SGMap(newLayout, newData);
+  return LayoutInfo(newLayout, newData);
 }
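
As a quick illustration of the permutation semantics above (lane_layout and lane_data are permuted together), here is a standalone C++ sketch of the same index shuffle, independent of the MLIR types used in the patch:

#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors getTransposedLayout: result[i] = layout[permutation[i]].
static std::vector<int64_t> permute(const std::vector<int64_t> &layout,
                                    const std::vector<int64_t> &permutation) {
  std::vector<int64_t> result;
  for (int64_t idx : permutation)
    result.push_back(layout[idx]);
  return result;
}

int main() {
  // lane_layout [1, 16] and lane_data [1, 2] transposed with permutation
  // [1, 0] become lane_layout [16, 1] and lane_data [2, 1].
  for (int64_t v : permute({1, 16}, {1, 0}))
    std::cout << v << ' ';
  std::cout << '\n';
  for (int64_t v : permute({1, 2}, {1, 0}))
    std::cout << v << ' ';
  std::cout << '\n';
  return 0;
}
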
 
 ///===----------------------------------------------------------------------===///
-/// SGMapLattice
+/// LayoutInfoLattice
 ///===----------------------------------------------------------------------===///
 
-/// Lattice holding the SGMap for each value.
-struct SGMapLattice : public Lattice<SGMap> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SGMapLattice)
+/// Lattice holding the LayoutInfo for each value.
+struct LayoutInfoLattice : public Lattice<LayoutInfo> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LayoutInfoLattice)
   using Lattice::Lattice;
 };
 
 /// Helper Functions to get default layouts. A `default layout` is a layout that
 /// is assigned to a value when the layout is not fixed by some anchor operation
-/// (like DPAS). This is the natural layout work items are arranged in a
-/// subgroup.
+/// (like DPAS).
 
 /// Helper Function to get the default layout for uniform values like constants.
-/// For 1D vector, wi_layout is [subgroupSize] and wi_data is [1].
-/// For 2D vector, wi_layout is [1, subgroupSize] and wi_data is [1, 1].
-static SGMap getDefaultSgMap(unsigned rank) {
+/// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1].
+/// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1].
+static LayoutInfo getDefaultLayoutInfo(unsigned rank) {
   assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
   if (rank == 1)
-    return SGMap(WiLayout({subgroupSize}), WiData({1}));
-  return SGMap(WiLayout({1, subgroupSize}), WiData({1, 1}));
+    return LayoutInfo(LaneLayout({subgroupSize}), LaneData({1}));
+  return LayoutInfo(LaneLayout({1, subgroupSize}), LaneData({1, 1}));
 }
 
 /// Helper to get the default layout for a vector type.
-static SGMap getDefaultSgMap(VectorType vectorTy) {
+static LayoutInfo getDefaultLayoutInfo(VectorType vectorTy) {
   /// Expecting a 1D or 2D vector.
   assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
          "Expected 1D or 2D vector.");
@@ -216,112 +218,119 @@ static SGMap getDefaultSgMap(VectorType vectorTy) {
          "Expected int or float element type.");
   /// If the rank is 1, then return default layout for 1D vector.
   if (vectorTy.getRank() == 1)
-    return getDefaultSgMap(1);
+    return getDefaultLayoutInfo(1);
   /// Packing factor is determined by the element type bitwidth.
   int packingFactor = 1;
   auto bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
   if (bitwidth < packedSizeInBitsForDefault)
     packingFactor = packedSizeInBitsForDefault / bitwidth;
-  return SGMap(WiLayout({1, subgroupSize}), WiData({1, packingFactor}));
+  return LayoutInfo(LaneLayout({1, subgroupSize}),
+                    LaneData({1, packingFactor}));
 }
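
To make the packing-factor rule above concrete, here is a standalone C++ sketch; the packing width below is a hypothetical stand-in, since the actual value of packedSizeInBitsForDefault is defined outside this hunk.

#include <iostream>

// Hypothetical value for illustration only; the real constant is defined
// elsewhere in this file and is not visible in this hunk.
constexpr unsigned kPackedSizeInBitsForDefault = 16;

// Mirrors the packing-factor computation in getDefaultLayoutInfo: elements
// narrower than the packing width are grouped so each lane owns one packed
// unit of data.
static int packingFactor(unsigned bitwidth) {
  return bitwidth < kPackedSizeInBitsForDefault
             ? kPackedSizeInBitsForDefault / bitwidth
             : 1;
}

int main() {
  std::cout << packingFactor(32) << '\n'; // 32-bit -> 1, lane_data = [1, 1]
  std::cout << packingFactor(16) << '\n'; // 16-bit -> 1, lane_data = [1, 1]
  std::cout << packingFactor(8) << '\n';  //  8-bit -> 2, lane_data = [1, 2]
  return 0;
}
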
 
-/// Helper Function to get the expected layouts for DPAS operands. `wi_data` is
-/// set according to the following criteria:
+/// Helper Function to get the expected layouts for DPAS operands. `lane_data`
+/// is set according to the following criteria:
 /// * For A operand, the data must be packed in minimum
 /// `packedSizeInBitsForDefault`
 /// * For B operand, the data must be packed in minimum
 /// `packedSizeInBitsForDpasB`
-static SGMap getSGMapForDPASOperand(VectorType vectorTy, unsigned operandNum) {
+static LayoutInfo getLayoutInfoForDPASOperand(VectorType vectorTy,
+                                              unsigned operandNum) {
   auto elementTy = vectorTy.getElementType();
   assert(elementTy.isIntOrFloat() &&
          "Expected int or float type in DPAS operands");
-  WiLayout layout({1, subgroupSize});
+  LaneLayout layout({1, subgroupSize});
   /// For B operand, data must be packed in minimum `packedDpasBSizeInBits` and
   /// must have the VNNI format.
   if (operandNum == 1 &&
       elementTy.getIntOrFloatBitWidth() < packedSizeInBitsForDpasB) {
-    WiData data(
+    LaneData data(
         {packedSizeInBitsForDpasB / elementTy.getIntOrFloatBitWidth(), 1});
-    return SGMap(layout, data);
+    return LayoutInfo(layout, data);
   }
   /// Otherwise, return the default layout for the vector type.
-  return getDefaultSgMap(vectorTy);
+  return getDefaultLayoutInfo(vectorTy);
 }
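
The B-operand rule can be illustrated the same way; again the packing width below is a hypothetical stand-in for packedSizeInBitsForDpasB, whose value is not visible in this hunk.

#include <array>
#include <iostream>

// Hypothetical value for illustration only; the real packedSizeInBitsForDpasB
// is defined elsewhere in this file.
constexpr unsigned kPackedSizeInBitsForDpasB = 32;

// Mirrors getLayoutInfoForDPASOperand for the B operand: low-precision B data
// is packed along the row dimension (VNNI layout), so lane_data becomes
// [packWidth / bitwidth, 1]; wider element types keep the default [1, 1].
static std::array<int, 2> dpasBLaneData(unsigned bitwidth) {
  if (bitwidth < kPackedSizeInBitsForDpasB)
    return {static_cast<int>(kPackedSizeInBitsForDpasB / bitwidth), 1};
  return {1, 1};
}

int main() {
  auto b16 = dpasBLaneData(16); // 16-bit B operand -> lane_data [2, 1]
  auto b8 = dpasBLaneData(8);   //  8-bit B operand -> lane_data [4, 1]
  auto b32 = dpasBLaneData(32); // 32-bit B operand -> lane_data [1, 1]
  std::cout << b16[0] << 'x' << b16[1] << '\n';
  std::cout << b8[0] << 'x' << b8[1] << '\n';
  std::cout << b32[0] << 'x' << b32[1] << '\n';
  return 0;
}
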
 
 ///===----------------------------------------------------------------------===///
-/// SGMapPropagation
+/// LayoutInfoPropagation
 ///===----------------------------------------------------------------------===///
 
-/// Backward data flow analysis to propagate the wi_layout and wi_data of each
-/// value in the program. Currently, the layouts for operands DPAS, StoreNd, and
-/// StoreScatter are fixed (known before propagation). Purpose of this analysis
-/// is to propagate those known layouts to all their producers and (other)
-/// consumers.
-class SGMapPropagation : public SparseBackwardDataFlowAnalysis<SGMapLattice> {
+/// Backward data flow analysis to propagate the lane_layout and lane_data of
+/// each value in the program. Currently, the layouts for operands DPAS,
+/// StoreNd, and StoreScatter are fixed (known before propagation). Purpose of
+/// this analysis is to propagate those known layouts to all their producers and
+/// (other) consumers.
+class LayoutInfoPropagation
+    : public SparseBackwardDataFlowAnalysis<LayoutInfoLattice> {
 private:
-  void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<SGMapLattice *> operands,
-                   ArrayRef<const SGMapLattice *> results);
+  void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
+                   ArrayRef<const LayoutInfoLattice *> results);
 
-  void visitStoreNdOp(xegpu::StoreNdOp store, ArrayRef<SGMapLattice *> operands,
-                      ArrayRef<const SGMapLattice *> results);
+  void visitStoreNdOp(xegpu::StoreNdOp store,
+                      ArrayRef<LayoutInfoLattice *> operands,
+                      ArrayRef<const LayoutInfoLattice *> results);
 
   void visitStoreScatterOp(xegpu::StoreScatterOp storeScatter,
-                           ArrayRef<SGMapLattice *> operands,
-                           ArrayRef<const SGMapLattice *> results);
+                           ArrayRef<LayoutInfoLattice *> operands,
+                           ArrayRef<const LayoutInfoLattice *> results);
 
-  void visitLoadNdOp(xegpu::LoadNdOp load, ArrayRef<SGMapLattice *> operands,
-                     ArrayRef<const SGMapLattice *> results);
+  void visitLoadNdOp(xegpu::LoadNdOp load,
+                     ArrayRef<LayoutInfoLattice *> operands,
+                     ArrayRef<const LayoutInfoLattice *> results);
 
   void visitLoadGatherOp(xegpu::LoadGatherOp load,
-                         ArrayRef<SGMapLattice *> operands,
-                         ArrayRef<const SGMapLattice *> results);
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results);
 
   void visitTransposeOp(vector::TransposeOp transpose,
-                        ArrayRef<SGMapLattice *> operands,
-                        ArrayRef<const SGMapLattice *> results);
+                        ArrayRef<LayoutInfoLattice *> operands,
+                        ArrayRef<const LayoutInfoLattice *> results);
 
   void visitVectorBitcastOp(vector::BitCastOp bitcast,
-                            ArrayRef<SGMapLattice *> operands,
-                            ArrayRef<const SGMapLattice *> results);
+                            ArrayRef<LayoutInfoLattice *> operands,
+                            ArrayRef<const LayoutInfoLattice *> results);
 
   void visitCreateDescOp(xegpu::CreateDescOp createDesc,
-                         ArrayRef<SGMapLattice *> operands,
-                         ArrayRef<const SGMapLattice *> results);
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results);
 
   void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset,
-                             ArrayRef<SGMapLattice *> operands,
-                             ArrayRef<const SGMapLattice *> results);
+                             ArrayRef<LayoutInfoLattice *> operands,
+                             ArrayRef<const LayoutInfoLattice *> results);
 
   void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction,
-                                   ArrayRef<SGMapLattice *> operands,
-                                   ArrayRef<const SGMapLattice *> results);
+                                   ArrayRef<LayoutInfoLattice *> operands,
+                                   ArrayRef<const LayoutInfoLattice *> results);
 
 public:
-  SGMapPropagation(DataFlowSolver &solver, SymbolTableCollection &symbolTable)
+  LayoutInfoPropagation(DataFlowSolver &solver,
+                        SymbolTableCollection &symbolTable)
       : SparseBackwardDataFlowAnalysis(solver, symbolTable) {}
   using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis;
 
-  LogicalResult visitOperation(Operation *op, ArrayRef<SGMapLattice *> operands,
-                               ArrayRef<const SGMapLattice *> results) override;
+  LogicalResult
+  visitOperation(Operation *op, ArrayRef<LayoutInfoLattice *> operands,
+                 ArrayRef<const LayoutInfoLattice *> results) override;
 
   void visitBranchOperand(OpOperand &operand) override {};
 
   void visitCallOperand(OpOperand &operand) override {};
 
   void visitExternalCall(CallOpInterface call,
-                         ArrayRef<SGMapLattice *> operands,
-                         ArrayRef<const SGMapLattice *> results) override {};
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results) override {
+  };
 
-  void setToExitState(SGMapLattice *lattice) override {
-    (void)lattice->meet(SGMap());
+  void setToExitState(LayoutInfoLattice *lattice) override {
+    (void)lattice->meet(LayoutInfo());
   }
 };
 } // namespace
 
-LogicalResult
-SGMapPropagation::visitOperation(Operation *op,
-                                 ArrayRef<SGMapLattice *> operands,
-                                 ArrayRef<const SGMapLattice *> results) {
+LogicalResult LayoutInfoPropagation::visitOperation(
+    Operation *op, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
   TypeSwitch<Operation *>(op)
       .Case<xegpu::DpasOp>(
           [&](auto dpasOp) { visitDpasOp(dpasOp, operands, results); })
@@ -355,8 +364,8 @@ SGMapPropagation::visitOperation(Operation *op,
       })
       /// All other ops.
       .Default([&](Operation *op) {
-        for (const SGMapLattice *r : results) {
-          for (SGMapLattice *operand : operands) {
+        for (const LayoutInfoLattice *r : results) {
+          for (LayoutInfoLattice *operand : operands) {
             /// Propagate the layout of the result to the operand.
             if (r->getValue().isAssigned())
               meet(operand, *r);
@@ -364,15 +373,16 @@ SGMapPropagation::visitOperation(Operation *op,
         }
       });
   /// Add a dependency from each result to program point after the operation.
-  for (const SGMapLattice *r : results) {
-    addDependency(const_cast<SGMapLattice *>(r), getProgramPointAfter(op));
+  for (const LayoutInfoLattice *r : results) {
+    addDependency(const_cast<LayoutInfoLattice *>(r), getProgramPointAfter(op));
   }
   return success();
 }
 
-void SGMapPropagation::visitVectorMultiReductionOp(
-    vector::MultiDimReductionOp reduction, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitVectorMultiReductionOp(
+    vector::MultiDimReductionOp reduction,
+    ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
   /// The layout of the result must be present.
   auto resultLayout = results[0]->getValue();
   if (!resultLayout.isAssigned())
@@ -382,7 +392,7 @@ void SGMapPropagation::visitVectorMultiReductionOp(
          "Expected 1D layout for reduction result.");
   /// Given that the result is 1D, the layout of the operand should be 2D with
   /// default layout.
-  auto operandLayout = getDefaultSgMap(2);
+  auto operandLayout = getDefaultLayoutInfo(2);
   propagateIfChanged(operands[0], operands[0]->meet(operandLayout));
   /// Accumulator should have the same layout as the result.
   propagateIfChanged(operands[1], operands[1]->meet(resultLayout));
@@ -390,9 +400,10 @@ void SGMapPropagation::visitVectorMultiReductionOp(
 
 /// Propagate the layout of the result tensor to the source tensor descriptor in
 /// UpdateNdOffsetOp.
-void SGMapPropagation::visitUpdateNdOffsetOp(
-    xegpu::UpdateNdOffsetOp updateNdOffset, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitUpdateNdOffsetOp(
+    xegpu::UpdateNdOffsetOp updateNdOffset,
+    ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
   /// The layout of the result must be present.
   auto resultLayout = results[0]->getValue();
   if (!resultLayout.isAssigned())
@@ -402,48 +413,48 @@ void SGMapPropagation::visitUpdateNdOffsetOp(
 }
 
 /// Set the layouts for DPAS A, B, and C operands.
-void SGMapPropagation::visitDpasOp(xegpu::DpasOp dpas,
-                                   ArrayRef<SGMapLattice *> operands,
-                                   ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitDpasOp(
+    xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
   auto aTy = dpas.getLhsType();
   auto bTy = dpas.getRhsType();
   propagateIfChanged(operands[0],
-                     operands[0]->meet(getSGMapForDPASOperand(aTy, 0)));
+                     operands[0]->meet(getLayoutInfoForDPASOperand(aTy, 0)));
   propagateIfChanged(operands[1],
-                     operands[1]->meet(getSGMapForDPASOperand(bTy, 1)));
+                     operands[1]->meet(getLayoutInfoForDPASOperand(bTy, 1)));
   if (operands.size() > 2) {
     auto cTy = dpas.getAccType();
     propagateIfChanged(operands[2],
-                       operands[2]->meet(getSGMapForDPASOperand(cTy, 2)));
+                       operands[2]->meet(getLayoutInfoForDPASOperand(cTy, 2)));
   }
 }
 
 /// Set the layout for the value and tensor descriptor operands in StoreNdOp.
-void SGMapPropagation::visitStoreNdOp(xegpu::StoreNdOp store,
-                                      ArrayRef<SGMapLattice *> operands,
-                                      ArrayRef<const SGMapLattice *> results) {
-  auto storeLayout = getDefaultSgMap(store.getValueType());
+void LayoutInfoPropagation::visitStoreNdOp(
+    xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  auto storeLayout = getDefaultLayoutInfo(store.getValueType());
   /// Both operands should have the same layout
-  for (SGMapLattice *operand : operands) {
+  for (LayoutInfoLattice *operand : operands) {
     propagateIfChanged(operand, operand->meet(storeLayout));
   }
 }
 
 /// Propagate the layout of the value to the tensor descriptor operand in
 /// LoadNdOp.
-void SGMapPropagation::visitLoadNdOp(xegpu::LoadNdOp load,
-                                     ArrayRef<SGMapLattice *> operands,
-                                     ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitLoadNdOp(
+    xegpu::LoadNdOp load, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
   auto valueLayout = results[0]->getValue();
   /// Need the layout of the value to propagate to the tensor descriptor.
   if (!valueLayout.isAssigned())
     return;
-  SGMap tensorDescLayout = valueLayout;
+  LayoutInfo tensorDescLayout = valueLayout;
   /// LoadNdOp has the transpose effect. However, at the stage of this analysis
   /// this effect is not expected and should be abstracted away. Emit a warning.
   if (auto transpose = load.getTranspose()) {
     load.emitWarning("Transpose effect is not expected for LoadNdOp at "
-                     "SGMapPropagation stage.");
+                     "LayoutInfoPropagation stage.");
     tensorDescLayout = valueLayout.getTransposedLayout(transpose.value());
   }
   /// Propagate the new layout to the tensor descriptor operand.
@@ -452,9 +463,9 @@ void SGMapPropagation::visitLoadNdOp(xegpu::LoadNdOp load,
 
 /// For vector::TransposeOp, the layout of the result is transposed and
 /// propagated to the operand.
-void SGMapPropagation::visitTransposeOp(
-    vector::TransposeOp transpose, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitTransposeOp(
+    vector::TransposeOp transpose, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
   /// Need the layout of transpose result to propagate to the operands.
   auto resultLayout = results[0]->getValue();
   if (!resultLayout.isAssigned())
@@ -464,11 +475,11 @@ void SGMapPropagation::visitTransposeOp(
   propagateIfChanged(operands[0], operands[0]->meet(newLayout));
 }
 
-/// For vector::BitCastOp, the wi_data of the source layout is changed based on
-/// the bit width of the source and result types.
-void SGMapPropagation::visitVectorBitcastOp(
-    vector::BitCastOp bitcast, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
+/// For vector::BitCastOp, the lane_data of the source layout is changed based
+/// on the bit width of the source and result types.
+void LayoutInfoPropagation::visitVectorBitcastOp(
+    vector::BitCastOp bitcast, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
   /// Need the layout of bitcast result to propagate to the operands.
   auto resultLayout = results[0]->getValue();
   if (!resultLayout.isAssigned())
@@ -478,49 +489,49 @@ void SGMapPropagation::visitVectorBitcastOp(
   auto outElemTyBitWidth =
       bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
 
-  /// WiLayout does not change.
-  const WiLayout &newWiLayout = resultLayout.getLayout();
-  const WiData &currData = resultLayout.getData();
-  WiData newWiData;
+  /// LaneLayout does not change.
+  const LaneLayout &newLaneLayout = resultLayout.getLayout();
+  const LaneData &currData = resultLayout.getData();
+  LaneData newLaneData;
   /// It's a widening bitcast
   if (inElemTyBitWidth < outElemTyBitWidth) {
     auto ratio = outElemTyBitWidth / inElemTyBitWidth;
-    newWiData = resultLayout.getData()[0] == 1
-                    ? WiData({1, currData[1] * ratio})
-                    : WiData({currData[0] * ratio, 1});
+    newLaneData = resultLayout.getData()[0] == 1
+                      ? LaneData({1, currData[1] * ratio})
+                      : LaneData({currData[0] * ratio, 1});
   } else {
     /// It's a narrowing bitcast
     auto ratio = inElemTyBitWidth / outElemTyBitWidth;
-    newWiData = resultLayout.getData()[0] == 1
-                    ? WiData({1, currData[1] / ratio})
-                    : WiData({currData[0] / ratio, 1});
+    newLaneData = resultLayout.getData()[0] == 1
+                      ? LaneData({1, currData[1] / ratio})
+                      : LaneData({currData[0] / ratio, 1});
   }
 
   propagateIfChanged(operands[0],
-                     operands[0]->meet(SGMap(newWiLayout, newWiData)));
+                     operands[0]->meet(LayoutInfo(newLaneLayout, newLaneData)));
 }
 
 /// Propagate the layout of the result to the tensor descriptor and mask
 /// operands in LoadGatherOp.
-void SGMapPropagation::visitLoadGatherOp(
-    xegpu::LoadGatherOp load, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitLoadGatherOp(
+    xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
   auto valueLayout = results[0]->getValue();
   /// Need the layout of the value to propagate to the tensor descriptor.
   if (!valueLayout.isAssigned())
     return;
 
-  SGMap tensorDescLayout = valueLayout;
+  LayoutInfo tensorDescLayout = valueLayout;
   if (load.getTranspose()) {
     /// LoadGatherOp has the transpose effect. However, at the stage of this
     /// analyis this effect is not expected and should be abstracted away. Emit
     /// a warning.
     load.emitWarning("Transpose effect is not expected for LoadGatherOp at "
-                     "SGMapPropagation stage.");
+                     "LayoutInfoPropagation stage.");
     tensorDescLayout = valueLayout.getTransposedLayout({1, 0});
   }
   /// Mask operand should have 1D default layout.
-  auto maskLayout = getDefaultSgMap(1);
+  auto maskLayout = getDefaultLayoutInfo(1);
   /// Propagate the new layout to the tensor descriptor operand.
   propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
   /// Propagate the new layout to the mask operand.
@@ -529,23 +540,23 @@ void SGMapPropagation::visitLoadGatherOp(
 
 /// Propagate the layout of the descriptor to the vector offset operand in
 /// CreateDescOp.
-void SGMapPropagation::visitCreateDescOp(
-    xegpu::CreateDescOp createDesc, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitCreateDescOp(
+    xegpu::CreateDescOp createDesc, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
   auto descLayout = results[0]->getValue();
   /// Need the layout of the descriptor to propagate to the operands.
   if (!descLayout.isAssigned())
     return;
   /// For offset operand propagate 1D default layout.
-  SGMap layout = getDefaultSgMap(1);
+  LayoutInfo layout = getDefaultLayoutInfo(1);
   propagateIfChanged(operands[1], operands[1]->meet(layout));
 }
 
 /// Set the layout for the value, tensor descriptor, and mask operands in the
 /// StoreScatterOp.
-void SGMapPropagation::visitStoreScatterOp(
-    xegpu::StoreScatterOp storeScatter, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitStoreScatterOp(
+    xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
   /// Currently, for 2D StoreScatterOp we expect that the height dimension of
   /// the tensor descriptor is evenly divisible by the subgroup size.
   /// TODO: Add support for other 2D shapes.
@@ -555,14 +566,14 @@ void SGMapPropagation::visitStoreScatterOp(
                            "be evenly divisible by the subgroup size.");
     return;
   }
-  auto valueLayout = getDefaultSgMap(storeScatter.getValueType());
-  SGMap storeScatterLayout = valueLayout;
+  auto valueLayout = getDefaultLayoutInfo(storeScatter.getValueType());
+  LayoutInfo storeScatterLayout = valueLayout;
   if (storeScatter.getTranspose()) {
     /// StoreScatteOp allows transpose effect. However, at the stage of this
     /// analyis this effect is not expected and should be abstracted away. Emit
     /// a warning.
     storeScatter.emitWarning("Transpose effect is not expected for "
-                             "StoreScatterOp at SGMapPropagation stage.");
+                             "StoreScatterOp at LayoutInfoPropagation stage.");
     storeScatterLayout = valueLayout.getTransposedLayout({1, 0});
   }
   /// Propagate the value layout.
@@ -570,28 +581,28 @@ void SGMapPropagation::visitStoreScatterOp(
   /// Propagate the tensor descriptor layout.
   propagateIfChanged(operands[1], operands[1]->meet(storeScatterLayout));
   /// Use default 1D layout for mask operand.
-  auto maskLayout = getDefaultSgMap(1);
+  auto maskLayout = getDefaultLayoutInfo(1);
   propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
 }
 
 namespace {
 
 ///===----------------------------------------------------------------------===///
-/// RunSGMapPropagation
+/// RunLayoutInfoPropagation
 ///===----------------------------------------------------------------------===///
 
-/// Driver class for running the SGMapPropagation analysis.
-class RunSGMapPropagation {
+/// Driver class for running the LayoutInfoPropagation analysis.
+class RunLayoutInfoPropagation {
 public:
-  RunSGMapPropagation(Operation *op) : target(op) {
+  RunLayoutInfoPropagation(Operation *op) : target(op) {
     SymbolTableCollection symbolTable;
     solver.load<DeadCodeAnalysis>();
     solver.load<SparseConstantPropagation>();
-    solver.load<SGMapPropagation>(symbolTable);
+    solver.load<LayoutInfoPropagation>(symbolTable);
     (void)solver.initializeAndRun(op);
   }
 
-  SGMap getSGMap(Value val);
+  LayoutInfo getLayoutInfo(Value val);
 
   void printAnalysisResult(llvm::raw_ostream &os);
 
@@ -601,21 +612,21 @@ class RunSGMapPropagation {
 };
 } // namespace
 
-SGMap RunSGMapPropagation::getSGMap(Value val) {
-  auto *state = solver.lookupState<SGMapLattice>(val);
+LayoutInfo RunLayoutInfoPropagation::getLayoutInfo(Value val) {
+  auto *state = solver.lookupState<LayoutInfoLattice>(val);
   if (!state)
     return {};
   return state->getValue();
 }
 
-void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
+void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
   auto printFunctionResult = [&](FunctionOpInterface funcOp) {
     os << "function: " << funcOp.getName() << ":\n";
     // Function arguments
     for (auto arg : funcOp.getArguments()) {
-      auto layout = getSGMap(arg);
+      auto layout = getLayoutInfo(arg);
       os << "argument: " << arg << "\n";
-      os << "sg_map  : ";
+      os << "layout  : ";
       layout.print(os);
       os << "\n";
     }
@@ -631,10 +642,10 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
       else
         op->print(os);
       os << "\n";
-      /// Print the sg_map for each result.
+      /// Print the layout for each result.
       for (auto [i, r] : llvm::enumerate(op->getResults())) {
-        auto layout = getSGMap(r);
-        os << "sg_map for result #" << i << ": ";
+        auto layout = getLayoutInfo(r);
+        os << "layout for result #" << i << ": ";
         layout.print(os);
         os << "\n";
       }
@@ -663,8 +674,7 @@ void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
   for (OpOperand &user : v.getUses()) {
     Operation *owner = user.getOwner();
     unsigned operandNumber = user.getOperandNumber();
-    /// If the user is a DpasOp, set "sg_map_a", "sg_map_b", or "sg_map_c"
-    /// attribute.
+    /// If the user is a DpasOp, set the A, B, or C layout attribute.
     if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
       if (operandNumber == 0)
         dpasOp.setALayoutAttr(layout);
@@ -680,11 +690,10 @@ void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
   }
 }
 
-static LogicalResult
-attachLayoutAttributes(Operation *top,
-                       llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
-  /// Helper to convert SGMap to xegpu::SGMapAttr.
-  auto getSGMapForResult = [&](Value r) -> xegpu::LayoutAttr {
+static LogicalResult attachLayoutAttributes(
+    Operation *top, llvm::function_ref<LayoutInfo(Value)> getPropagatedLayout) {
+  /// Helper to convert the layout info to the xegpu::LayoutAttr.
+  auto getLayoutInfoForResult = [&](Value r) -> xegpu::LayoutAttr {
     auto layout = getPropagatedLayout(r);
     if (!layout.isAssigned())
       return {};
@@ -701,9 +710,9 @@ attachLayoutAttributes(Operation *top,
     /// For function ops, propagate the argument layout to the users.
     if (auto func = dyn_cast<FunctionOpInterface>(op)) {
       for (auto arg : func.getArguments()) {
-        auto sgMapAttr = getSGMapForResult(arg);
-        if (sgMapAttr) {
-          attachLayoutAttributeToUsers(arg, sgMapAttr);
+        auto layoutInfo = getLayoutInfoForResult(arg);
+        if (layoutInfo) {
+          attachLayoutAttributeToUsers(arg, layoutInfo);
         }
       }
       return WalkResult::advance();
@@ -713,8 +722,8 @@ attachLayoutAttributes(Operation *top,
       return WalkResult::advance();
     if (auto tensorDescTy =
             dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
-      auto sgMapAttr = getSGMapForResult(op->getResult(0));
-      if (!sgMapAttr) {
+      auto layoutInfo = getLayoutInfoForResult(op->getResult(0));
+      if (!layoutInfo) {
         LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
         return WalkResult::interrupt();
       }
@@ -725,7 +734,8 @@ attachLayoutAttributes(Operation *top,
       auto *newOp = builder.clone(*op);
       auto newTensorDescTy = xegpu::TensorDescType::get(
           tensorDescTy.getContext(), tensorDescTy.getShape(),
-          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), sgMapAttr);
+          tensorDescTy.getElementType(), tensorDescTy.getEncoding(),
+          layoutInfo);
       newOp->getResult(0).setType(newTensorDescTy);
       op->replaceAllUsesWith(newOp->getResults());
       op->erase();
@@ -733,12 +743,12 @@ attachLayoutAttributes(Operation *top,
     }
     /// Otherwise simply attach the sg_map to the op itself.
     for (auto [i, r] : llvm::enumerate(op->getResults())) {
-      auto sgMapAttr = getSGMapForResult(r);
-      if (sgMapAttr) {
+      auto layoutInfo = getLayoutInfoForResult(r);
+      if (layoutInfo) {
         auto attrName = "r" + std::to_string(i);
-        op->setAttr(attrName, sgMapAttr);
+        op->setAttr(attrName, layoutInfo);
         /// Attach the layout attribute to the users of the result.
-        attachLayoutAttributeToUsers(r, sgMapAttr);
+        attachLayoutAttributeToUsers(r, layoutInfo);
       }
     }
     return WalkResult::advance();
@@ -759,18 +769,18 @@ namespace {
 ///===----------------------------------------------------------------------===///
 
 /// Returns the distributed vector type for a source vector type according to
-/// the wi_layout. We simply divide each dimension of tensor descriptor shape by
-/// corresponding wi_layout dimension. If array_length > 1, that is appended to
-/// the front of the disributed shape.
+/// the lane_layout. We simply divide each dimension of the tensor descriptor
+/// shape by the corresponding lane_layout dimension. If array_length > 1, it
+/// is appended to the front of the distributed shape.
+///
 /// Examples:
-/// | original vector shape | wi_layout | distributed vector shape |
-/// |-----------------------|-----------|--------------------------|
-/// | 32x16                 | [1, 16]   | 32x1                     |
-/// | 32x16                 | [2, 8]    | 16x2                     |
-/// | 2x32x16               | [1, 16]   | 2x32x1                   |
-FailureOr<VectorType>
-getDistributedVecTypeBasedOnWiLayout(xegpu::LayoutAttr layout,
-                                     VectorType originalType) {
+/// | original vector shape | lane_layout | distributed vector shape |
+/// |-----------------------|-------------|--------------------------|
+/// | 32x16                 | [1, 16]     | 32x1                     |
+/// | 32x16                 | [2, 8]      | 16x2                     |
+/// | 2x32x16               | [1, 16]     | 2x32x1                   |
+FailureOr<VectorType> getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
+                                                      VectorType originalType) {
   llvm::SmallVector<int64_t, 2> distributedShape;
   if (!layout)
     return failure();
@@ -1022,7 +1032,7 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
       return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
 
     auto distriburtedTypeByWarpOp =
-        getDistributedVecTypeBasedOnWiLayout(layout, storeOp.getValueType());
+        getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
     if (failed(distriburtedTypeByWarpOp))
       return rewriter.notifyMatchFailure(storeOp,
                                          "Failed to distribute the type");
@@ -1161,11 +1171,11 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
           "the xegpu::Dpas op lacks sg_map attribute for A, B or output");
 
     auto distLhsTypeByWarpOpOrFailure =
-        getDistributedVecTypeBasedOnWiLayout(layoutA, dpasOp.getLhsType());
+        getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
     auto distRhsTypeByWarpOpOrFailure =
-        getDistributedVecTypeBasedOnWiLayout(layoutB, dpasOp.getRhsType());
+        getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
     auto distResultTypeByWarpOpOrFailure =
-        getDistributedVecTypeBasedOnWiLayout(layoutOut, dpasOp.getResultType());
+        getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
     if (failed(distLhsTypeByWarpOpOrFailure) ||
         failed(distRhsTypeByWarpOpOrFailure) ||
         failed(distResultTypeByWarpOpOrFailure))
@@ -1242,14 +1252,16 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
 }
 
 void XeGPUSubgroupDistributePass::runOnOperation() {
-  auto &analyis = getAnalysis<RunSGMapPropagation>();
+  auto &analyis = getAnalysis<RunLayoutInfoPropagation>();
   // Print the analysis result and exit. (for testing purposes)
   if (printOnly) {
     auto &os = llvm::outs();
     analyis.printAnalysisResult(os);
     return;
   }
-  auto getPropagatedLayout = [&](Value val) { return analyis.getSGMap(val); };
+  auto getPropagatedLayout = [&](Value val) {
+    return analyis.getLayoutInfo(val);
+  };
   if (failed(attachLayoutAttributes(getOperation(), getPropagatedLayout)))
     signalPassFailure();
   if (failed(resolveLayoutConflicts(getOperation())))
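
For readers following the distribution rule documented in the getDistVecTypeBasedOnLaneLayout comment above (divide each tensor dimension by the matching lane_layout entry and prepend array_length when it is greater than 1), the following is a minimal standalone sketch of that computation. It is illustrative only: the helper name and the plain-vector interface are assumptions, not the in-tree API.

```cpp
// Minimal sketch of the lane_layout distribution rule, assuming plain
// std::vector shapes instead of the in-tree MLIR types.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> distributeShape(const std::vector<int64_t> &shape,
                                     const std::vector<int64_t> &laneLayout,
                                     int64_t arrayLength = 1) {
  assert(shape.size() == laneLayout.size() && "rank mismatch");
  std::vector<int64_t> distributed;
  // array_length > 1 is appended to the front of the distributed shape.
  if (arrayLength > 1)
    distributed.push_back(arrayLength);
  // Each dimension is divided by the corresponding lane_layout entry.
  for (size_t i = 0; i < shape.size(); ++i) {
    assert(shape[i] % laneLayout[i] == 0 && "not evenly distributable");
    distributed.push_back(shape[i] / laneLayout[i]);
  }
  return distributed;
}

// distributeShape({32, 16}, {1, 16})    -> {32, 1}
// distributeShape({32, 16}, {2, 8})     -> {16, 2}
// distributeShape({32, 16}, {1, 16}, 2) -> {2, 32, 1}
```

The three example calls reproduce the table from the comment above.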

>From 9bddeb6f6b4ba3dc1fef7a666304c520190d02c7 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 1 Apr 2025 19:01:16 +0000
Subject: [PATCH 26/45] drop ScopeAttr and refine 1D layout support

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 144 ++++++------
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |   6 +-
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |  13 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  93 ++++----
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        |  21 +-
 mlir/test/Dialect/XeGPU/invalid.mlir          |  80 +++----
 mlir/test/Dialect/XeGPU/ops.mlir              | 214 +++++++++---------
 7 files changed, 287 insertions(+), 284 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 4afeef1427e8b..80c6ce1160593 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -35,7 +35,7 @@ def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_td
         It is default to `Global`.
     2. `array_length`: It describes how many horizontally consecutive blocks
         will be loaded by a hardware load instruction. If the TensorDesc shape
-        is 8x16, with array_length = 2. The loaded block shape will be acctually
+        is 8x16, with array_length = 2. The loaded block shape will be actually
         8x32. Its default value is 1.
     3. `boundary_check`: It is used to indicates the hardware whether to do
         out-of-boundary check. The default value is true.
@@ -154,26 +154,6 @@ def XeGPU_FenceScopeAttr:
     let assemblyFormat = "$value";
 }
 
-def XeGPU_ScopeWG:     I32EnumAttrCase<"WG", 0, "wg">;        // workgroup level code
-def XeGPU_ScopeSG:     I32EnumAttrCase<"SG", 1, "sg">;        // subgroup level code
-def XeGPU_ScopeLane:   I32EnumAttrCase<"Lane", 2, "lane">;    // simt level code
-
-def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumeration of scope",
-  [XeGPU_ScopeWG, XeGPU_ScopeSG, XeGPU_ScopeLane]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::xegpu";
-}
-
-def XeGPU_ScopeAttr
-  : EnumAttr<XeGPU_Dialect, XeGPU_ScopeEnums, "Scope"> {
-    let summary = [{Defines the programming scope of the IR,
-                    where WG represents the workgroup level,
-                    SG represents the subgroup level, and
-                    Lane represents the work-item level}];
-
-    let assemblyFormat = "``$value";
-}
-
 def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
   let summary = [{
     Describes the data distribution to subgroups and work-items for a tensor
@@ -182,75 +162,99 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
   let description = [{
     XeGPU operations use `LayoutAttr` to define how data is distributed across subgroups and work-items.
     This attribute is specified in tensor descriptors during tensor description creation. `LayoutAttr`
-    includes the following parameters, categorized into three groups:
-
-    ### Group 1:
-    * scope: Defines the scope of the code, which can be `wg` (workgroup), `sg` (subgroup),
-      or `lane` (work-item). It is mandatory for subgroup-level programming but optional
-      for workgroup and work-item levels. By default:
-        - If sg_layout is included, the layout is treated as workgroup level.
-        - If only `lane_layout` and `lane_data` are included, it is considered work-item level
-
-    ### Group 2:
-    * sg_layout (optional): Specifies the total number of subgroups and their layout within a workgroup.
-      It is mandatory for workgroup-level programming. Its presence implies workgroup-level code, and
-      the scope must be empty or set to `wg`.
-    * sg_data (optional): Defines the data size accessed per subgroup. It must be used with sg_layout or
-      left empty, in which case it can be derived from `lane_layout` and `lane_data` using the formula:
-      `sg_data[i] = lane_layout[i] * lane_data[i]`.
-    * order (optional): Specifies the dimension order used to linearize n-dimensional sbugroup IDs to
-      1-dimensional IDs. The first dimension in the order list is the fastest-changing dimension.
-
-    ### Group 3:
-    * lane_layout (required): Specifies the total number of work-items and their layout within a subgroup
-    * lane_data: (required): Specifies the data size accessed per work-item for a single distribution.
-
-    `lane_data[0] * lane_data[1]` can be greater than 1, indicating that each work item operates on multiple
-    elements. These elements are eventually lowered to a "SIMT-flavor" vector, such as a SPIR-V vector or
-    an LLVM vector, or packed into a storage data type. The multiple elements specified by lane_data must
-    come from a single dimension and be contiguous in memory along either dimension.
+    includes the following parameters:
+
+    * `sg_layout`: Specifies the total number of subgroups and their layout within a workgroup.
+      It is mandatory for workgroup-level programming and optional for subgroup programming. Its
+      presence implies workgroup-level code.
+    * `sg_data`: Defines the data size accessed per subgroup. It is optionally used with `sg_layout`
+      for workgroup-level programming. When it is left empty, the size accessed per subgroup can be
+      derived from the tensor shape and `sg_layout` using the formula:
+      `sg_data[i] = tensor_shape[i] / sg_layout[i]`.
+    * `inst_data`: Specifies the data size processed by a single instruction. It is optionally
+      used with `lane_layout`. When it is left empty, the data size per instruction is equivalent to
+      `sg_data` for workgroup-level programming or to the tensor shape for subgroup-level
+      programming.
+    * `lane_layout`: Specifies the total number of work-items and their arrangement within a subgroup.
+      It is mandatory for subgroup-level programming and optional for workgroup-level programming.
+    * `lane_data`: Specifies the shape of the tensor fragment that each lane accesses. It defines a single,
+      minimal distribution unit. Processing the entire tensor may require one or more distribution units per
+      hardware instruction.
+    * `order`: Specifies the dimension order used to linearize the n-dimensional sg_layout and lane_layout to
+      a 1-dimensional layout. The first dimension in the order list is the fastest-changing dimension. If it
+      is not present, the default value is [1, 0].
 
     ### Examples:
-      1. Work-item level layout:
+      1. Subgroup level layout:
       ```mlir
-      #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+      #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>
       ```
-      In this example, the subgroup consists of 16 work items arranged as lane_layout=[1, 16], with
-      each work item accessing a single element as defined by lane_data=[1, 1].
+      In this example, there are 16 work-items per subgroup, and they are organized as
+      [[0, 1, 2, ..., 7], [8, 9, ..., 15]]. The distribution unit is 1x1.
 
-      2. Workgroup level layout:
+      2. Subgroup level layout with order:
       ```mlir
-      #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>
+      #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1], order = [0, 1]>
+      ```
+      In this example, there are 16 work-items per subgroup, and they are organized as
+      [[0, 2, 4, ..., 14], [1, 3, 5, ..., 15]]. The distribution unit is 1x1.
+
+      3. Workgroup level layout:
+      ```mlir
+      #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [2, 8], lane_data = [1, 1]>
+      ```
+      In this example, the layout represents a workgroup distribution. A workgroup consists of 8 subgroups
+      arranged as [[0, 1, 2, 3], [4, 5, 6, 7]]. Each subgroup accesses a 16x16 block per instruction, which
+      is further distributed to 16 work-items organized as [[0, 1, 2, ..., 7], [8, 9, ..., 15]].
+
+      4. Workgroup level layout with order:
+      ```mlir
+      #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [2, 8], lane_data = [1, 1], order = [0, 1]>
       ```
       In this example, the layout represents a workgroup distribution. A workgroup consists of 8 subgroups
-      arranged in a 2x4 layout. Each subgroup accesses a 16x16 block per instruction, which is further
-      distributed to 16 work items as described above.
+      arranged as [[0, 2, 4, 6], [1, 3, 5, 7]]. Each subgroup accesses a 16x16 block per instruction, which
+      is further distributed to 16 work-items organized as [[0, 2, 4, ..., 14], [1, 3, 5, ..., 15]].
+
   }];
 
   let parameters = (ins
-    OptionalParameter<"ScopeAttr">: $scope,
     OptionalParameter<"DenseI32ArrayAttr">: $sg_layout,
     OptionalParameter<"DenseI32ArrayAttr">: $sg_data,
-    OptionalParameter<"DenseI32ArrayAttr">: $order,
-    "DenseI32ArrayAttr": $lane_layout,
-    "DenseI32ArrayAttr": $lane_data
+    OptionalParameter<"DenseI32ArrayAttr">: $inst_data,
+    OptionalParameter<"DenseI32ArrayAttr">: $lane_layout,
+    OptionalParameter<"DenseI32ArrayAttr">: $lane_data,
+    OptionalParameter<"DenseI32ArrayAttr">: $order
   );
 
+  let builders = [
+    AttrBuilder<(ins "llvm::ArrayRef<int>": $lane_layout,
+                     "llvm::ArrayRef<int>": $lane_data),
+      [{
+        auto sg_layout = DenseI32ArrayAttr();
+        auto sg_data = DenseI32ArrayAttr();
+        auto inst_data = DenseI32ArrayAttr();
+        auto order = DenseI32ArrayAttr();
+        return $_get($_ctxt, sg_layout, sg_data, inst_data,
+                     DenseI32ArrayAttr::get($_ctxt, lane_layout),
+                     DenseI32ArrayAttr::get($_ctxt, lane_data), order);
+      }]>
+  ];
+
   let extraClassDeclaration = [{
-    bool isForWorkgroupLevel() {
-      if (!getScope())
-        return getSgLayout() != nullptr;
-      return getScope() == ScopeAttr::get(getContext(), Scope::WG);
+    bool isWgLayout() {
+      return getSgLayout() != nullptr;
     }
 
-    bool isForSubgroupLevel() {
-      return getScope() == ScopeAttr::get(getContext(), Scope::SG);
+    bool isSgLayout() {
+      return getSgLayout() == nullptr && getLaneLayout() != nullptr;
     }
 
-    bool isForWorkItemLevel() {
-      if (!getScope())
-        return !getSgLayout() && !getSgData() && !getOrder();
-      return getScope() == ScopeAttr::get(getContext(), Scope::Lane);
+    int64_t getRank() {
+      if (auto attr = getSgLayout())
+        return attr.size();
+      if (auto attr = getLaneLayout())
+        return attr.size();
+      return 0;
     }
   }];
 
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 41911ee1aa323..16a7f63d60c82 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -841,7 +841,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
     can be represented as `B: vector<8x16x2xf16>`.
 
     In SIMT mode, DpasOp expects layout attributes `a`, `b`, and `c` (only if acc is used)
-    which descibe the data fragment owned by each work-item w.r.t. the tensor descriptor
+    which describe the data fragment owned by each work-item w.r.t. the tensor descriptor
     these data are loaded from.
 
     Note: on PVC, the hardware can perform load with VNNI transformation when data
@@ -988,8 +988,8 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
     let description = [{
       `convert_layout` adjusts the data distribution across subgroups and/or work-items by modifying
       the `LayoutAttr`. Both `srcMap` and `resMap` must correspond to the same programming scope, such
-      as workgroup-level (wg) or subgroup-level (sg) code. This operation is not supported for
-      work-item-level code.
+      as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once the IR is
+      lowered to the work-item (WI) level, since that is the end result of all distributions.
     }];
     let arguments = (ins XeGPU_Vector2DType: $source,
                          XeGPU_LayoutAttr: $srcMap,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 8559f4beb2c03..3d0f52041d798 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -39,7 +39,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     metadata and does not hold the data itself. It is primarily designed to support 2D block load/store
     and DPAS (matrix multiplication instruction) on Intel GPUs. It encodes the following information:
 
-    * shape:  the sizes/shape of the intereted data block, e.g., 8x16 means 8 rows
+    * shape:  the sizes/shape of the data block of interest, e.g., 8x16 means 8 rows
               and each row contains 16 contiguous data element. The rows could be
               either contiguous or not, depends on the encoding attribute. If the
               encoding is a BlockTensorDescAttr, rows are contiguous. If the encoding
@@ -62,7 +62,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     static-dim-list ::= decimal-literal `x` decimal-literal
     attr-list = (, encoding-attr)? (, layout-attr)?
     enconding-attr = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)?
-    layout-attr = (, layout `<` (scope = value,)? (sg_layout = value, sg_data = value, order = value)? lane_layout = value, lane_data = value `>`)?
+    layout-attr = (, layout `<` sg_layout = value, sg_data = value, inst_data = value, lane_layout = value, lane_data = value, order = value `>`)?
     ```
 
     Examples:
@@ -77,14 +77,15 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
     xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
 
+    // A TensorDesc with a layout for subgroup level programming
+    xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+
     // A TensorDesc with a layout for workgroup level programming
     xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
 
-    // A TensorDesc with a layout for subgroup level programming
-    xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>>
+    // A TensorDesc with a layout for workgroup level programming without lane_layout and lane_data
+    xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16]>>
 
-    // A TensorDesc with a layout for workitem level programming
-    xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, lane_layout = [1, 16], lane_data = [1, 1]>>
     ```
   }];
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 0da86f1af33e4..7aa698de7e2da 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -72,41 +72,58 @@ LogicalResult ScatterTensorDescAttr::verify(
 //===----------------------------------------------------------------------===//
 LogicalResult
 LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
-                   ScopeAttr scope, DenseI32ArrayAttr sg_layout,
-                   DenseI32ArrayAttr sg_data, DenseI32ArrayAttr order,
-                   DenseI32ArrayAttr lane_layout, DenseI32ArrayAttr lane_data) {
+                   DenseI32ArrayAttr sg_layout, DenseI32ArrayAttr sg_data,
+                   DenseI32ArrayAttr inst_data, DenseI32ArrayAttr lane_layout,
+                   DenseI32ArrayAttr lane_data, DenseI32ArrayAttr order) {
+
+  // A valid layout must include at least one of sg_layout and lane_layout.
+  // sg_layout is essential for Workgroup layout, while lane_layout is
+  // required for Subgroup layout.
+  if (!sg_layout && !lane_layout) {
+    return emitError() << "expected at least one of sg_layout or lane_layout";
+  }
+
+  if (sg_layout && lane_layout && sg_layout.size() != lane_layout.size()) {
+    return emitError()
+           << "expected sg_layout and lane_layout having the same rank";
+  }
 
+  // sg_data is optional for Workgroup layout, but its presence requires
+  // sg_layout.
   if (sg_data) {
     if (!sg_layout)
-      return emitError() << "expected sg_layout being used with sg_data.";
+      return emitError() << "expected sg_layout being used with sg_data";
     if (sg_data.size() != sg_layout.size())
       return emitError() << "expected sg_data having the same rank as sg_layout";
   }
 
-  if (order) {
-    if (!sg_layout)
-      return emitError() << "expected order being used with sg_layout.";
-    if (order.size() != sg_layout.size())
-      return emitError() << "expected order having the same rank as sg_layout";
-  }
-
-  if (sg_layout && sg_layout.size() > 2) {
-    return emitError() << "expected the rank of the layout to be at most 2";
-  }
-
-  if (scope && scope.getValue() != Scope::WG &&
-      (sg_layout || sg_data || order)) {
-    return emitError() << "expected sg_layout, sg_data, or order being only "
-                          "used at workgroup level.";
+  // inst_data is optional for Subgroup layout, but its presence requires
+  // lane_layout.
+  if (inst_data) {
+    if (!lane_layout)
+      return emitError() << "expected lane_layout being used with inst_data";
+    if (inst_data.size() != lane_layout.size())
+      return emitError()
+             << "expected inst_data having the same rank as lane_layout";
   }
 
-  if (scope && scope.getValue() == Scope::WG && !sg_layout ) {
-    return emitError() << "expected sg_layout for workgroup level layout";
+  // lane_data is optional for Subgroup layout, but its presence requires
+  // lane_layout.
+  if (lane_data) {
+    if (!lane_layout)
+      return emitError() << "expected lane_layout being used with lane_data";
+    if (lane_data.size() != lane_layout.size())
+      return emitError()
+             << "expected lane_data having the same rank as lane_layout";
   }
 
-  if (lane_layout.size() != lane_data.size() || lane_layout.size() > 2) {
-    return emitError() << "expected lane_layout and lane_data having the same "
-                          "rank, with a maximum rank of 2";
+  if (order) {
+    if (!sg_layout && !lane_layout)
+      return emitError()
+             << "expected sg_layout/lane_layout being used with order";
+    if (order.size() != sg_layout.size() && order.size() != lane_layout.size())
+      return emitError()
+             << "expected order having the same rank as sg_layout/lane_layout";
   }
 
   return success();
@@ -249,26 +266,24 @@ LogicalResult TensorDescType::verify(
   }
 
   if (auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout)) {
+
+    if (rank != (size_t)layoutAttr.getRank())
+      return emitError() << "expected layout rank to match tensor rank";
+
     ArrayRef<int32_t> laneLayout = layoutAttr.getLaneLayout().asArrayRef();
     ArrayRef<int32_t> laneData = layoutAttr.getLaneData().asArrayRef();
 
-    if (rank == 1) {
-      if (laneLayout[0] != 1 || laneData[0] != 1)
-        return emitError()
-               << "outer layout distribution and data mapping must be 1 "
-                  "for 1D tensor";
-    }
-
     if (scatterAttr) {
       // Validate subgroup mapping rules for scattered tensors.
       // A work-item's slice of the tensor with shape [sg_size] or
       // [sg_size, chunk_size] will be [1] or [1, 32/element_ty_bit_width]
       // respectively, the mapping should reflect that. This is because each
       // work item access data in 32 bit granularity.
-      if (laneData[0] != 1)
+
+      if (rank > 1 && laneData[0] != 1)
         return emitError()
                << "cannot map over non-contiguous scattered row elements";
-      if (laneData[1] != packingFactor)
+      if (laneData.back() != packingFactor)
         return emitError() << "work item data mapping must match the number of "
                               "contiguous elements";
     }
@@ -276,8 +291,6 @@ LogicalResult TensorDescType::verify(
     // For 1D tensor, pad the shape with an outer unit dimension to allow common
     // validation logic.
     SmallVector<int64_t> tensorShape(shape.begin(), shape.end());
-    if (rank == 1)
-      tensorShape = {1, tensorShape.back()};
 
     size_t dims = tensorShape.size();
     for (size_t i = 0; i < dims; ++i) {
@@ -319,7 +332,7 @@ LogicalResult TensorDescType::verify(
 FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
   auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
   // If no layout is provided, tensor desc is not used in SIMT mode.
-  if (!layout || !layout.isForWorkItemLevel())
+  if (!layout)
     return failure();
 
   SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
@@ -347,14 +360,6 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
   }
 
   // Case 2: block loads/stores
-  // Tensor descriptor shape can be 1D. For the 1D case, outer dims of laneData
-  // and laneLayout must be 1.
-  if (tdescShape.size() == 1) {
-    assert((laneData[0] == 1 && laneLayout[0] == 1) &&
-           "lane_data[0] and lane_layout[0] must be 1 for 1D tensor descriptor");
-    laneData = {laneData[1]};
-    laneLayout = {laneLayout[1]};
-  }
   // Check if the tensor descriptor shape is distributable.
   int64_t tensorSize = 1;
   for (auto [tdescDim, wiDim, laneDataDim] :
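
The scattered-descriptor checks in TensorDescType::verify above boil down to a small mapping rule: each work-item accesses data at 32-bit granularity, so the innermost lane_data entry must equal 32 divided by the element bit width, and for 2D descriptors the outer lane_data entry must be 1. A standalone restatement, under the assumption of 32-bit-or-narrower element types, might look like this (the function name and plain-vector interface are illustrative, not the in-tree API):

```cpp
// Standalone restatement of the scattered lane_data rule enforced in
// TensorDescType::verify; assumes element types of at most 32 bits.
#include <cstdint>
#include <vector>

bool isValidScatteredLaneData(const std::vector<int64_t> &laneData,
                              unsigned elementBitWidth) {
  const int64_t packingFactor = 32 / elementBitWidth;
  // "cannot map over non-contiguous scattered row elements"
  if (laneData.size() > 1 && laneData.front() != 1)
    return false;
  // "work item data mapping must match the number of contiguous elements"
  return laneData.back() == packingFactor;
}

// e.g. for f32 (32-bit): lane_data = [1, 1] is valid; [2, 1] and [1, 2] are
// rejected, matching the diagnostics exercised in invalid.mlir below.
```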
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index e2ccc59d39371..2ac3426904fa8 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -82,7 +82,7 @@ isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
   auto valueShape = valueTy.getShape();
   // layout not present means IR is in SIMD mode. In this case value shape must
   // match adjusted tensor descriptor shape.
-  if (!layout || !layout.isForWorkItemLevel())
+  if (!layout)
     return valueShape == adjustedTdescShape
                ? success()
                : emitError()
@@ -606,13 +606,6 @@ LogicalResult DpasOp::verify() {
       result |= (aLayout != nullptr) ^ (cLayout != nullptr);
     }
     result = !result;
-
-    if (aLayout) {
-      auto scope = aLayout.getScope();
-      result &= bLayout ? scope == bLayout.getScope() : false;
-      if (hasAcc())
-        result &= cLayout ? scope == cLayout.getScope() : false;
-    }
     return result;
   };
 
@@ -622,7 +615,7 @@ LogicalResult DpasOp::verify() {
         "code) or not set at all (for SIMD code).");
 
   // query the scope from aLayout (a valid setting).
-  if (aLayout && aLayout.isForWorkItemLevel()) {
+  if (aLayout) {
     // In SIMT mode, All data fragments must be 2D
     if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
       return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
@@ -673,14 +666,14 @@ LogicalResult ConvertLayoutOp::verify() {
   if (!resMap)
     return emitOpError("expected resMap.");
 
-  if (srcMap.getScope() != resMap.getScope())
-    return emitOpError("expected srcMap and resMap be in the same scope.");
-
   if (srcMap == resMap)
     return emitOpError("expected different srcMap and resMap.");
 
-  if (srcMap.isForWorkItemLevel())
-    return emitOpError("doesn't work on SIMT code.");
+  // srcMap and resMap must both be WgLayout or both be SgLayout.
+  if ((!srcMap.isWgLayout() || !resMap.isWgLayout()) &&
+      (!srcMap.isSgLayout() || !resMap.isSgLayout()))
+    return emitOpError(
+        "expected srcMap and resMap be WgLayout or SgLayout at the same time.");
 
   auto shape = getSource().getType().getShape();
   if (!isEvenDistributed(shape, srcMap))
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 17e4f60638905..8b5e42af2f7b8 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -80,11 +80,11 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
 // -----
 func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   // expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1]}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
       l2_hint = #xegpu.cache_hint<uncached>}>
-    : !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+    : !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     -> vector<8x2xf32>
   return
 }
@@ -92,11 +92,11 @@ func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
 // -----
 func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+    !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
   // expected-error at +1 {{Result shape [8] is not consistent with distributed vector shape [1, 1]}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
       l2_hint = #xegpu.cache_hint<uncached>}>
-    : !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+    : !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
     -> vector<8xf32>
   return
 }
@@ -136,20 +136,20 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
 // -----
 func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
   %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   // expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1] for tensor descriptor}}
   xegpu.store_nd %data, %1
-    : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+    : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   return
 }
 
 // -----
 func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
   %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+    !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
   // expected-error at +1 {{Result shape [2] is not consistent with distributed vector shape [1, 1] for tensor descriptor}}
   xegpu.store_nd %data, %1
-    : vector<2xf32>, !xegpu.tensor_desc<16xf32,   #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+    : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
   return
 }
 
@@ -247,8 +247,8 @@ func.func @test_prefetch_vc_2(%src: ui64) {
 // -----
 func.func @test_create_tdesc_layout_1(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  // expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>,   #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  // expected-error at +1 {{expected layout rank to match tensor rank}}
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>,   #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
   return
 }
 
@@ -256,7 +256,7 @@ func.func @test_create_tdesc_layout_1(%src: ui64) {
 func.func @test_create_tdesc_layout_2(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error at +1 {{cannot map over non-contiguous scattered row elements}}
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,   #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [2, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,   #xegpu.layout<lane_layout = [1, 4], lane_data = [2, 1]>>
   return
 }
 
@@ -264,7 +264,7 @@ func.func @test_create_tdesc_layout_2(%src: ui64) {
 func.func @test_create_tdesc_layout_3(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error at +1 {{work item data mapping must match the number of contiguous elements}}
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>,   #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>,   #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
   return
 }
 
@@ -272,9 +272,9 @@ func.func @test_create_tdesc_layout_3(%src: ui64) {
 func.func @test_load_gather_layout_1(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,   #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,   #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
   // expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
+  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
   return
 }
 
@@ -282,9 +282,9 @@ func.func @test_load_gather_layout_1(%src: ui64) {
 func.func @test_load_gather_layout_2(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
   // expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
+  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
   return
 }
 
@@ -294,9 +294,9 @@ func.func @test_store_scatter_layout_1(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %val = arith.constant dense<2.9>: vector<1x2xf32>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
   // expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
   return
 }
 
@@ -305,9 +305,9 @@ func.func @test_store_scatter_layout_2(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %val = arith.constant dense<2.9>: vector<2xf32>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
   // expected-error@+1 {{Result shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
   return
 }
 
@@ -396,16 +396,16 @@ func.func @test_dpas_4(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
 // -----
 func.func @test_dpas_layout_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
   // expected-error@+1 {{layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code)}}
-  %1 = xegpu.dpas %a, %b {a_layout =  #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+  %1 = xegpu.dpas %a, %b {a_layout =  #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
   return
 }
 
 // -----
 func.func @test_dpas_layout_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
   // expected-error@+1 {{K-dimension mismatch}}
-  %1 = xegpu.dpas %a, %b {a_layout =  #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>,
-                          b_layout =  #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>,
-                          c_layout =  #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>}
+  %1 = xegpu.dpas %a, %b {a_layout =  #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+                          b_layout =  #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+                          c_layout =  #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
                           : vector<8x1xf16>, vector<4x2xf16> -> vector<8x1xf32>
   return
 }
@@ -438,16 +438,16 @@ func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) {
 // -----
 func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-      // expected-error@+1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
-      !xegpu.tensor_desc<16xf32,  #xegpu.layout<scope = lane, lane_layout = [2, 16], lane_data = [1, 1]>>
+      // expected-error@+1 {{expected layout rank to match tensor rank}}
+      !xegpu.tensor_desc<16xf32,  #xegpu.layout<lane_layout = [2, 16], lane_data = [1, 1]>>
   return
 }
 
 // -----
 func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-      // expected-error@+1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
-      !xegpu.tensor_desc<16xf32,  #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+      // expected-error@+1 {{expected layout rank to match tensor rank}}
+      !xegpu.tensor_desc<16xf32,  #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
   return
 }
 
@@ -455,7 +455,7 @@ func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
 func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error@+1 {{cannot distribute 8 over 16 work items with 1 elements each}}
-      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   return
 }
 
@@ -463,7 +463,7 @@ func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
 func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error@+1 {{cannot distribute 4 over 8 work items with 1 elements each}}
-      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = lane, lane_layout = [8, 2], lane_data = [1, 1]>>
+      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
   return
 }
 
@@ -471,7 +471,7 @@ func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
 func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error@+1 {{cannot distribute 4 over 2 work items with 4 elements each}}
-      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = lane, lane_layout = [2, 8], lane_data = [4, 1]>>
+      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<lane_layout = [2, 8], lane_data = [4, 1]>>
   return
 }
 
@@ -479,7 +479,7 @@ func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
 func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
       // expected-error@+1 {{cannot distribute 4 over 8 work items with 1 elements each}}
-      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<scope = lane, lane_layout = [8, 2], lane_data = [1, 2]>>
+      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 2]>>
   return
 }
 
@@ -490,7 +490,7 @@ func.func @tensor_desc_scatter_invalid_map_data(%src: ui64) {
       // expected-error@+1 {{cannot map over non-contiguous scattered row elements}}
       !xegpu.tensor_desc<4x2xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 2>,
-         #xegpu.layout<scope = lane, lane_layout = [1, 1], lane_data = [2, 1]>>
+         #xegpu.layout<lane_layout = [1, 1], lane_data = [2, 1]>>
   return
 }
 
@@ -500,7 +500,7 @@ func.func @tensor_desc_scatter_invalid_map_data_1(%src: ui64, %offsets: vector<1
       // expected-error@+1 {{work item data mapping must match the number of contiguous elements}}
       !xegpu.tensor_desc<16xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 1>,
-         #xegpu.layout<scope = lane, lane_layout = [1, 8], lane_data = [1, 2]>>
+         #xegpu.layout<lane_layout = [8], lane_data = [2]>>
   return
 }
 
@@ -510,7 +510,7 @@ func.func @tensor_desc_scatter_invalid_chunk_size_1D(%src: ui64, %offsets: vecto
       // expected-error@+1 {{expected non-contiguous elements for 1D tensor}}
       !xegpu.tensor_desc<16xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 2>,
-         #xegpu.layout<scope = lane, lane_layout = [1, 8], lane_data = [1, 2]>>
+         #xegpu.layout<lane_layout = [1, 8], lane_data = [1, 2]>>
   return
 }
 
@@ -520,22 +520,22 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
       // expected-error@+1 {{expected chunk blocks for 2D tensor}}
       !xegpu.tensor_desc<16x2xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 1>,
-         #xegpu.layout<scope = lane, lane_layout = [8, 1], lane_data = [1, 2]>>
+         #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2]>>
   return
 }
 
 // -----
 func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) {
   // expected-error@+1 {{expected different srcMap and resMap}}
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>,
-                                resMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+                                resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
   gpu.return
 }
 
 // -----
 func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) {
-  // expected-error@+1 {{expected srcMap and resMap be in the same scope}}
+  // expected-error@+1 {{expected srcMap and resMap be WgLayout or SgLayout at the same time}}
   %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
-                                resMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+                                resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
   gpu.return
 }
\ No newline at end of file
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index e52562a2f453d..54f14c6cb8c65 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -15,9 +15,9 @@ gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_1(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_create_nd_tdesc_simt_1(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -34,8 +34,8 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind
 gpu.func @test_create_nd_tdesc_simt_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
   //CHECK: %[[C:.*]] = arith.constant 1 : index
   %c1 = arith.constant 1 : index
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -48,8 +48,8 @@ gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_create_nd_tdesc_simt_3(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -62,8 +62,8 @@ gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
 gpu.func @test_create_nd_tdesc_simt_4(%src: memref<2x24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -76,8 +76,8 @@ gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
 gpu.func @test_create_nd_tdesc_simt_5(%src: memref<2x24x32xf32, 3>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
   gpu.return
 }
 
@@ -90,8 +90,8 @@ gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) {
 
 // CHECK: gpu.func @test_create_nd_tdesc_simt_6(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_create_nd_tdesc_simt_6(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -106,10 +106,10 @@ gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
 
 // CHECK: gpu.func @test_prefetch_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_prefetch_nd_simt(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -125,11 +125,11 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
 
 // CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) {
 gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
   %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
-       : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
+       : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
   gpu.return
 }
 
@@ -144,10 +144,10 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
 
 // CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) {
 gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<1x1xf16>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<1x1xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<1x1xf16>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<1x1xf16>
   gpu.return
 }
 
@@ -162,11 +162,11 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
 
 // CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
+    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
   gpu.return
 }
 
@@ -181,11 +181,11 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
 
 // CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
-    !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
+    !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
   gpu.return
 }
 
@@ -200,11 +200,11 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
 
 // CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x1xf32>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x1xf32>
+    !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<2x1xf32>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<2x1xf32>
   gpu.return
 }
 
@@ -219,11 +219,11 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
 
 // CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
-    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
+    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
   gpu.return
 }
 
@@ -238,11 +238,11 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
 
 // CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
-    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
+    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
   gpu.return
 }
 
@@ -257,10 +257,10 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
 
 // CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
   gpu.return
 }
 
@@ -279,11 +279,11 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
 gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
    // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48x1xf16>
   %1 = arith.constant dense<1.0>: vector<48x1xf16>
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
-    !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+    !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -305,11 +305,11 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
 gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
    // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2x1xf16>
   %1 = arith.constant dense<1.0>: vector<2x1xf16>
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
   %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
-    !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+    !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
   gpu.return
 }
 
@@ -324,10 +324,10 @@ gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
 
 // CHECK: gpu.func @test_update_nd_tdesc_simt(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_update_nd_tdesc_simt(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-  %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -344,8 +344,8 @@ gpu.func @test_create_tdesc_vc(%src: ui64) {
 gpu.func @test_create_tdesc_simt(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -363,8 +363,8 @@ gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
 gpu.func @test_create_tdesc_simt_1(%src: memref<?xf32, 3>) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space =  slm, chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex>  -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space =  slm, chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex>  -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -383,7 +383,7 @@ gpu.func @test_create_tdesc_simt_2(%src: memref<?xf32>) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
-  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>  -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>  -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
   gpu.return
 }
 
@@ -401,8 +401,8 @@ gpu.func @test_create_tdesc_vc_3(%src: ui64) {
 gpu.func @test_create_tdesc_simt_3(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
   gpu.return
 }
 
@@ -425,10 +425,10 @@ gpu.func @test_load_simt(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
-  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
-  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
-  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
   gpu.return
 }
 
@@ -451,10 +451,10 @@ gpu.func @test_load_simt_2(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
-  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
-  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
-  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
+  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
+  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1> -> vector<1xf32>
+  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1> -> vector<1xf32>
   gpu.return
 }
 
@@ -477,10 +477,10 @@ gpu.func @test_load_simt_3(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
-  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
-  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
-  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
+  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
+  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
   gpu.return
 }
 
@@ -509,10 +509,10 @@ gpu.func @test_store_simt(%src: ui64) {
   %1 = arith.constant dense<1>: vector<4xi1>
   //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x1xf32>
   %2 = arith.constant dense<2.9>: vector<2x1xf32>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
-  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
-  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
-  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
   gpu.return
 }
 
@@ -541,10 +541,10 @@ gpu.func @test_store_simt_2(%src: ui64) {
   %1 = arith.constant dense<1>: vector<4xi1>
   //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<1x2xf16>
   %2 = arith.constant dense<2.9>: vector<1x2xf16>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
-  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
-  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
-  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
+  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
+  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
+  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
   gpu.return
 }
 
@@ -572,10 +572,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
   %1 = arith.constant dense<1>: vector<4xi1>
   //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
   %2 = arith.constant dense<2.9>: vector<1xf32>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
-  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
-  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1>
-  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
+  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
+  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1>
+  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1>
   gpu.return
 }
 
@@ -583,10 +583,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
 gpu.func @test_prefetch_simt(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
-  // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
-  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+  // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
   gpu.return
 }
 
@@ -605,13 +605,13 @@ gpu.func @test_prefetch_vc(%src: ui64) {
 // CHECK: gpu.func @test_create_update_tdesc_simt(%[[arg0:.*]]: ui64) {
 gpu.func @test_create_update_tdesc_simt(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
   //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
-  //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
+  //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
   %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
-  %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
+  %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
   gpu.return
 }
 
@@ -637,12 +637,12 @@ gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
 
 // CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8x1xf16>, %[[arg1:.*]]: vector<8x2xf16>)
 gpu.func @test_dpas_simt(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
-  // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>,
-  // CHECK: b_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>,
-  // CHECK: c_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
-  %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>,
-                          b_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>,
-                          c_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>}
+  // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+  // CHECK: b_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+  // CHECK: c_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+  %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+                          b_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+                          c_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
                           : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
   gpu.return
 }
@@ -712,8 +712,8 @@ gpu.func @test_create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
 }
 
 gpu.func @test_convert_layout(%a: vector<32x64xf16>) {
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [2, 1]>,
-                                resMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+                                resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
   gpu.return
 }
 

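For the SIMT dpas test above, the per-lane operand shapes follow from dividing each operand's elements evenly across a 16-lane subgroup (lane_layout = [1, 16]): A contributes 8x16/16 = 8 elements per lane (vector<8x1xf16>), B contributes 16x16/16 = 16 elements, packed as 8x2 per lane_data = [2, 1], and C/D contribute 8 elements per lane (vector<8x1xf32>). The following is only a minimal sketch of that arithmetic, assuming a 16-lane subgroup and even distribution; it is not the dialect's own distribution code.

```c++
#include <cassert>
#include <cstdint>

// Per-lane element count when a rows x cols operand is distributed evenly
// over a fixed number of lanes.
static int64_t elemsPerLane(int64_t rows, int64_t cols, int64_t lanes) {
  assert((rows * cols) % lanes == 0 && "shape must distribute evenly");
  return (rows * cols) / lanes;
}

int main() {
  const int64_t lanes = 16;
  assert(elemsPerLane(8, 16, lanes) == 8);   // A -> vector<8x1xf16> per lane
  assert(elemsPerLane(16, 16, lanes) == 16); // B -> vector<8x2xf16> per lane
  assert(elemsPerLane(8, 16, lanes) == 8);   // C/D -> vector<8x1xf32> per lane
  return 0;
}
```
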
>From 784ab38e3a0dc4fd6288375eccba66c9b8db58b4 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 1 Apr 2025 19:09:09 +0000
Subject: [PATCH 27/45] refine isEvenDistributed

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 2ac3426904fa8..35a5421410305 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -107,14 +107,17 @@ isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
 
 static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
                               xegpu::LayoutAttr attr) {
-  assert(attr && "workgroup map attribute is missing.");
+  assert(attr && "Layout attribute is missing.");
+  llvm::SmallVector<int32_t> defaults(shape.size(), 1);
   llvm::ArrayRef<int32_t> layout, data;
-  if (attr.getSgLayout()) {
-    data = attr.getSgData().asArrayRef();
-    layout = attr.getSgLayout().asArrayRef();
+  if (auto sg_layout = attr.getSgLayout()) {
+    layout = sg_layout.asArrayRef();
+    auto sg_data = attr.getSgData();
+    data = sg_data? sg_data.asArrayRef(): defaults;
   } else {
-    data = attr.getLaneData().asArrayRef();
     layout = attr.getLaneLayout().asArrayRef();
+    auto lane_data = attr.getLaneData();
+    data = lane_data? lane_data.asArrayRef(): defaults;
   }
   for (auto [s, d, l] : llvm::zip_equal(shape, data, layout)) {
     // check s % (d * l) != 0

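The refined isEvenDistributed above reduces to a per-dimension rule: the dimension size must be divisible by the data factor, and the resulting quotient must be divisible by the layout factor. Below is a minimal standalone sketch of that rule, assuming absent data factors default to 1 as in the patch; the function name is a hypothetical stand-in, not the MLIR helper itself.

```c++
#include <cstddef>
#include <cstdint>
#include <vector>

// Per-dimension divisibility rule: shape[i] % data[i] == 0 and
// (shape[i] / data[i]) % layout[i] == 0. Missing data factors default to 1.
static bool isEvenlyDistributed(const std::vector<int64_t> &shape,
                                const std::vector<int32_t> &layout,
                                const std::vector<int32_t> &data) {
  for (std::size_t i = 0; i < shape.size(); ++i) {
    int32_t d = i < data.size() ? data[i] : 1;
    if (shape[i] % d != 0 || (shape[i] / d) % layout[i] != 0)
      return false;
  }
  return true;
}

int main() {
  // 32x64 over sg_layout = [2, 4] with sg_data = [16, 16]: evenly distributed.
  bool ok = isEvenlyDistributed({32, 64}, {2, 4}, {16, 16});
  // 16x2 over lane_layout = [8, 1] with lane_data = [1, 2]: also even.
  bool ok2 = isEvenlyDistributed({16, 2}, {8, 1}, {1, 2});
  return (ok && ok2) ? 0 : 1;
}
```
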
>From 28cf69ef3bd26cfd08894deeddc79799aa8f2dcf Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 1 Apr 2025 19:49:36 +0000
Subject: [PATCH 28/45] format code

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 35a5421410305..a60288f2eb77d 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -113,11 +113,11 @@ static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
   if (auto sg_layout = attr.getSgLayout()) {
     layout = sg_layout.asArrayRef();
     auto sg_data = attr.getSgData();
-    data = sg_data? sg_data.asArrayRef(): defaults;
+    data = sg_data ? sg_data.asArrayRef() : defaults;
   } else {
     layout = attr.getLaneLayout().asArrayRef();
     auto lane_data = attr.getLaneData();
-    data = lane_data? lane_data.asArrayRef(): defaults;
+    data = lane_data ? lane_data.asArrayRef() : defaults;
   }
   for (auto [s, d, l] : llvm::zip_equal(shape, data, layout)) {
     // check s % (d * l) != 0

>From 9ed0f874e8f3fdf897bbcc96f8d02c6a38507db6 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 1 Apr 2025 19:58:42 +0000
Subject: [PATCH 29/45] fix format issue

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 7aa698de7e2da..c95f9e90f589e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -94,7 +94,8 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
     if (!sg_layout)
       return emitError() << "expected sg_layout being used with sg_data";
     if (sg_data.size() != sg_layout.size())
-      return emitError() << "expected sg_data having the same rank as sg_layout";
+      return emitError()
+             << "expected sg_data having the same rank as sg_layout";
   }
 
   // inst_data is optional for Subgroup layout, but its presence requires
@@ -297,8 +298,8 @@ LogicalResult TensorDescType::verify(
       uint32_t numElemPerWi = laneLayout[i] * laneData[i];
       if (tensorShape[i] < numElemPerWi || tensorShape[i] % numElemPerWi != 0)
         return emitError() << "cannot distribute " << tensorShape[i] << " over "
-                           << laneLayout[i] << " work items with " << laneData[i]
-                           << " elements each";
+                           << laneLayout[i] << " work items with "
+                           << laneData[i] << " elements each";
     }
   }
 

>From 3b389bfcaa2bde33ab2651eadcc7bb4a6a4b9c78 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 1 Apr 2025 20:27:18 +0000
Subject: [PATCH 30/45] add 1D layout examples

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 3d0f52041d798..173f1462fdd73 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -77,6 +77,12 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
     xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
 
+    // A 1D TensorDesc with a layout for subgroup level programming; each lane accesses two contiguous elements
+    xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [2]>>
+
+    // A 1D TensorDesc with a layout for subgroup level programming; each lane accesses two elements with stride = 16
+    xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+
     // A TensorDesc with a layout for subgroup level programming
     xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
 

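To make the two 1D examples above concrete: with 32 elements and lane_layout = [16], lane_data = [2] hands each lane one contiguous pair, while lane_data = [1] hands each lane two elements 16 apart. The sketch below only illustrates that indexing, assuming round-robin distribution of lane_data-sized chunks; the helper name laneElements is hypothetical.

```c++
#include <cstdint>
#include <vector>

// Elements owned by `lane` for a 1D shape distributed over laneLayout lanes,
// each round handing out laneData contiguous elements per lane.
static std::vector<int64_t> laneElements(int64_t size, int64_t laneLayout,
                                         int64_t laneData, int64_t lane) {
  std::vector<int64_t> elems;
  int64_t unit = laneLayout * laneData; // elements consumed per round
  for (int64_t base = 0; base < size; base += unit)
    for (int64_t j = 0; j < laneData; ++j)
      elems.push_back(base + lane * laneData + j);
  return elems;
}

int main() {
  // lane_layout = [16], lane_data = [2]: lane 3 owns {6, 7} (contiguous pair).
  auto a = laneElements(32, 16, 2, 3);
  // lane_layout = [16], lane_data = [1]: lane 3 owns {3, 19} (stride 16).
  auto b = laneElements(32, 16, 1, 3);
  return (a[0] == 6 && a[1] == 7 && b[0] == 3 && b[1] == 19) ? 0 : 1;
}
```
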
>From 589d2171c173553cd7131425e85d920096fde19b Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 2 Apr 2025 17:10:35 +0000
Subject: [PATCH 31/45] refactor names

---
 .../XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 8e1e846c94d3e..5b812a731ec95 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -697,13 +697,13 @@ static LogicalResult attachLayoutAttributes(
     auto layout = getPropagatedLayout(r);
     if (!layout.isAssigned())
       return {};
-    SmallVector<int, 2> wiLayout, wiData;
+    SmallVector<int, 2> laneLayout, laneData;
     for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
                                                layout.getDataAsArrayRef())) {
-      wiLayout.push_back(static_cast<int>(layout));
-      wiData.push_back(static_cast<int>(data));
+      laneLayout.push_back(static_cast<int>(layout));
+      laneData.push_back(static_cast<int>(data));
     }
-    return xegpu::LayoutAttr::get(r.getContext(), wiLayout, wiData);
+    return xegpu::LayoutAttr::get(r.getContext(), laneLayout, laneData);
   };
   /// Attach the layout attributes to the results of the operations.
   auto walkResult = top->walk([&](Operation *op) {
@@ -785,22 +785,22 @@ FailureOr<VectorType> getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
   if (!layout)
     return failure();
 
-  auto wiLayout = layout.getLaneLayout();
+  auto laneLayout = layout.getLaneLayout();
   assert((originalType.getRank() == 2 || originalType.getRank() == 3) &&
          "expecting 2D or 3D shape for the original vector type");
-  assert(wiLayout.size() == 2 && "expecting 2D shape for the wi layout");
+  assert(laneLayout.size() == 2 && "expecting 2D shape for the wi layout");
   // Original type can be 2D or 3D (array_length > 1), the last two dims are the
   // block shape.
   auto blockShape = originalType.getShape().take_back(2);
   // Check if the block vector shape can be distributed evenly.
-  if (blockShape[0] % wiLayout[0] != 0 || blockShape[1] % wiLayout[1] != 0)
+  if (blockShape[0] % laneLayout[0] != 0 || blockShape[1] % laneLayout[1] != 0)
     return failure();
 
   if (originalType.getRank() == 3) {
     distributedShape.push_back(originalType.getShape()[0]);
   }
   for (unsigned i = 0; i < 2; ++i) {
-    distributedShape.push_back(blockShape[i] / wiLayout[i]);
+    distributedShape.push_back(blockShape[i] / laneLayout[i]);
   }
   auto newVectorType =
       VectorType::get(distributedShape, originalType.getElementType());

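The renamed helper above derives the per-lane vector shape by dividing the trailing two (block) dimensions by the 2D lane layout and carrying a leading array_length dimension through unchanged. The sketch below covers just that shape arithmetic under the same evenness assumption; distributedShape is a hypothetical stand-in, not the MLIR helper.

```c++
#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

// Distributed per-lane shape: keep an optional leading array_length dim and
// divide the last two (block) dims by the 2D lane layout. Returns nullopt if
// the block shape does not divide evenly.
static std::optional<std::vector<int64_t>>
distributedShape(const std::vector<int64_t> &shape,
                 const std::vector<int64_t> &laneLayout) {
  std::size_t rank = shape.size(); // expected to be 2 or 3
  std::vector<int64_t> out;
  if (rank == 3)
    out.push_back(shape[0]); // array_length is not distributed
  for (std::size_t i = 0; i < 2; ++i) {
    int64_t dim = shape[rank - 2 + i];
    if (dim % laneLayout[i] != 0)
      return std::nullopt;
    out.push_back(dim / laneLayout[i]);
  }
  return out;
}

int main() {
  // 8x16 block over lane_layout = [1, 16] -> per-lane 8x1.
  auto s = distributedShape({8, 16}, {1, 16});
  return (s && (*s)[0] == 8 && (*s)[1] == 1) ? 0 : 1;
}
```
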
>From c6ccef2e84c1c5ffcfd8b6d98e14f73b559779d3 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 2 Apr 2025 21:47:40 +0000
Subject: [PATCH 32/45] refactor

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 7 -------
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp       | 9 ---------
 2 files changed, 16 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 57c9bbc7b29f3..80c6ce1160593 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -258,13 +258,6 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
     }
   }];
 
-  let builders = [
-    AttrBuilder<(ins
-      "ArrayRef<int>": $lane_layout,
-      "ArrayRef<int>": $lane_data
-    )>
-  ];
-
   let assemblyFormat = "`<` struct(params) `>`";
   let genVerifyDecl = 1;
 }
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index ccd0a48c8391e..c2508785d63b0 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -133,15 +133,6 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
   return success();
 }
 
-LayoutAttr LayoutAttr::get(mlir::MLIRContext *context, ArrayRef<int> laneLayout,
-                           ArrayRef<int> laneData) {
-  return Base::get(context, ScopeAttr::get(context, Scope::Lane),
-                   DenseI32ArrayAttr(), DenseI32ArrayAttr(),
-                   DenseI32ArrayAttr(),
-                   DenseI32ArrayAttr::get(context, laneLayout),
-                   DenseI32ArrayAttr::get(context, laneData));
-}
-
 //===----------------------------------------------------------------------===//
 // XeGPU_TensorDescType
 //===----------------------------------------------------------------------===//

>From cbd0af0c5bf32fcb395aafd22b83a5c0c9e8e804 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 4 Apr 2025 17:05:05 +0000
Subject: [PATCH 33/45] refine LayoutAttr verifier

---
 .../SPIRV/IR/SPIRVIntelExtEmbargoOps.td       |  85 +++++++++++++
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  25 +++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  51 ++++----
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        |  15 ++-
 mlir/test/Dialect/XeGPU/invalid.mlir          | 112 +++++++++++++++++-
 5 files changed, 254 insertions(+), 34 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td
new file mode 100644
index 0000000000000..e3e16a0966ada
--- /dev/null
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td
@@ -0,0 +1,85 @@
+//===- SPIRVIntelExtEmbargoOps.td - Intel SPIR-V extensions ---------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the op definition spec of Intel-specific SPIR-V extensions
+// These extensions are not part of Khronos specification and available under
+// Embargo.
+// Supported extensions
+// * SPV_INTEL_region_group
+//===----------------------------------------------------------------------===//
+
+
+#ifndef MLIR_DIALECT_SPIRV_IR_INTEL_EXT_EMBARGO_OPS
+#define MLIR_DIALECT_SPIRV_IR_INTEL_EXT_EMBARGO_OPS
+
+// -----
+
+def SPIRV_INTELSubRegionControlBarrierOp : SPIRV_IntelVendorOp<"SubRegionControlBarrier", []> {
+  let summary = "See extension SPV_INTEL_region_group";
+
+  let description = [{
+    Wait for all active invocations within the current sub-region to reach
+    the current point of execution.
+
+    All active invocations within the current sub-region reach this point of
+    execution before any invocation proceeds beyond it.
+
+    A sub-region is a subset of the workgroups in a region group. The region
+    group is partitioned into groups of SubRegionSize workgroups, and
+    the workgroups are ordered by their linearized ID. The first SubRegionSize
+    workgroups in this sequence are the first sub-region, the next
+    SubRegionSize workgroups are the next sub-region, etc. The total number of
+    workgroups in the region-group must be evenly divisible by SubRegionSize,
+    otherwise the behavior is undefined.
+
+    Behavior is undefined unless all invocations within the current sub-region
+    execute the same dynamic instance of this instruction. SubRegionSize value
+    must be the same for all invocations within the current sub-region,
+    or otherwise behavior is undefined.
+
+    If Semantics is not None, this instruction also serves as an
+    OpMemoryBarrier instruction, and also performs and adheres to the
+    description and semantics of an OpMemoryBarrier instruction with the
+    same Memory and Semantics operands. This allows atomically specifying
+    both a control barrier and a memory barrier (that is, without needing
+    two instructions). If Semantics is None, Memory is ignored.
+
+    #### Example:
+
+    ```mlir
+    spirv.SubRegionControlBarrier %0, "RegionINTEL", "None"
+    ```
+
+  }];
+
+
+  let availability = [
+    MinVersion<SPIRV_V_1_0>,
+    MaxVersion<SPIRV_V_1_6>,
+    Extension<[SPV_INTEL_region_group]>,
+    Capability<[SPIRV_C_RegionGroupINTEL]>
+  ];
+
+  let arguments = (ins
+    SPIRV_Int32:$subregion_size,
+    SPIRV_ScopeAttr:$memory_scope,
+    SPIRV_MemorySemanticsAttr:$memory_semantics
+  );
+
+  let assemblyFormat = [{
+    $subregion_size `,` $memory_scope `,` $memory_semantics attr-dict
+  }];
+
+  let results = (outs);
+
+  let hasVerifier = 0;
+}
+
+// -----
+
+#endif // MLIR_DIALECT_SPIRV_IR_INTEL_EXT_EMBARGO_OPS
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 80c6ce1160593..15aa053017b49 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -165,8 +165,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
     includes the following parameters:
 
     * `sg_layout`: Specifies the total number of subgroups and their layout within a workgroup.
-      It is mandatory for workgroup-level programming and optional for subgroup programming. Its
-      presence implies workgroup-level code.
+      It is mandatory for workgroup-level programming. Its presence implies workgroup-level code.
     * `sg_data`: Defines the data size accessed per subgroup. It is optionally used with `sg_layout`
       for workgroup-level programming. When it is left empty, the size accessed per subgroup can be
       derived from the tensor shape and `sg_layout` using the formula:
@@ -199,7 +198,15 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
       In this example, there are 16 work-items per subgroup, and is organized as
       [[0, 2, 4, ..., 14], [1, 3, 5, ..., 15]]. The distribution unit is 1x1.
 
-      3. Workgroup level layout:
+      3. Subgroup level layout with inst_data
+      ```mlir
+      #xegpu.layout<inst_data = [8, 16], lane_layout = [2, 8], lane_data = [2, 2]>
+      ```
+      In this example, the original problem size is divided into smaller subproblems of size [8, 16],
+      which are further distributed across 16 work-items organized as [[0, 1, 2, ..., 7], [8, 9, ..., 15]].
+      Each work-item is assigned a contiguous 2x2 block.
+
+      4. Workgroup level layout:
       ```mlir
       #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [2, 8], lane_data = [1, 1]>
       ```
@@ -207,7 +214,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
       arranged as [[0, 1, 2, 3], [4, 5, 6, 7]]. Each subgroup accesses a 16x16 block per instruction, which
       is further distributed to 16 work items which is organized as [[0, 1, 2, .., 7],[8, 9, .., 15]].
 
-      4. Workgroup level layout with order:
+      5. Workgroup level layout with order:
       ```mlir
       #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [2, 8], lane_data = [1, 1], order = [0, 1]>
       ```
@@ -215,6 +222,14 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
       arranged as [[0, 2, 4, 6], [1, 3, 5, 7]]. Each subgroup accesses a 16x16 block per instruction, which
       is further distributed to 16 work items which is organized as [[0, 2, 4, ..., 14], [1, 3, 5, ..., 15]].
 
+      6. Workgroup level layout with inst_data:
+      ```mlir
+      #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], inst_data = [8, 16], lane_layout = [2, 8], lane_data = [1, 1]>
+      ```
+      This example is similar to the previous ones, but the `inst_data` parameter divides `sg_data` into two instructions,
+      each processing an 8x16 block. These blocks are further distributed across 16 work-items with a distribution unit of 1x1.
+      Unlike the 2x2 distribution unit in example 3, which results in accessing contiguous 2x2 blocks, the 1x1 distribution
+      unit may result in non-contiguous access.
   }];
 
   let parameters = (ins
@@ -252,6 +267,8 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
     int64_t getRank() {
       if (auto attr = getSgLayout())
         return attr.size();
+      if (auto attr = getInstData())
+        return attr.size();
       if (auto attr = getLaneLayout())
         return attr.size();
       return 0;
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index c95f9e90f589e..8206db0198eaf 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -79,13 +79,27 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
   // A valid layout must include at least one of sg_layout and lane_layout.
   // sg_layout is essential for Workgroup layout, while lane_layout is
   // required for Subgroup layout.
-  if (!sg_layout && !lane_layout) {
-    return emitError() << "expected at least one of sg_layout or lane_layout";
+  if (!sg_layout && !inst_data && !lane_layout) {
+    return emitError()
+           << "expected at least one of sg_layout, inst_data or lane_layout";
+  }
+
+  // Check that sg_layout, inst_data, and lane_layout have the same rank
+  // when they are present.
+
+  if (sg_layout && inst_data && sg_layout.size() != inst_data.size()) {
+    return emitError()
+           << "expected sg_layout and inst_data to have the same rank";
   }
 
   if (sg_layout && lane_layout && sg_layout.size() != lane_layout.size()) {
     return emitError()
-           << "expected sg_layout and lane_layout having the same rank";
+           << "expected sg_layout and lane_layout to have the same rank";
+  }
+
+  if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) {
+    return emitError()
+           << "expected inst_data and lane_layout to have the same rank";
   }
 
   // sg_data is optional for Workgroup layout, but its presence requires
@@ -95,17 +109,7 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
       return emitError() << "expected sg_layout being used with sg_data";
     if (sg_data.size() != sg_layout.size())
       return emitError()
-             << "expected sg_data having the same rank as sg_layout";
-  }
-
-  // inst_data is optional for Subgroup layout, but its presence requires
-  // lane_layout.
-  if (inst_data) {
-    if (!lane_layout)
-      return emitError() << "expected lane_layout being used with inst_data";
-    if (inst_data.size() != lane_layout.size())
-      return emitError()
-             << "expected inst_data having the same rank as lane_layout";
+             << "expected sg_data and sg_layout to have the same rank";
   }
 
   // lane_data is optional for Subgroup layout, but its presence requires
@@ -115,16 +119,21 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
       return emitError() << "expected lane_layout being used with lane_data";
     if (lane_data.size() != lane_layout.size())
       return emitError()
-             << "expected lane_data having the same rank as lane_layout";
+             << "expected lane_data and lane_layout to have the same rank";
   }
 
   if (order) {
     if (!sg_layout && !lane_layout)
       return emitError()
              << "expected sg_layout/lane_layout being used with order";
-    if (order.size() != sg_layout.size() && order.size() != lane_layout.size())
+
+    if (sg_layout && order.size() != sg_layout.size())
+      return emitError()
+             << "expected order and sg_layout to have the same rank";
+
+    if (lane_layout && order.size() != lane_layout.size())
       return emitError()
-             << "expected order having the same rank as sg_layout/lane_layout";
+             << "expected order and lane_layout to have the same rank";
   }
 
   return success();
@@ -341,9 +350,9 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
   auto tdescShape = getShape();
 
   auto laneDataSize = 1, sgSize = 1;
-  for (auto [wiDim, laneDataDim] : llvm::zip_equal(laneLayout, laneData)) {
+  for (auto [laneDim, laneDataDim] : llvm::zip_equal(laneLayout, laneData)) {
     laneDataSize *= laneDataDim;
-    sgSize *= wiDim;
+    sgSize *= laneDim;
   }
 
   // Case 1: regular loads/stores
@@ -363,9 +372,9 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
   // Case 2: block loads/stores
   // Check if the tensor descriptor shape is distributable.
   int64_t tensorSize = 1;
-  for (auto [tdescDim, wiDim, laneDataDim] :
+  for (auto [tdescDim, laneDim, laneDataDim] :
        llvm::zip_equal(tdescShape, laneLayout, laneData)) {
-    assert((tdescDim % (wiDim * laneDataDim) == 0) &&
+    assert((tdescDim % (laneDim * laneDataDim) == 0) &&
            "tensor descriptor shape is not distributable");
     tensorSize *= tdescDim;
   }
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index a60288f2eb77d..12b45a223183a 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -119,9 +119,10 @@ static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
     auto lane_data = attr.getLaneData();
     data = lane_data ? lane_data.asArrayRef() : defaults;
   }
-  for (auto [s, d, l] : llvm::zip_equal(shape, data, layout)) {
-    // check s % (d * l) != 0
-    if (s % d != 0 || (s / d) % l != 0)
+  for (auto [dimSize, dataFactor, layoutFactor] :
+       llvm::zip_equal(shape, data, layout)) {
+    // check dimSize % (dataFactor * layoutFactor) != 0
+    if (dimSize % dataFactor != 0 || (dimSize / dataFactor) % layoutFactor != 0)
       return false;
   }
   return true;
@@ -602,17 +603,15 @@ LogicalResult DpasOp::verify() {
 
   // make sure the layout attribute is either set for every available
   // operand or simply not set at all. C is special, since ACC is optional.
-  // If they are all set, they also should be in the same scope.
-  auto isValidSet = [&]() {
+  auto hasValidLayoutAttrs = [&]() {
     bool result = (aLayout != nullptr) ^ (bLayout != nullptr);
     if (hasAcc()) {
       result |= (aLayout != nullptr) ^ (cLayout != nullptr);
     }
-    result = !result;
-    return result;
+    return !result;
   };
 
-  if (!isValidSet())
+  if (!hasValidLayoutAttrs())
     return emitOpError(
         "layout attributes should be either set for all operands (for SIMT "
         "code) or not set at all (for SIMD code).");
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 8b5e42af2f7b8..596befa335618 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -538,4 +538,114 @@ func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) {
   %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
                                 resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
   gpu.return
-}
\ No newline at end of file
+}
+
+// -----
+func.func @tensor_desc_invalid_layout_attr(%src: ui64, %offsets: vector<16xindex>) {
+  %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+      !xegpu.tensor_desc<16x2xf32,
+        #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+         // expected-error at +1 {{expected at least one of sg_layout, inst_data or lane_layout}}
+         #xegpu.layout<sg_data = [16, 2], lane_data = [1, 2]>>
+  return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+  %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+      !xegpu.tensor_desc<16x2xf32,
+        #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+        // expected-error at +1 {{expected sg_layout and lane_layout to have the same rank}}
+        #xegpu.layout<sg_layout = [1, 1, 1], sg_data = [16, 2, 1], lane_layout = [8, 1], lane_data = [1, 2]>>
+  return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+  %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+      !xegpu.tensor_desc<16x2xf32,
+        #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+        // expected-error at +1 {{expected sg_layout and inst_data to have the same rank}}
+        #xegpu.layout<sg_layout = [1, 1, 1], sg_data = [16, 2, 1], inst_data = [16, 2]>>
+  return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+  %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+      !xegpu.tensor_desc<16x2xf32,
+        #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+        // expected-error at +1 {{expected inst_data and lane_layout to have the same rank}}
+        #xegpu.layout<inst_data = [16, 2, 1], lane_layout = [8, 1], lane_data = [1, 2]>>
+  return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+  %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+      !xegpu.tensor_desc<16x2xf32,
+        #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+        // expected-error at +1 {{expected lane_data and lane_layout to have the same rank}}
+        #xegpu.layout<inst_data = [16, 2], lane_layout = [8, 1], lane_data = [1, 2, 1]>>
+  return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+  %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+      !xegpu.tensor_desc<16x2xf32,
+        #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+        // expected-error at +1 {{expected sg_data and sg_layout to have the same rank}}
+        #xegpu.layout<sg_layout = [1, 1], sg_data = [16, 2, 1], inst_data = [16, 2]>>
+  return
+}
+
+// -----
+func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
+  %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+      !xegpu.tensor_desc<16x2xf32,
+        #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+        // expected-error at +1 {{expected sg_layout being used with sg_data}}
+        #xegpu.layout<sg_data = [16, 2], lane_layout = [8, 1], lane_data = [1, 2]>>
+  return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+  %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+      !xegpu.tensor_desc<16x2xf32,
+        #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+        // expected-error at +1 {{expected lane_layout being used with lane_data}}
+        #xegpu.layout<inst_data = [16, 2], lane_data = [1, 2]>>
+  return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+  %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+      !xegpu.tensor_desc<16x2xf32,
+        #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+        // expected-error at +1 {{expected sg_layout/lane_layout being used with order}}
+        #xegpu.layout<inst_data = [16, 2], order = [0, 1]>>
+  return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+  %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+      !xegpu.tensor_desc<16x2xf32,
+        #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+        // expected-error at +1 {{expected order and sg_layout to have the same rank}}
+        #xegpu.layout<sg_layout = [1, 1], sg_data = [16, 2], order = [0, 1, 2]>>
+  return
+}
+
+// -----
+func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
+  %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+      !xegpu.tensor_desc<16x2xf32,
+        #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+        // expected-error at +1 {{expected order and lane_layout to have the same rank}}
+        #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2], order = [0, 1, 2]>>
+  return
+}

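The refined verifier above requires whichever of sg_layout, inst_data, and lane_layout are present to agree on rank (with sg_data, lane_data, and order checked against their respective anchors). Below is a small sketch of the core rank-consistency rule over optional ranks, using a hypothetical helper ranksConsistent; the real verifier emits a dedicated diagnostic for each mismatched pair.

```c++
#include <cstdint>
#include <optional>

// All present ranks must be equal; absent fields are ignored. At least one of
// sg_layout, inst_data, or lane_layout must be present.
static bool ranksConsistent(std::optional<int64_t> sgLayout,
                            std::optional<int64_t> instData,
                            std::optional<int64_t> laneLayout) {
  std::optional<int64_t> ref;
  for (auto r : {sgLayout, instData, laneLayout}) {
    if (!r)
      continue;
    if (ref && *ref != *r)
      return false;
    ref = r;
  }
  return ref.has_value();
}

int main() {
  bool bad = ranksConsistent(/*sg_layout=*/3, /*inst_data=*/2, std::nullopt);
  bool good = ranksConsistent(2, 2, 2);
  return (!bad && good) ? 0 : 1;
}
```
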
>From 3fb4fd447825dd6e92e6354a5629e230fac09552 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 4 Apr 2025 17:08:34 +0000
Subject: [PATCH 34/45] add unit test

---
 mlir/test/Dialect/XeGPU/invalid.mlir | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 596befa335618..48df33a591908 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -600,6 +600,16 @@ func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
   return
 }
 
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+  %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+      // expected-error at +1 {{expected layout rank to match tensor rank}}
+      !xegpu.tensor_desc<16x2xf32,
+        #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+        #xegpu.layout<sg_layout = [1], sg_data = [32], inst_data = [16]>>
+  return
+}
+
 // -----
 func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
   %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->

>From 77fdfefb7b23b5a032f8f3da4c786ccc321a7e1a Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 4 Apr 2025 17:12:46 +0000
Subject: [PATCH 35/45] remove dump file

---
 .../SPIRV/IR/SPIRVIntelExtEmbargoOps.td       | 85 -------------------
 1 file changed, 85 deletions(-)
 delete mode 100644 mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td
deleted file mode 100644
index e3e16a0966ada..0000000000000
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td
+++ /dev/null
@@ -1,85 +0,0 @@
-//===- SPIRVIntelExtEmbargoOps.td - Intel SPIR-V extensions ---------------*- tablegen -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This is the op definition spec of Intel-specific SPIR-V extensions
-// These extensions are not part of Khronos specification and available under
-// Embargo.
-// Supported extensions
-// * SPV_INTEL_region_group
-//===----------------------------------------------------------------------===//
-
-
-#ifndef MLIR_DIALECT_SPIRV_IR_INTEL_EXT_EMBARGO_OPS
-#define MLIR_DIALECT_SPIRV_IR_INTEL_EXT_EMBARGO_OPS
-
-// -----
-
-def SPIRV_INTELSubRegionControlBarrierOp : SPIRV_IntelVendorOp<"SubRegionControlBarrier", []> {
-  let summary = "See extension SPV_INTEL_region_group";
-
-  let description = [{
-    Wait for all active invocations within the current sub-region to reach
-    the current point of execution.
-
-    All active invocations within the current sub-region reach this point of
-    execution before any invocation proceeds beyond it.
-
-    A sub-region is a subset of the workgroups in a region group. The region
-    group is partitioned into groups of SubRegionSize workgroups, and
-    the workgroups are ordered by their linearized ID. The first SubRegionSize
-    workgroups in this sequence are the first sub-region, the next
-    SubRegionSize workgroups are the next sub-region, etc. The total number of
-    workgroups in the region-group must be evenly divisible by SubRegionSize,
-    otherwise the behavior is undefined.
-
-    Behavior is undefined unless all invocations within the current sub-region
-    execute the same dynamic instance of this instruction. SubRegionSize value
-    must be the same for all invocations within the current sub-region,
-    or otherwise behavior is undefined.
-
-    If Semantics is not None, this instruction also serves as an
-    OpMemoryBarrier instruction, and also performs and adheres to the
-    description and semantics of an OpMemoryBarrier instruction with the
-    same Memory and Semantics operands. This allows atomically specifying
-    both a control barrier and a memory barrier (that is, without needing
-    two instructions). If Semantics is None, Memory is ignored.
-
-    #### Example:
-
-    ```mlir
-    spirv.SubRegionControlBarrier %0, "RegionINTEL", "None"
-    ```
-
-  }];
-
-
-  let availability = [
-    MinVersion<SPIRV_V_1_0>,
-    MaxVersion<SPIRV_V_1_6>,
-    Extension<[SPV_INTEL_region_group]>,
-    Capability<[SPIRV_C_RegionGroupINTEL]>
-  ];
-
-  let arguments = (ins
-    SPIRV_Int32:$subregion_size,
-    SPIRV_ScopeAttr:$memory_scope,
-    SPIRV_MemorySemanticsAttr:$memory_semantics
-  );
-
-  let assemblyFormat = [{
-    $subregion_size `,` $memory_scope `,` $memory_semantics attr-dict
-  }];
-
-  let results = (outs);
-
-  let hasVerifier = 0;
-}
-
-// -----
-
-#endif // MLIR_DIALECT_SPIRV_IR_INTEL_EXT_EMBARGO_OPS

>From 2751332899762e16a0a7424f38512554f5f5ab90 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 4 Apr 2025 17:16:01 +0000
Subject: [PATCH 36/45] fix typo

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 12b45a223183a..8cabfeec9b9de 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -564,7 +564,7 @@ LogicalResult StoreScatterOp::verify() {
                           [&]() { return emitOpError(); });
 }
 
-//===---------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 // XeGPU_UpdateOffsetOp
 //===----------------------------------------------------------------------===//
 void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,

>From d281a149c266233ba33cf8f10368240ddc08a7d7 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 4 Apr 2025 18:08:07 +0000
Subject: [PATCH 37/45] fix an error after merging with main

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 81333f2589ee6..171a15ce27b59 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -298,17 +298,10 @@ LogicalResult TensorDescType::verify(
                               "contiguous elements";
     }
 
-    // For 1D tensor, pad the shape with an outer unit dimension to allow common
-    // validation logic.
-    SmallVector<int64_t> tensorShape(shape);
-    if (rank == 1)
-      tensorShape = {1, tensorShape.back()};
-
-    size_t dims = tensorShape.size();
-    for (size_t i = 0; i < dims; ++i) {
+    for (size_t i = 0; i < shape.size(); ++i) {
       uint32_t numElemPerWi = laneLayout[i] * laneData[i];
-      if (tensorShape[i] < numElemPerWi || tensorShape[i] % numElemPerWi != 0)
-        return emitError() << "cannot distribute " << tensorShape[i] << " over "
+      if (shape[i] < numElemPerWi || shape[i] % numElemPerWi != 0)
+        return emitError() << "cannot distribute " << shape[i] << " over "
                            << laneLayout[i] << " work items with "
                            << laneData[i] << " elements each";
     }

>From fb28ce83df7346e6c836f512d6a10ad274b50af3 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 7 Apr 2025 15:23:47 +0000
Subject: [PATCH 38/45] new line at the end of file

---
 mlir/test/Dialect/XeGPU/ops.mlir | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 54f14c6cb8c65..e9895e0d0a71d 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -723,5 +723,4 @@ gpu.func @test_convert_layout_wg(%a: vector<32x64xf16>) {
   gpu.return
 }
 
-
 }

>From f464662ab164c11cbe6630ddf6acd9f100c83c6e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 8 Apr 2025 21:52:23 +0000
Subject: [PATCH 39/45] update doc

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 6 +++---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp           | 7 ++++++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 15aa053017b49..ab5fb4a4a7de9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -202,9 +202,9 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
       ```mlir
       #xegpu.layout<inst_data = [8, 16], lane_layout = [2, 8], lane_data = [2, 2]>
       ```
-      In this example, the original problem size is divided into smaller subproblems of size [8, 16],
-      which are further distributed across 16 work-items organized as [[0, 1, 2, ..., 7], [8, 9, ..., 15]].
-      Each work-item is assigned a contiguous 2x2 block.
+      In this example, the original problem size is partitioned into smaller subproblems of dimensions [8, 16],
+      which are then distributed among 16 work-items arranged as [[0, 1, 2, ..., 7], [8, 9, ..., 15]]. Each
+      work-item is assigned four 2x2 blocks in a round-robin manner.
 
       4. Workgroup level layout:
       ```mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 8cabfeec9b9de..0d67e3d70f945 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -105,6 +105,12 @@ isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
                    << " for tensor descriptor " << tdescTy;
 }
 
+// Checks if the given shape is evenly distributed based on the layout
+// and data factors provided by the LayoutAttr. The function ensures that
+// each dimension of the shape can be evenly divided by the corresponding
+// data factor, and the resulting quotient can be evenly divided by the
+// layout factor. Returns `true` if the shape is evenly distributed,
+// otherwise `false`.
 static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
                               xegpu::LayoutAttr attr) {
   assert(attr && "Layout attribute is missing.");
@@ -121,7 +127,6 @@ static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
   }
   for (auto [dimSize, dataFactor, layoutFactor] :
        llvm::zip_equal(shape, data, layout)) {
-    // check dimSize % (dataFactor * layoutFactor) != 0
     if (dimSize % dataFactor != 0 || (dimSize / dataFactor) % layoutFactor != 0)
       return false;
   }

>From 2a1d373a61ca10bca9064a2afa7ac1fb88a87fc8 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Thu, 10 Apr 2025 18:45:30 +0000
Subject: [PATCH 40/45] Switch to 1D representation for SIMT

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |  17 +-
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |   3 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  26 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 227 +++++++++++-------
 mlir/test/Dialect/XeGPU/invalid.mlir          | 100 ++------
 mlir/test/Dialect/XeGPU/ops.mlir              | 162 ++++++-------
 6 files changed, 250 insertions(+), 285 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 16a7f63d60c82..9af6eaf69aec3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -833,16 +833,14 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
     data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
     and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
     also requires A and B to be loaded with the required data layout. Specially,
-
     VNNI layout is required for B operand. It is achieved via adding `packed`
     attribute to the `load_nd` operator.  Due to the VNNI transformation, B operands
     can be represented as a 3D vector, with the last dimension representing the VNNI
     factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
     can be represented as `B: vector<8x16x2xf16>`.
 
-    In SIMT mode, DpasOp expects layout attributes `a`, `b`, and `c` (only if acc is used)
-    which describe the data fragment owned by each work-item w.r.t. the tensor descriptor
-    these data are loaded from.
+    In SIMT code, each work-item in a subgroup holds a fragment of A, B, C, and the result,
+    each represented as a 1D vector.
 
     Note: on PVC, the hardware can perform load with VNNI transformation when data
           element type is 16-bit or lower precision, taking 2 or 4 elements from
@@ -850,13 +848,10 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
   }];
 
   let arguments = (ins
-    XeGPU_DpasOpType : $lhs,
-    XeGPU_DpasOpType : $rhs,
-    Optional<XeGPU_Vector2DType>: $acc,
-    OptionalAttr<XeGPU_LayoutAttr>:$a_layout,
-    OptionalAttr<XeGPU_LayoutAttr>:$b_layout,
-    OptionalAttr<XeGPU_LayoutAttr>:$c_layout);
-  let results = (outs XeGPU_Vector2DType: $result);
+    XeGPU_DpasOprType : $lhs,
+    XeGPU_DpasOprType : $rhs,
+    Optional<XeGPU_DpasResType>: $acc);
+  let results = (outs XeGPU_DpasResType: $result);
 
   let extraClassDeclaration = [{
     VectorType getLhsType() {
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 173f1462fdd73..3cb71788a15ef 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -17,7 +17,8 @@ def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64,
 def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
 def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>;
 def XeGPU_BaseAddrType: AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, UI64, UI32, I64, I32]>;
-def XeGPU_DpasOpType: VectorOfRankAndType<[2, 3], [XeGPU_ScalarType]>;
+def XeGPU_DpasOprType: VectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
+def XeGPU_DpasResType: VectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
 def XeGPU_OffsetType: VectorOfRankAndType<[1], [Index]>;
 def XeGPU_MaskType: AnyTypeOf<[VectorOfRankAndType<[1], [I1]>, I1]>;
 def XeGPU_ValueType: AnyTypeOf<[VectorOfRankAndType<[1,2,3,4], [XeGPU_ScalarType]>, XeGPU_ScalarType]>;
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 171a15ce27b59..269e445c3790c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -10,6 +10,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include <numeric>
 
 namespace mlir {
 namespace xegpu {
@@ -336,19 +337,20 @@ LogicalResult TensorDescType::verify(
 //        [n_distribution_units, lane_data_size]
 FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
   auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
-  // If no layout is provided, tensor desc is not used in SIMT mode.
-  if (!layout)
+  // This only works for a subgroup-level layout, which has only lane_layout
+  // and lane_data, and is used to distribute SIMD code into SIMT code.
+  if (!layout || !layout.isSgLayout())
     return failure();
 
   SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
   SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
   auto tdescShape = getShape();
 
-  auto laneDataSize = 1, sgSize = 1;
-  for (auto [laneDim, laneDataDim] : llvm::zip_equal(laneLayout, laneData)) {
-    laneDataSize *= laneDataDim;
-    sgSize *= laneDim;
-  }
+  // Compute sgSize by multiplying the elements of laneLayout:
+  // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1]
+  // e.g. for 1D layout, sgSize = laneLayout[0]
+  auto sgSize = std::accumulate(laneLayout.begin(), laneLayout.end(), 1,
+                                std::multiplies<int64_t>());
 
   // Case 1: regular loads/stores
   auto scatterAttr = getEncodingAsScatterTensorDescAttr();
@@ -356,12 +358,9 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
     auto chunkSize = scatterAttr.getChunkSize().getInt();
     // Verify if the first dimension of the tensor descriptor shape is
     // distributable.
-    assert(tdescShape[0] % (laneLayout[0]) == 0 &&
+    assert(tdescShape[0] == laneLayout[0] &&
            "tensor descriptor shape is not distributable");
-    if (chunkSize > 1)
-      return VectorType::get({chunkSize / laneDataSize, laneDataSize},
-                             getElementType());
-    return VectorType::get({laneDataSize}, getElementType());
+    return VectorType::get({chunkSize}, getElementType());
   }
 
   // Case 2: block loads/stores
@@ -376,8 +375,7 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
   // tensorSize must be adjusted for array_length.
   tensorSize *= getArrayLength();
 
-  return VectorType::get({tensorSize / (sgSize * laneDataSize), laneDataSize},
-                         getElementType());
+  return VectorType::get({tensorSize / sgSize}, getElementType());
 }
 
 } // namespace xegpu
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 0d67e3d70f945..fef39508c3bfe 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -73,38 +73,6 @@ static bool isWriteHintOrNone(const CachePolicyAttr &attr) {
          kind == CachePolicy::WRITE_BACK || kind == CachePolicy::WRITE_THROUGH;
 }
 
-// Helper to validate value shape of LoadNd and StoreNd ops.
-static LogicalResult
-isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
-                 ArrayRef<int64_t> adjustedTdescShape,
-                 function_ref<InFlightDiagnostic()> emitError) {
-  auto layout = tdescTy.getLayoutAttr();
-  auto valueShape = valueTy.getShape();
-  // layout not present means IR is in SIMD mode. In this case value shape must
-  // match adjusted tensor descriptor shape.
-  if (!layout)
-    return valueShape == adjustedTdescShape
-               ? success()
-               : emitError()
-                     << "Value shape " << makeString(valueShape)
-                     << " is not consistent with tensor descriptor " << tdescTy;
-
-  // layout present means IR is in SIMT mode. In this case layout determines the
-  // value shape.
-  auto expectedValueShapeOrFailure = tdescTy.getDistributedVectorType();
-  assert(succeeded(expectedValueShapeOrFailure) &&
-         "Failed to compute distributed vector shape for "
-         "tensor descriptor ");
-
-  return valueTy == expectedValueShapeOrFailure.value()
-             ? success()
-             : emitError()
-                   << "Result shape " << makeString(valueShape)
-                   << " is not consistent with distributed vector shape "
-                   << makeString(expectedValueShapeOrFailure.value().getShape())
-                   << " for tensor descriptor " << tdescTy;
-}
-
 // Checks if the given shape is evenly distributed based on the layout
 // and data factors provided by the LayoutAttr. The function ensures that
 // each dimension of the shape can be evenly divided by the corresponding
@@ -302,9 +270,35 @@ LogicalResult LoadNdOp::verify() {
   if (!isReadHintOrNone(getL3HintAttr()))
     return emitOpError("invalid l3_hint: ") << getL3HintAttr();
 
+  // Handling a 1D vector as the result can be complex. It may represent the
+  // outcome of a 1D block load in SIMD mode or a fragment of a block load
+  // result in SIMT mode. In the latter case, the tensor descriptor must be
+  // evenly distributed, with each lane holding an equally sized fragment of
+  // the result. Only subgroup size 8 or 16 is supported.
+  if (valueTy.getRank() == 1 &&
+      valueTy.getNumElements() < tdescTy.getNumElements()) {
+    // SIMT mode doesn't need LayoutAttr.
+    if (tdescTy.getLayoutAttr())
+      return emitOpError()
+             << "TensorDesc doesn't need LayoutAttr for SIMT code";
+
+    int tdescElems = tdescTy.getNumElements() * tdescTy.getArrayLength();
+    int valueElems = valueTy.getNumElements();
+
+    int lanes = tdescElems % valueElems == 0 ? tdescElems / valueElems : -1;
+    if (lanes != 16 && lanes != 8) {
+      return emitOpError()
+             << "Result shape " << makeString(getShapeOf(valueTy))
+             << " is not a valid distribution for tensor descriptor "
+             << tdescTy;
+    }
+    return success();
+  }
+
+  // Check SIMD mode.
   auto array_len = tdescTy.getArrayLength();
   // adjusted tensor descriptor shape tracks the expected shape of the result.
-  auto adjustedTdescShape = getShapeOf(tdescTy);
+  auto tdescShape = getShapeOf(tdescTy);
   auto valueShape = getShapeOf(valueTy);
 
   if (getTranspose()) {
@@ -316,7 +310,7 @@ LogicalResult LoadNdOp::verify() {
     });
 
     if (valid)
-      transpose(trans, adjustedTdescShape);
+      transpose(trans, tdescShape);
     else
       mlir::emitWarning(getLoc()) << "Invalid transpose attr. It is ignored.";
   }
@@ -325,8 +319,8 @@ LogicalResult LoadNdOp::verify() {
     if (tdescTy.getRank() == 2) {
       const int axis = 0;
       auto vnni_factor = valueShape.back();
-      adjustedTdescShape[axis] /= vnni_factor;
-      adjustedTdescShape.push_back(vnni_factor);
+      tdescShape[axis] /= vnni_factor;
+      tdescShape.push_back(vnni_factor);
     } else {
       mlir::emitWarning(getLoc())
           << "Invalid Packed Attr. It is ignored (available for 2D "
@@ -335,12 +329,16 @@ LogicalResult LoadNdOp::verify() {
   }
 
   if (array_len > 1) {
-    auto it = adjustedTdescShape.begin();
-    adjustedTdescShape.insert(it, array_len);
+    tdescShape.insert(tdescShape.begin(), array_len);
+  }
+
+  if (tdescShape != valueShape) {
+    return emitOpError() << "Result shape " << makeString(valueShape)
+                         << " is not consistent with tensor descriptor "
+                         << tdescTy;
   }
 
-  return isArgShapesValid(tdescTy, valueTy, adjustedTdescShape,
-                          [&]() { return emitOpError(); });
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
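
For reference, a minimal sketch of a load_nd that satisfies the SIMT path added above (shapes mirror the updated test_load_nd_simt_3 later in this patch; the 16-lane count follows from the 8-or-16 rule in the comment, and the function name is illustrative only):

func.func @example_load_nd_simt(%src: memref<24x32xf32>) {
  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
  // tdescElems = 8 * 16 = 128, valueElems = 8, so 128 / 8 = 16 lanes: accepted.
  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
      l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
  return
}
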
@@ -371,8 +369,37 @@ LogicalResult StoreNdOp::verify() {
   auto tdescShape = getShapeOf(dstTy);
   auto valueShape = getShapeOf(valTy);
 
-  return isArgShapesValid(dstTy, valTy, tdescShape,
-                          [&]() { return emitOpError(); });
+  // Similar to LoadNdOp, handling a 1D vector as the value can be complex. It
+  // may represent the input of a 1D block store in SIMD mode or a fragment of
+  // a block store input in SIMT mode. In the latter case, the tensor descriptor
+  // must be evenly distributed, with each lane holding an equally sized
+  // fragment of the input. Only subgroup size 8 or 16 is supported.
+  if (valTy.getRank() == 1 && valTy.getNumElements() < dstTy.getNumElements()) {
+    // SIMT mode doesn't need LayoutAttr.
+    if (dstTy.getLayoutAttr())
+      return emitOpError()
+             << "TensorDesc doesn't need LayoutAttr for SIMT code";
+
+    int tdescElems = dstTy.getNumElements() * dstTy.getArrayLength();
+    int valueElems = valueShape[0];
+
+    int lanes = tdescElems % valueElems == 0 ? tdescElems / valueElems : -1;
+    if (lanes != 16 && lanes != 8) {
+      return emitOpError()
+             << "Value shape " << makeString(getShapeOf(valTy))
+             << " is not a valid distribution for tensor descriptor " << dstTy;
+    }
+    return success();
+  }
+
+  // SIMD code should have the same shape as the tensor descriptor.
+  if (tdescShape != valueShape) {
+    return emitOpError() << "Value shape " << makeString(valueShape)
+                         << " is not consistent with tensor descriptor "
+                         << dstTy;
+  }
+
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
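
Likewise, a sketch of a SIMT store_nd this check accepts (mirroring the updated test_store_nd_simt; names are illustrative):

func.func @example_store_nd_simt(%dst: memref<24x32xf16>) {
  %data = arith.constant dense<1.0> : vector<48xf16>
  %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
  // tdescElems = 24 * 32 = 768, valueElems = 48, so 768 / 48 = 16 lanes: accepted.
  xegpu.store_nd %data, %1 <{l1_hint = #xegpu.cache_hint<write_back>,
      l2_hint = #xegpu.cache_hint<uncached>}> : vector<48xf16>, !xegpu.tensor_desc<24x32xf16>
  return
}
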
@@ -520,14 +547,41 @@ LogicalResult LoadGatherOp::verify() {
   if (tdescShape[0] != maskShape[0])
     return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
 
+  auto chunkSize = tdescTy.getChunkSize();
+  // For SIMT code, the value should be a 1D vector whose size equals chunkSize.
+  if (valueTy.getRank() == 1 && valueTy.getNumElements() != tdescShape[0]) {
+    if (valueTy.getNumElements() != chunkSize) {
+      return emitOpError()
+             << "Result shape " << makeString(valueShape)
+             << " is not a valid distribution for tensor descriptor "
+             << tdescTy;
+    } else { // valid SIMT code doesn't need LayoutAttr and TransposeAttr.
+      if (tdescTy.getLayoutAttr())
+        return emitOpError()
+               << "TensorDesc doesn't need LayoutAttr for SIMT code";
+      if (getTransposeAttr())
+        return emitOpError() << "doesn't need TransposeAttr for SIMT code";
+    }
+    return success();
+  } else if (valueTy.getRank() == 1 && tdescShape[0] == chunkSize) {
+    // For the 1D-vector case where valueTy.getNumElements() == tdescShape[0],
+    // it is valid SIMT code if chunkSize happens to be the same as the
+    // subgroup size, e.g., tensor_desc<16x16xf16, chunkSize = 16>.
+    return success();
+  }
+
+  // For SIMD code verification.
   if (tdescTy.getRank() == 2) {
     if (!getTransposeAttr())
       return emitOpError("load of rank-2 tensor has to be transposed.");
     transpose({1, 0}, tdescShape);
   }
 
-  return isArgShapesValid(tdescTy, valueTy, tdescShape,
-                          [&]() { return emitOpError(); });
+  if (tdescShape != valueShape)
+    return emitOpError() << "Result shape " << makeString(valueShape)
+                         << " is not consistent with tensor descriptor "
+                         << tdescTy;
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
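
A sketch of the SIMT gather load this verifier accepts (mirroring the updated test_load_simt; each lane reads one chunk, and names are illustrative):

func.func @example_load_gather_simt(%src: ui64) {
  %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  %mask = arith.constant dense<1> : vector<4xi1>
  %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
  // The per-lane result is a 1D vector of chunk_size elements; no TransposeAttr is needed.
  %2 = xegpu.load %1, %mask <{l1_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<2xf32>
  return
}
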
@@ -559,14 +613,42 @@ LogicalResult StoreScatterOp::verify() {
   if (tdescShape[0] != maskShape[0])
     return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
 
+  auto chunkSize = tdescTy.getChunkSize();
+  // For SIMT code, the value should be a 1D vector whose size equals chunkSize.
+  if (valueTy.getRank() == 1 && valueTy.getNumElements() != tdescShape[0]) {
+    if (valueTy.getNumElements() != chunkSize) {
+      return emitOpError()
+             << "Value shape " << makeString(valueShape)
+             << " is not a valid distribution for tensor descriptor "
+             << tdescTy;
+    } else { // valid SIMT code doesn't need LayoutAttr and TransposeAttr.
+      if (tdescTy.getLayoutAttr())
+        return emitOpError()
+               << "TensorDesc doesn't need LayoutAttr for SIMT code";
+      if (getTransposeAttr())
+        return emitOpError() << "doesn't need TransposeAttr for SIMT code";
+    }
+    return success();
+  } else if (valueTy.getRank() == 1 && tdescShape[0] == chunkSize) {
+    // For the 1D-vector case where valueTy.getNumElements() == tdescShape[0],
+    // it is valid SIMT code if chunkSize happens to be the same as the
+    // subgroup size, e.g., tensor_desc<16x16xf16, chunkSize = 16>.
+    return success();
+  }
+
+  // for SIMD code verification.
   if (tdescTy.getRank() == 2) {
     if (!getTransposeAttr())
       return emitOpError("Store of a rank-2 tensor has to be transposed.");
     transpose({1, 0}, tdescShape);
   }
 
-  return isArgShapesValid(tdescTy, valueTy, tdescShape,
-                          [&]() { return emitOpError(); });
+  if (tdescShape != valueShape)
+    return emitOpError() << "Value shape " << makeString(valueShape)
+                         << " is not consistent with tensor descriptor "
+                         << tdescTy;
+
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
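
And the matching SIMT scatter store (mirroring the updated test_store_simt; names are illustrative):

func.func @example_store_scatter_simt(%dst: ui64) {
  %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
  %mask = arith.constant dense<1> : vector<4xi1>
  %val = arith.constant dense<2.9> : vector<2xf32>
  %1 = xegpu.create_tdesc %dst, %offsets : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
  // Each lane writes its chunk_size-sized 1D fragment; LayoutAttr and TransposeAttr are not used.
  xegpu.store %val, %1, %mask <{l1_hint = #xegpu.cache_hint<write_back>}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
  return
}
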
@@ -602,51 +684,16 @@ LogicalResult DpasOp::verify() {
   auto rhsShape = getRhsType().getShape();
   auto resShape = getResultType().getShape();
 
-  auto aLayout = getALayoutAttr();
-  auto bLayout = getBLayoutAttr();
-  auto cLayout = getCLayoutAttr();
-
-  // make sure the layout attribute is either set for every available
-  // operand or simply not set at all. C is special, since ACC is optional.
-  auto hasValidLayoutAttrs = [&]() {
-    bool result = (aLayout != nullptr) ^ (bLayout != nullptr);
-    if (hasAcc()) {
-      result |= (aLayout != nullptr) ^ (cLayout != nullptr);
-    }
-    return !result;
-  };
+  if (getAcc()) {
+    if (getAcc().getType() != getResultType())
+      return emitOpError("Expecting the acc type to be the same as result.");
+  }
 
-  if (!hasValidLayoutAttrs())
-    return emitOpError(
-        "layout attributes should be either set for all operands (for SIMT "
-        "code) or not set at all (for SIMD code).");
-
-  // query the scope from aLayout (a valid setting).
-  if (aLayout) {
-    // In SIMT mode, All data fragments must be 2D
-    if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
-      return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
-
-    auto laneLayoutA = aLayout.getLaneLayout();
-    auto laneLayoutB = bLayout.getLaneLayout();
-    auto laneLayoutC = cLayout.getLaneLayout();
-    // Obtain the expanded shapes of the operands and result using lane_layout.
-    // NOTE: For B, get rid of the packed dimension for the expanded shape.
-    SmallVector<int64_t> expandedShapeA = {lhsShape[0] * laneLayoutA[0],
-                                           lhsShape[1] * laneLayoutA[1]};
-    SmallVector<int64_t> expandedShapeB = {
-        rhsShape[0] * rhsShape[1] * laneLayoutB[0], 1 * laneLayoutB[1]};
-    SmallVector<int64_t> expandedShapeC = {resShape[0] * laneLayoutC[0],
-                                           resShape[1] * laneLayoutC[1]};
-    auto bK = expandedShapeB[0];
-    if (bK != expandedShapeA[1])
-      return emitOpError("K-dimension mismatch.");
-    if (expandedShapeA[0] != expandedShapeC[0])
-      return emitOpError("M-dimension mismatch.");
-    if (expandedShapeB[1] != expandedShapeC[1])
-      return emitOpError("N-dimension mismatch.");
-  } else { // For other scopes, operands' shape should match the mxkxn
-           // semantics.
+  // SIMT code: skip the shape check since semantic information is lacking at
+  // this level. Users need to ensure correctness.
+  if (lhsRank == 1 && rhsRank == 1 && resRank == 1) {
+    return success();
+  } else { // SIMD code
     if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2)
       return emitOpError(
           "expecting lhs and result to be a 2D vector, and rhs to be either "
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 48df33a591908..c0739d735dfec 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -79,25 +79,10 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
 
 // -----
 func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  // expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1]}}
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
-      l2_hint = #xegpu.cache_hint<uncached>}>
-    : !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    -> vector<8x2xf32>
-  return
-}
-
-// -----
-func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  // expected-error at +1 {{Result shape [8] is not consistent with distributed vector shape [1, 1]}}
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
+  // expected-error at +1 {{Result shape [8] is not a valid distribution for tensor descriptor}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
-      l2_hint = #xegpu.cache_hint<uncached>}>
-    : !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-    -> vector<8xf32>
+      l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf32> -> vector<8xf32>
   return
 }
 
@@ -105,7 +90,7 @@ func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
 func.func @test_load_nd_vc_6(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
     !xegpu.tensor_desc<8x16xf32>
-  // expected-error at +1 {{Value shape [8, 1] is not consistent with tensor descriptor}}
+  // expected-error at +1 {{Result shape [8, 1] is not consistent with tensor descriptor}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
       l2_hint = #xegpu.cache_hint<uncached>}>
     : !xegpu.tensor_desc<8x16xf32> -> vector<8x1xf32>
@@ -134,22 +119,10 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
 }
 
 // -----
-func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
-  %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  // expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1] for tensor descriptor}}
-  xegpu.store_nd %data, %1
-    : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32,   #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  return
-}
-
-// -----
-func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
-  %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  // expected-error at +1 {{Result shape [2] is not consistent with distributed vector shape [1, 1] for tensor descriptor}}
-  xegpu.store_nd %data, %1
-    : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<4xf32>) {
+  %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
+  // expected-error at +1 {{Value shape [4] is not a valid distribution for tensor descriptor}}
+  xegpu.store_nd %data, %1 : vector<4xf32>, !xegpu.tensor_desc<16xf32>
   return
 }
 
@@ -269,45 +242,23 @@ func.func @test_create_tdesc_layout_3(%src: ui64) {
 }
 
 // -----
-func.func @test_load_gather_layout_1(%src: ui64) {
+func.func @test_load_gather_simt_1(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,   #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  // expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+  // expected-error at +1 {{Result shape [6] is not a valid distribution for tensor descriptor}}
+  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<6xf32>
   return
 }
 
 // -----
-func.func @test_load_gather_layout_2(%src: ui64) {
+func.func @test_store_scatter_simt_1(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  // expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
-  return
-}
-
-
-// -----
-func.func @test_store_scatter_layout_1(%src: ui64) {
-  %0 = arith.constant dense<1>: vector<4xi1>
-  %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %val = arith.constant dense<2.9>: vector<1x2xf32>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  // expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
-  return
-}
-
-// -----
-func.func @test_store_scatter_layout_2(%src: ui64) {
-  %0 = arith.constant dense<1>: vector<4xi1>
-  %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %val = arith.constant dense<2.9>: vector<2xf32>
-  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  // expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
-  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,  #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+  %val = arith.constant dense<2.9>: vector<6xf32>
+  %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+  // expected-error at +1 {{Value shape [6] is not a valid distribution for tensor descriptor}}
+  xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}> : vector<6xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
   return
 }
 
@@ -393,23 +344,6 @@ func.func @test_dpas_4(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
   return
 }
 
-// -----
-func.func @test_dpas_layout_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
-  // expected-error at +1 {{layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code)}}
-  %1 = xegpu.dpas %a, %b {a_layout =  #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
-  return
-}
-
-// -----
-func.func @test_dpas_layout_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
-  // expected-error at +1 {{K-dimension mismatch}}
-  %1 = xegpu.dpas %a, %b {a_layout =  #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-                          b_layout =  #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-                          c_layout =  #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-                          : vector<8x1xf16>, vector<4x2xf16> -> vector<8x1xf32>
-  return
-}
-
 // -----
 func.func @test_atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) {
   %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index e9895e0d0a71d..71e7e9bdda07d 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -125,11 +125,11 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
 
 // CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) {
 gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
   %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
-       : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
+       : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
   gpu.return
 }
 
@@ -144,10 +144,10 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
 
 // CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) {
 gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<1x1xf16>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<1x1xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
   gpu.return
 }
 
@@ -162,11 +162,10 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
 
 // CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
   gpu.return
 }
 
@@ -181,11 +180,10 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
 
 // CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
-    !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
   gpu.return
 }
 
@@ -200,11 +198,10 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
 
 // CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<2x1xf32>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<2x1xf32>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32>
   gpu.return
 }
 
@@ -219,11 +216,11 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
 
 // CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
-    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
+    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<32xf16>
   gpu.return
 }
 
@@ -238,11 +235,11 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
 
 // CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
-    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
+    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<32xf16>
   gpu.return
 }
 
@@ -257,10 +254,10 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
 
 // CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
-  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
   gpu.return
 }
 
@@ -277,13 +274,12 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
 
 // CHECK: func @test_store_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
-   // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48x1xf16>
-  %1 = arith.constant dense<1.0>: vector<48x1xf16>
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
-    !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48xf16>
+  %1 = arith.constant dense<1.0>: vector<48xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48xf16>, !xegpu.tensor_desc<24x32xf16>
+  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48xf16>, !xegpu.tensor_desc<24x32xf16>
   gpu.return
 }
 
@@ -303,13 +299,12 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
 
 // CHECK: func @test_store_nd_simt_2(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
-   // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2x1xf16>
-  %1 = arith.constant dense<1.0>: vector<2x1xf16>
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
-    !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+  // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16>
+  %1 = arith.constant dense<1.0>: vector<2xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
+  %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
+  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<32xf16>
+  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2xf16>, !xegpu.tensor_desc<32xf16>
   gpu.return
 }
 
@@ -425,10 +420,10 @@ gpu.func @test_load_simt(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
-  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1> -> vector<2xf32>
+  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<2xf32>
   gpu.return
 }
 
@@ -451,10 +446,10 @@ gpu.func @test_load_simt_2(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
-  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
-  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1> -> vector<1xf32>
-  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1> -> vector<1xf32>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
+  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
+  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<1xf32>
+  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<1xf32>
   gpu.return
 }
 
@@ -477,10 +472,10 @@ gpu.func @test_load_simt_3(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
-  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
-  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
-  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
+  %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
+  //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<4xi1> -> vector<8xf16>
+  %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<4xi1> -> vector<8xf16>
   gpu.return
 }
 
@@ -507,12 +502,12 @@ gpu.func @test_store_simt(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x1xf32>
-  %2 = arith.constant dense<2.9>: vector<2x1xf32>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
-  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+  //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2xf32>
+  %2 = arith.constant dense<2.9>: vector<2xf32>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
+  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
   gpu.return
 }
 
@@ -539,12 +534,12 @@ gpu.func @test_store_simt_2(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
   %1 = arith.constant dense<1>: vector<4xi1>
-  //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<1x2xf16>
-  %2 = arith.constant dense<2.9>: vector<1x2xf16>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
-  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
-  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
-  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
+  //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<2xf16>
+  %2 = arith.constant dense<2.9>: vector<2xf16>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
+  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
   gpu.return
 }
 
@@ -572,10 +567,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
   %1 = arith.constant dense<1>: vector<4xi1>
   //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
   %2 = arith.constant dense<2.9>: vector<1xf32>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
-  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
-  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1>
-  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
+  %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
+  //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
+  xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
   gpu.return
 }
 
@@ -635,15 +630,10 @@ gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8x1xf16>, %[[arg1:.*]]: vector<8x2xf16>)
-gpu.func @test_dpas_simt(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
-  // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-  // CHECK: b_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-  // CHECK: c_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
-  %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-                          b_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-                          c_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-                          : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: vector<16xf16>)
+gpu.func @test_dpas_simt(%a : vector<8xf16>, %b: vector<16xf16>) {
+  // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+  %1 = xegpu.dpas %a, %b : vector<8xf16>, vector<16xf16> -> vector<8xf32>
   gpu.return
 }
 

>From 2159119977dfb62c11d808777529dd34ed0abd43 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Thu, 10 Apr 2025 20:25:00 +0000
Subject: [PATCH 41/45] refine verifier for load_nd and store_nd

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |  4 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 53 +++++++++----------
 mlir/test/Dialect/XeGPU/invalid.mlir          | 19 +++++--
 3 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 9af6eaf69aec3..5fa18754305ca 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -840,7 +840,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
     can be represented as `B: vector<8x16x2xf16>`.
 
     In SIMT code, each work-item from a subgroup holds a data fragment for A, B, C and the result,
-    which are represented as 1D vectors.
+    which are represented as 1D vectors. Please refer to [OpenCL Intel extensions]
+    (https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html)
+    for more details about the fragment distribution.
 
     Note: on PVC, the hardware can perform load with VNNI transformation when data
           element type is 16-bit or lower precision, taking 2 or 4 elements from
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index fef39508c3bfe..1dafc9936107e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -270,33 +270,31 @@ LogicalResult LoadNdOp::verify() {
   if (!isReadHintOrNone(getL3HintAttr()))
     return emitOpError("invalid l3_hint: ") << getL3HintAttr();
 
-  // Handling a 1D vector as the result can be complex. It may represent the
-  // outcome of a 1D block load in SIMD mode or a fragment of a block load
-  // result in SIMT mode. In the latter case, the tensor descriptor must be
-  // evenly distributed, with each lane holding an equally sized fragment of
-  // the result. Only subgroup size 8 or 16 is supported.
-  if (valueTy.getRank() == 1 &&
-      valueTy.getNumElements() < tdescTy.getNumElements()) {
+  int tdescElems = tdescTy.getNumElements() * tdescTy.getArrayLength();
+  int valueElems = valueTy.getNumElements();
+
+  // If the result vector is 1D and has fewer elements than the tensor
+  // descriptor, it is assumed to be a SIMT op. The layout attribute in
+  // tensor_desc is not needed.
+  if (valueElems < tdescElems && valueTy.getRank() == 1) {
     // SIMT mode doesn't need LayoutAttr.
     if (tdescTy.getLayoutAttr())
       return emitOpError()
              << "TensorDesc doesn't need LayoutAttr for SIMT code";
 
-    int tdescElems = tdescTy.getNumElements() * tdescTy.getArrayLength();
-    int valueElems = valueTy.getNumElements();
-
-    int lanes = tdescElems % valueElems == 0 ? tdescElems / valueElems : -1;
-    if (lanes != 16 && lanes != 8) {
+    // For SIMT code, the load is evenly distributed across all lanes in a
+    // subgroup. Since subgroup size is arch dependent, we only check even
+    // distribution here.
+    if (tdescElems % valueElems)
       return emitOpError()
              << "Result shape " << makeString(getShapeOf(valueTy))
              << " is not a valid distribution for tensor descriptor "
              << tdescTy;
-    }
+
     return success();
   }
 
   // Check SIMD mode.
-  auto array_len = tdescTy.getArrayLength();
   // adjusted tensor descriptor shape tracks the expected shape of the result.
   auto tdescShape = getShapeOf(tdescTy);
   auto valueShape = getShapeOf(valueTy);
@@ -328,6 +326,7 @@ LogicalResult LoadNdOp::verify() {
     }
   }
 
+  auto array_len = tdescTy.getArrayLength();
   if (array_len > 1) {
     tdescShape.insert(tdescShape.begin(), array_len);
   }
@@ -366,25 +365,23 @@ LogicalResult StoreNdOp::verify() {
   if (!isWriteHintOrNone(getL3HintAttr()))
     return emitOpError("invalid l3_hint: ") << getL3HintAttr();
 
-  auto tdescShape = getShapeOf(dstTy);
-  auto valueShape = getShapeOf(valTy);
+  auto array_len = dstTy.getArrayLength();
+  if (array_len > 1)
+    return emitOpError("array length is not supported by store_nd.\n");
+
+  auto tdescElems = dstTy.getNumElements();
+  auto valueElems = valTy.getNumElements();
 
-  // Similar to LoadNdOp, handling a 1D vector as the value can be complex. It
-  // may represent the input of a 1D block store in SIMD mode or a fragment of
-  // a block store input in SIMT mode. In the latter case, the tensor descriptor
-  // must be evenly distributed, with each lane holding an equally sized
-  // fragment of the input. Only subgroup size 8 or 16 is supported.
-  if (valTy.getRank() == 1 && valTy.getNumElements() < dstTy.getNumElements()) {
+  // Similar to LoadNdOp, if the value vector is 1D and has fewer elements
+  // than the tensor descriptor, it is assumed to be a SIMT op. The layout
+  // attribute in tensor_desc is not needed.
+  if (valTy.getRank() == 1 && valueElems < tdescElems) {
     // SIMT mode doesn't need LayoutAttr.
     if (dstTy.getLayoutAttr())
       return emitOpError()
              << "TensorDesc doesn't need LayoutAttr for SIMT code";
 
-    int tdescElems = dstTy.getNumElements() * dstTy.getArrayLength();
-    int valueElems = valueShape[0];
-
-    int lanes = tdescElems % valueElems == 0 ? tdescElems / valueElems : -1;
-    if (lanes != 16 && lanes != 8) {
+    if (tdescElems % valueElems) {
       return emitOpError()
              << "Value shape " << makeString(getShapeOf(valTy))
              << " is not a valid distribution for tensor descriptor " << dstTy;
@@ -393,6 +390,8 @@ LogicalResult StoreNdOp::verify() {
   }
 
   // SIMD code should have the same shape as the tensor descriptor.
+  auto tdescShape = getShapeOf(dstTy);
+  auto valueShape = getShapeOf(valTy);
   if (tdescShape != valueShape) {
     return emitOpError() << "Value shape " << makeString(valueShape)
                          << " is not consistent with tensor descriptor "
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index c0739d735dfec..a02427b6e317b 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -80,9 +80,9 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
 // -----
 func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
-  // expected-error at +1 {{Result shape [8] is not a valid distribution for tensor descriptor}}
+  // expected-error at +1 {{Result shape [3] is not a valid distribution for tensor descriptor}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
-      l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf32> -> vector<8xf32>
+      l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf32> -> vector<3xf32>
   return
 }
 
@@ -119,10 +119,19 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
 }
 
 // -----
-func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<4xf32>) {
+func.func @test_store_nd_vc_3(%dst: memref<24x32xf16>) {
+  %1 = arith.constant dense<1.0>: vector<2x24x32xf16>
+  %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+  // expected-error at +1 {{array length is not supported by store_nd}}
+  xegpu.store_nd %1, %2: vector<2x24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+  return
+}
+
+// -----
+func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) {
   %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
-  // expected-error at +1 {{Value shape [4] is not a valid distribution for tensor descriptor}}
-  xegpu.store_nd %data, %1 : vector<4xf32>, !xegpu.tensor_desc<16xf32>
+  // expected-error at +1 {{Value shape [3] is not a valid distribution for tensor descriptor}}
+  xegpu.store_nd %data, %1 : vector<3xf32>, !xegpu.tensor_desc<16xf32>
   return
 }
 

>From 21f50c09992cf9ef629ab02036d2b4be273113e1 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 10 Apr 2025 20:31:43 +0000
Subject: [PATCH 42/45] fix issues

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  4 ++
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 52 +++++++++----------
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 254568e00dfcb..53372a23a2182 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -378,6 +378,10 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
   // tensorSize must be adjusted for array_length.
   tensorSize *= getArrayLength();
 
+  if (layout.getRank() == 1) {
+    return VectorType::get({tensorSize / sgSize}, getElementType());
+  }
+
   return VectorType::get({tensorSize / (sgSize * laneDataSize), laneDataSize},
                          getElementType());
 }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 5b812a731ec95..ed9418696c69b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -36,6 +36,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/ADT/bit.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/raw_ostream.h"
@@ -781,30 +782,27 @@ namespace {
 /// | 2x32x16               | [1, 16]     | 2x32x1                   |
 FailureOr<VectorType> getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
                                                       VectorType originalType) {
-  llvm::SmallVector<int64_t, 2> distributedShape;
   if (!layout)
     return failure();
 
-  auto laneLayout = layout.getLaneLayout();
-  assert((originalType.getRank() == 2 || originalType.getRank() == 3) &&
-         "expecting 2D or 3D shape for the original vector type");
-  assert(laneLayout.size() == 2 && "expecting 2D shape for the wi layout");
-  // Original type can be 2D or 3D (array_length > 1), the last two dims are the
-  // block shape.
-  auto blockShape = originalType.getShape().take_back(2);
-  // Check if the block vector shape can be distributed evenly.
-  if (blockShape[0] % laneLayout[0] != 0 || blockShape[1] % laneLayout[1] != 0)
-    return failure();
-
-  if (originalType.getRank() == 3) {
-    distributedShape.push_back(originalType.getShape()[0]);
-  }
-  for (unsigned i = 0; i < 2; ++i) {
-    distributedShape.push_back(blockShape[i] / laneLayout[i]);
+  auto laneLayout = layout.getLaneLayout().asArrayRef();
+  assert(originalType.getShape().size() >= laneLayout.size() &&
+         "Rank of the original vector type should be greater or equal to the "
+         "size of the lane layout to distribute the vector type.");
+  SmallVector<int64_t> distributedShape(originalType.getShape());
+  /// Only distribute the last `laneLayout.size()` dimensions. The remaining
+  /// dimensions are not distributed.
+  unsigned distributionStart = originalType.getRank() - laneLayout.size();
+  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+    if (i < distributionStart) {
+      continue;
+    }
+    /// Check if the dimension can be distributed evenly.
+    if (dim % laneLayout[i - distributionStart] != 0)
+      return failure();
+    distributedShape[i] = dim / laneLayout[i - distributionStart];
   }
-  auto newVectorType =
-      VectorType::get(distributedShape, originalType.getElementType());
-  return newVectorType;
+  return VectorType::get(distributedShape, originalType.getElementType());
 }
 
 static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
@@ -1028,15 +1026,14 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
       return rewriter.notifyMatchFailure(
           storeOp, "the source tensor descriptor lacks sg_map attribute");
 
-    if (storeOp.getTensorDescType().getShape().size() != 2)
-      return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
-
-    auto distriburtedTypeByWarpOp =
+    auto distributedTypeByWarpOpOrFailure =
         getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
-    if (failed(distriburtedTypeByWarpOp))
+    if (failed(distributedTypeByWarpOpOrFailure))
       return rewriter.notifyMatchFailure(storeOp,
                                          "Failed to distribute the type");
-    VectorType distributedTypeByWarpOp = distriburtedTypeByWarpOp.value();
+    VectorType distributedTypeByWarpOp =
+        distributedTypeByWarpOpOrFailure.value();
+    llvm::errs() << "distributed type: " << distributedTypeByWarpOp << "\n";
 
     SmallVector<size_t> newRetIndices;
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
@@ -1066,7 +1063,8 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
     newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[1]));
 
     rewriter.create<xegpu::StoreNdOp>(newWarpOp.getLoc(), TypeRange{},
-                                      newStoreOperands, storeOp->getAttrs());
+                                      newStoreOperands);
+    storeOp->setDialectAttrs(storeOp->getDialectAttrs());
     rewriter.eraseOp(storeOp);
     return success();
   }
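
The rewritten getDistVecTypeBasedOnLaneLayout above divides only the trailing laneLayout.size() dimensions of the original vector shape, leaving any leading dimensions (such as an array_length dimension) untouched. A toy sketch of that arithmetic with plain standard-library containers follows; the helper name and types are illustrative, not the MLIR API.

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Toy version of the trailing-dimension distribution: only the last
// laneLayout.size() dimensions are divided by the lane layout; leading
// dimensions are carried over unchanged.
static std::optional<std::vector<int64_t>>
distributeTrailingDims(std::vector<int64_t> shape,
                       const std::vector<int64_t> &laneLayout) {
  if (shape.size() < laneLayout.size())
    return std::nullopt; // original rank must cover the lane layout
  size_t start = shape.size() - laneLayout.size();
  for (size_t i = start; i < shape.size(); ++i) {
    if (shape[i] % laneLayout[i - start] != 0)
      return std::nullopt; // dimension not evenly distributable
    shape[i] /= laneLayout[i - start];
  }
  return shape;
}

int main() {
  // {2, 32, 16} with lane layout {1, 16} -> {2, 32, 1}, matching the table
  // in the pattern's documentation comment.
  auto result = distributeTrailingDims({2, 32, 16}, {1, 16});
  if (result)
    for (int64_t d : *result)
      std::cout << d << " "; // prints: 2 32 1
  std::cout << "\n";
}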

>From c81b2e05e6a5fde3c314d557d7993f8b73cf66cd Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 10 Apr 2025 22:18:05 +0000
Subject: [PATCH 43/45] fix issues

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 130 +++++++++---------
 .../Transforms/XeGPUSubgroupDistribute.cpp    |  91 +++++++-----
 .../Dialect/XeGPU/subgroup-distribution.mlir  |  70 ++++++++++
 3 files changed, 192 insertions(+), 99 deletions(-)
 create mode 100644 mlir/test/Dialect/XeGPU/subgroup-distribution.mlir

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index cb5b87d233595..d563bce6b9c9b 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -597,70 +597,72 @@ void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
 // XeGPU_DpasOp
 //===----------------------------------------------------------------------===//
 LogicalResult DpasOp::verify() {
-  int64_t lhsRank = getLhsType().getRank();
-  int64_t rhsRank = getRhsType().getRank();
-  int64_t resRank = getResultType().getRank();
-  auto lhsShape = getLhsType().getShape();
-  auto rhsShape = getRhsType().getShape();
-  auto resShape = getResultType().getShape();
-
-  auto aLayout = getALayoutAttr();
-  auto bLayout = getBLayoutAttr();
-  auto cLayout = getCLayoutAttr();
-
-  // make sure the layout attribute is either set for every available
-  // operand or simply not set at all. C is special, since ACC is optional.
-  auto hasValidLayoutAttrs = [&]() {
-    bool result = (aLayout != nullptr) ^ (bLayout != nullptr);
-    if (hasAcc()) {
-      result |= (aLayout != nullptr) ^ (cLayout != nullptr);
-    }
-    return !result;
-  };
-
-  if (!hasValidLayoutAttrs())
-    return emitOpError(
-        "layout attributes should be either set for all operands (for SIMT "
-        "code) or not set at all (for SIMD code).");
-
-  // query the scope from aLayout (a valid setting).
-  if (aLayout) {
-    // In SIMT mode, All data fragments must be 2D
-    if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
-      return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
-
-    auto laneLayoutA = aLayout.getLaneLayout();
-    auto laneLayoutB = bLayout.getLaneLayout();
-    auto laneLayoutC = cLayout.getLaneLayout();
-    // Obtain the expanded shapes of the operands and result using lane_layout.
-    // NOTE: For B, get rid of the packed dimension for the expanded shape.
-    SmallVector<int64_t> expandedShapeA = {lhsShape[0] * laneLayoutA[0],
-                                           lhsShape[1] * laneLayoutA[1]};
-    SmallVector<int64_t> expandedShapeB = {
-        rhsShape[0] * rhsShape[1] * laneLayoutB[0], 1 * laneLayoutB[1]};
-    SmallVector<int64_t> expandedShapeC = {resShape[0] * laneLayoutC[0],
-                                           resShape[1] * laneLayoutC[1]};
-    auto bK = expandedShapeB[0];
-    if (bK != expandedShapeA[1])
-      return emitOpError("K-dimension mismatch.");
-    if (expandedShapeA[0] != expandedShapeC[0])
-      return emitOpError("M-dimension mismatch.");
-    if (expandedShapeB[1] != expandedShapeC[1])
-      return emitOpError("N-dimension mismatch.");
-  } else { // For other scopes, operands' shape should match the mxkxn
-           // semantics.
-    if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2)
-      return emitOpError(
-          "expecting lhs and result to be a 2D vector, and rhs to be either "
-          "2D or 3D (packed) vector.");
-    auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
-    if (bK != lhsShape[1])
-      return emitOpError("K-dimension mismatch.");
-    if (lhsShape[0] != resShape[0])
-      return emitOpError("M-dimension mismatch.");
-    if (rhsShape[1] != resShape[1])
-      return emitOpError("N-dimension mismatch.");
-  }
+  // int64_t lhsRank = getLhsType().getRank();
+  // int64_t rhsRank = getRhsType().getRank();
+  // int64_t resRank = getResultType().getRank();
+  // auto lhsShape = getLhsType().getShape();
+  // auto rhsShape = getRhsType().getShape();
+  // auto resShape = getResultType().getShape();
+
+  // auto aLayout = getALayoutAttr();
+  // auto bLayout = getBLayoutAttr();
+  // auto cLayout = getCLayoutAttr();
+
+  // // make sure the layout attribute is either set for every available
+  // // operand or simply not set at all. C is special, since ACC is optional.
+  // auto hasValidLayoutAttrs = [&]() {
+  //   bool result = (aLayout != nullptr) ^ (bLayout != nullptr);
+  //   if (hasAcc()) {
+  //     result |= (aLayout != nullptr) ^ (cLayout != nullptr);
+  //   }
+  //   return !result;
+  // };
+
+  // if (!hasValidLayoutAttrs())
+  //   return emitOpError(
+  //       "layout attributes should be either set for all operands (for SIMT "
+  //       "code) or not set at all (for SIMD code).");
+
+  // // query the scope from aLayout (a valid setting).
+  // if (aLayout) {
+  //   // In SIMT mode, All data fragments must be 2D
+  //   if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
+  //     return emitOpError("expecting lhs, rhs, and result to be a 2D
+  //     vector.");
+
+  //   auto laneLayoutA = aLayout.getLaneLayout();
+  //   auto laneLayoutB = bLayout.getLaneLayout();
+  //   auto laneLayoutC = cLayout.getLaneLayout();
+  //   // Obtain the expanded shapes of the operands and result using
+  //   lane_layout.
+  //   // NOTE: For B, get rid of the packed dimension for the expanded shape.
+  //   SmallVector<int64_t> expandedShapeA = {lhsShape[0] * laneLayoutA[0],
+  //                                          lhsShape[1] * laneLayoutA[1]};
+  //   SmallVector<int64_t> expandedShapeB = {
+  //       rhsShape[0] * rhsShape[1] * laneLayoutB[0], 1 * laneLayoutB[1]};
+  //   SmallVector<int64_t> expandedShapeC = {resShape[0] * laneLayoutC[0],
+  //                                          resShape[1] * laneLayoutC[1]};
+  //   auto bK = expandedShapeB[0];
+  //   if (bK != expandedShapeA[1])
+  //     return emitOpError("K-dimension mismatch.");
+  //   if (expandedShapeA[0] != expandedShapeC[0])
+  //     return emitOpError("M-dimension mismatch.");
+  //   if (expandedShapeB[1] != expandedShapeC[1])
+  //     return emitOpError("N-dimension mismatch.");
+  // } else { // For other scopes, operands' shape should match the mxkxn
+  //          // semantics.
+  //   if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2)
+  //     return emitOpError(
+  //         "expecting lhs and result to be a 2D vector, and rhs to be either "
+  //         "2D or 3D (packed) vector.");
+  //   auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
+  //   if (bK != lhsShape[1])
+  //     return emitOpError("K-dimension mismatch.");
+  //   if (lhsShape[0] != resShape[0])
+  //     return emitOpError("M-dimension mismatch.");
+  //   if (rhsShape[1] != resShape[1])
+  //     return emitOpError("N-dimension mismatch.");
+  // }
   return success();
 }
 
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index ed9418696c69b..34e0ac7b2d094 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -38,6 +38,8 @@
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/ADT/bit.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -63,6 +65,8 @@ constexpr unsigned packedSizeInBitsForDefault =
     16; // Minimum packing size per register for DPAS A.
 constexpr unsigned packedSizeInBitsForDpasB =
     32; // Minimum packing size per register for DPAS B.
+static const char *const operandLayoutNamePrefix = "layout_operand_";
+static const char *const resultLayoutNamePrefix = "layout_result_";
 
 namespace {
 
@@ -686,7 +690,8 @@ void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
       continue;
     }
     /// For every other user, use a generic attribute name.
-    std::string attrName = "op" + std::to_string(operandNumber);
+    std::string attrName =
+        operandLayoutNamePrefix + std::to_string(operandNumber);
     owner->setAttr(attrName, layout);
   }
 }
@@ -746,7 +751,7 @@ static LogicalResult attachLayoutAttributes(
     for (auto [i, r] : llvm::enumerate(op->getResults())) {
       auto layoutInfo = getLayoutInfoForResult(r);
       if (layoutInfo) {
-        auto attrName = "r" + std::to_string(i);
+        auto attrName = resultLayoutNamePrefix + std::to_string(i);
         op->setAttr(attrName, layoutInfo);
         /// Attach the layout attribute to the users of the result.
         attachLayoutAttributeToUsers(r, layoutInfo);
@@ -819,16 +824,29 @@ static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
   return distVecTyOrFailure.value();
 }
 
-static Value reconcileDistribtedVecType(Value orig, VectorType expected,
-                                        PatternRewriter &rewriter) {
+static Value reshapeDistributedVecType(Value orig, VectorType expected,
+                                       PatternRewriter &rewriter) {
   assert(isa<VectorType>(orig.getType()) && "expecting vector type");
   auto origVecType = cast<VectorType>(orig.getType());
   /// No need to reconcile if the types are the same.
   if (origVecType == expected)
     return orig;
-  auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
-                                                            expected, orig);
-  return castOp.getResult(0);
+  auto castOp =
+      rewriter.create<vector::ShapeCastOp>(orig.getLoc(), expected, orig);
+  return castOp.getResult();
+}
+
+static SmallVector<NamedAttribute>
+filterTemporaryLayoutAttributes(ArrayRef<NamedAttribute> attrs) {
+  SmallVector<NamedAttribute> newAttrs;
+  for (auto attr : attrs) {
+    if (attr.getName().strref().contains(operandLayoutNamePrefix) ||
+        attr.getName().strref().contains(resultLayoutNamePrefix)) {
+      continue;
+    }
+    newAttrs.push_back(attr);
+  }
+  return newAttrs;
 }
 
 /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
@@ -903,11 +921,11 @@ struct MoveFuncBodyToWarpExecuteOnLane0
 };
 
 /// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
-/// `gpu.warp_execute_on_lane_0` and put it after the warp op. The warp op will
-/// still contain the original op that will not be used by the yield op (and
-/// should be cleaned up later with dce). The yield op will bypass the
-/// create_nd_tdesc's arguments. Tensor descriptor is not distributed because it
-/// is a uniform value accorss all work items within the subgroup.
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op. The warp op
+/// will still contain the original op that will not be used by the yield op
+/// (and should be cleaned up later with dce). The yield op will bypass the
+/// create_nd_tdesc's arguments. Tensor descriptor is not distributed because
+/// it is a uniform value across all work items within the subgroup.
 ///
 /// Example:
 ///
@@ -985,10 +1003,10 @@ struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
   }
 };
 
-/// Sink a store_nd op at the end of enclosing `gpu.warp_execute_on_lane_0`. In
-/// case arguments for the store are passed through the warp op interface they
-/// would be propagated as returned values. Only the source vector for the store
-/// is distributed according to sg_map attribute.
+/// Sink a store_nd op at the end of enclosing `gpu.warp_execute_on_lane_0`.
+/// In case arguments for the store are passed through the warp op interface
+/// they would be propagated as returned values. Only the source vector for
+/// the store is distributed according to sg_map attribute.
 ///
 /// Example:
 ///
@@ -1033,7 +1051,6 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
                                          "Failed to distribute the type");
     VectorType distributedTypeByWarpOp =
         distributedTypeByWarpOpOrFailure.value();
-    llvm::errs() << "distributed type: " << distributedTypeByWarpOp << "\n";
 
     SmallVector<size_t> newRetIndices;
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
@@ -1050,21 +1067,21 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
 
     /// For the value operand, there can be a conflict between the vector type
     /// distributed by the warp op and (xegpu-specific) distributed type
-    /// supported by the store op. We reconcile these mismatches by inserting a
-    /// cast. These gets cancelled out later.
+    /// supported by the store op. We reconcile these mismatches by inserting
+    /// a cast. These gets cancelled out later.
     auto storeNdDistributedValueTyOrFailure =
         storeOp.getTensorDescType().getDistributedVectorType();
     if (failed(storeNdDistributedValueTyOrFailure))
       return rewriter.notifyMatchFailure(
           storeOp, "Failed to get distributed vector type for the store op");
-    newStoreOperands.push_back(reconcileDistribtedVecType(
+    newStoreOperands.push_back(reshapeDistributedVecType(
         newWarpOp.getResult(newRetIndices[0]),
         storeNdDistributedValueTyOrFailure.value(), rewriter));
     newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[1]));
 
-    rewriter.create<xegpu::StoreNdOp>(newWarpOp.getLoc(), TypeRange{},
-                                      newStoreOperands);
-    storeOp->setDialectAttrs(storeOp->getDialectAttrs());
+    rewriter.create<xegpu::StoreNdOp>(
+        newWarpOp.getLoc(), TypeRange{}, newStoreOperands,
+        filterTemporaryLayoutAttributes(storeOp->getAttrs()));
     rewriter.eraseOp(storeOp);
     return success();
   }
@@ -1074,8 +1091,9 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
 /// `gpu.warp_execute_on_lane_0` and put it after the warp op.
 /// The warp op will still contain the original op that will not be used by
 /// the yield op (and should be cleaned up later with dce). The yield op will
-/// bypass the load's arguments. Only the loaded vector is distributed according
-/// to sg_map attribute and, tensor descriptor types is not distributed.
+/// bypass the load's arguments. Only the loaded vector is distributed
+/// according to the sg_map attribute; the tensor descriptor type is not
+/// distributed.
 ///
 /// Example:
 ///
@@ -1122,7 +1140,8 @@ struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
 
     SmallVector<size_t> newRetIndices;
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, subgroupOp, /* new yielded values = */ loadOp.getTensorDesc(),
+        rewriter, subgroupOp,
+        /* new yielded values = */ loadOp.getTensorDesc(),
         /* new yielded types = */ tensorDescTy, newRetIndices);
 
     /// Create a new load op outside the warp op with the distributed vector
@@ -1135,13 +1154,14 @@ struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
           loadOp, "Failed to get distributed vector type for the load op");
     Value newLoadOp = rewriter.create<xegpu::LoadNdOp>(
         newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
-        newWarpOp->getResult(newRetIndices[0]), loadOp->getAttrs());
+        newWarpOp->getResult(newRetIndices[0]),
+        filterTemporaryLayoutAttributes(loadOp->getAttrs()));
     Value distributedVal = newWarpOp.getResult(operandIdx);
-    /// There can be a conflict between the vector type distributed by the warp
-    /// op and (xegpu-specific) distributed type supported by the load op. We
-    /// reconcile these mismatches by inserting a cast.
-    newLoadOp = reconcileDistribtedVecType(newLoadOp, distributedTypeByWarpOp,
-                                           rewriter);
+    /// There can be a conflict between the vector type distributed by the
+    /// warp op and (xegpu-specific) distributed type supported by the load
+    /// op. We reconcile these mismatches by inserting a cast.
+    newLoadOp =
+        reshapeDistributedVecType(newLoadOp, distributedTypeByWarpOp, rewriter);
     rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
     return success();
   }
@@ -1161,8 +1181,9 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
     unsigned operandIdx = operand->getOperandNumber();
     xegpu::LayoutAttr layoutA = dpasOp.getALayoutAttr();
     xegpu::LayoutAttr layoutB = dpasOp.getBLayoutAttr();
+    auto layoutCName = llvm::formatv("{0}{1}", resultLayoutNamePrefix, 0).str();
     xegpu::LayoutAttr layoutOut =
-        dpasOp->getAttrOfType<xegpu::LayoutAttr>("r0");
+        dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutCName);
     if (!layoutA || !layoutB || !layoutOut)
       return rewriter.notifyMatchFailure(
           dpasOp,
@@ -1211,7 +1232,7 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
     }
 
     for (auto i : newRetIndices) {
-      newDpasOperands.push_back(reconcileDistribtedVecType(
+      newDpasOperands.push_back(reshapeDistributedVecType(
           newWarpOp.getResult(i),
           newDpasOperandExpectedTypes[newDpasOperands.size()], rewriter));
     }
@@ -1220,7 +1241,7 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
         newDpasOperands, dpasOp->getAttrs());
     Value disributedVal = newWarpOp.getResult(operandIdx);
     /// Reconile the output type.
-    disributedVal = reconcileDistribtedVecType(
+    disributedVal = reshapeDistributedVecType(
         disributedVal,
         getDistributedVectorType(layoutOut, dpasOp.getResultType()), rewriter);
     rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
new file mode 100644
index 0000000000000..6369eb7dd035e
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
@@ -0,0 +1,70 @@
+gpu.module @test {
+gpu.func @test_store_nd_1d(%arg0: memref<16xf32>){
+  %c0 = arith.constant 0 : index
+  %1 = arith.constant dense<1.000000e+00> : vector<16xf32>
+  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+  xegpu.store_nd %1, %0 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+  gpu.return
+}
+}
+
+// -----
+gpu.module @test {
+gpu.func @test_store_nd_2d(%arg0: memref<16x16xf16>){
+  %c0 = arith.constant 0 : index
+  %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16>
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  gpu.return
+}
+}
+
+
+
+// -----
+gpu.module @test {
+gpu.func @test_load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+  %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+  xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+  gpu.return
+}
+}
+
+// -----
+gpu.module @test {
+gpu.func @test_load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  gpu.return
+}
+}
+
+// -----
+gpu.module @test {
+gpu.func @test_load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
+  %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
+  %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  gpu.return
+}
+}
+
+// -----
+gpu.module @test {
+gpu.func @test_dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+  %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+}
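
The patch above replaces the ad-hoc "op<N>"/"r<N>" attribute names with the layout_operand_/layout_result_ prefixes, and filterTemporaryLayoutAttributes strips those temporary attributes back out when the distributed ops are created. Below is a small self-contained sketch of that naming and filtering convention; it uses plain std::string rather than mlir::NamedAttribute, and the helper names are illustrative.

#include <iostream>
#include <string>

static const char *const kOperandLayoutPrefix = "layout_operand_";
static const char *const kResultLayoutPrefix = "layout_result_";

// Build the temporary attribute name attached to an operand by the layout
// propagation step.
static std::string operandLayoutName(unsigned operandNumber) {
  return kOperandLayoutPrefix + std::to_string(operandNumber);
}

// True for attributes that should be dropped before the final ops are built.
static bool isTemporaryLayoutName(const std::string &name) {
  return name.rfind(kOperandLayoutPrefix, 0) == 0 ||
         name.rfind(kResultLayoutPrefix, 0) == 0;
}

int main() {
  std::cout << operandLayoutName(1) << "\n";                     // layout_operand_1
  std::cout << isTemporaryLayoutName("layout_result_0") << "\n"; // 1
  std::cout << isTemporaryLayoutName("l1_hint") << "\n";         // 0
}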

>From 2f2ec101b06f5f38459eea46454b93d6e47c1278 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 14 Apr 2025 17:23:00 +0000
Subject: [PATCH 44/45] fix issues

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        |  12 +-
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 129 ++++++++++++------
 .../Dialect/XeGPU/subgroup-distribution.mlir  | 108 +++++++--------
 3 files changed, 151 insertions(+), 98 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index f8e04f9b3aef7..1dafc9936107e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -676,12 +676,12 @@ void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
 // XeGPU_DpasOp
 //===----------------------------------------------------------------------===//
 LogicalResult DpasOp::verify() {
-  // int64_t lhsRank = getLhsType().getRank();
-  // int64_t rhsRank = getRhsType().getRank();
-  // int64_t resRank = getResultType().getRank();
-  // auto lhsShape = getLhsType().getShape();
-  // auto rhsShape = getRhsType().getShape();
-  // auto resShape = getResultType().getShape();
+  int64_t lhsRank = getLhsType().getRank();
+  int64_t rhsRank = getRhsType().getRank();
+  int64_t resRank = getResultType().getRank();
+  auto lhsShape = getLhsType().getShape();
+  auto rhsShape = getRhsType().getShape();
+  auto resShape = getResultType().getShape();
 
   if (getAcc()) {
     if (getAcc().getType() != getResultType())
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 34e0ac7b2d094..f64f0b5235705 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -21,6 +21,7 @@
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/PatternMatch.h"
@@ -679,17 +680,7 @@ void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
   for (OpOperand &user : v.getUses()) {
     Operation *owner = user.getOwner();
     unsigned operandNumber = user.getOperandNumber();
-    /// If the user is a DpasOp, set A, B or C layout attributes.
-    if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
-      if (operandNumber == 0)
-        dpasOp.setALayoutAttr(layout);
-      else if (operandNumber == 1)
-        dpasOp.setBLayoutAttr(layout);
-      else if (operandNumber == 2)
-        dpasOp.setCLayoutAttr(layout);
-      continue;
-    }
-    /// For every other user, use a generic attribute name.
+    /// Use a generic name for ease of querying the layout attribute later.
     std::string attrName =
         operandLayoutNamePrefix + std::to_string(operandNumber);
     owner->setAttr(attrName, layout);
@@ -824,18 +815,66 @@ static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
   return distVecTyOrFailure.value();
 }
 
-static Value reshapeDistributedVecType(Value orig, VectorType expected,
-                                       PatternRewriter &rewriter) {
-  assert(isa<VectorType>(orig.getType()) && "expecting vector type");
-  auto origVecType = cast<VectorType>(orig.getType());
-  /// No need to reconcile if the types are the same.
-  if (origVecType == expected)
+static xegpu::TensorDescType dropLayouts(xegpu::TensorDescType tensorDesc) {
+  return xegpu::TensorDescType::get(
+      tensorDesc.getContext(), tensorDesc.getShape(),
+      tensorDesc.getElementType(), tensorDesc.getEncoding(),
+      xegpu::LayoutAttr());
+}
+
+template <typename T>
+static Value resolveDistributedTy(Value orig, T expected,
+                                  PatternRewriter &rewriter) {
+  /// If orig and expected types are the same, return orig.
+  if (orig.getType() == expected)
     return orig;
-  auto castOp =
-      rewriter.create<vector::ShapeCastOp>(orig.getLoc(), expected, orig);
-  return castOp.getResult();
+  /// If orig is a vector type, create a shape cast op to reconcile the types.
+  if (isa<VectorType>(orig.getType())) {
+    auto castOp =
+        rewriter.create<vector::ShapeCastOp>(orig.getLoc(), expected, orig);
+    return castOp.getResult();
+  }
+  /// If orig is a tensor descriptor type, create an unrealized conversion cast
+  /// op to reconcile the types.
+  if (isa<xegpu::TensorDescType>(orig.getType())) {
+    auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
+                                                              expected, orig);
+    return castOp.getResult(0);
+  }
+  llvm_unreachable("Unsupported type for reconciliation");
+  return orig;
 }
 
+// static Value reconcileDistributedTensorDescTy(Value orig,
+//                                               xegpu::TensorDescType expected,
+//                                               PatternRewriter &rewriter) {
+//   assert(isa<xegpu::TensorDescType>(orig.getType()) &&
+//          "expecting tensor descriptor type");
+//   auto origTensorDescTy = cast<xegpu::TensorDescType>(orig.getType());
+//   /// No need to reconcile if the types are the same.
+//   if (origTensorDescTy == expected)
+//     return orig;
+//   auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
+//                                                             expected, orig);
+//   return castOp.getResult(0);
+// }
+
+// // unify above 2 functions with a template
+// template <typename T>
+// static Value reconcileDistributedType(Value orig, T expected,
+//                                        PatternRewriter &rewriter) {
+//   if constexpr (std::is_same_v<T, VectorType>) {
+//     return reconcileDistributedVecType(orig, expected, rewriter);
+//   } else if constexpr (std::is_same_v<T, xegpu::TensorDescType>) {
+//     return reconcileDistributedTensorDescTy(orig, expected, rewriter);
+//   } else {
+//     static_assert(llvm::is_one_of<T, VectorType,
+//     xegpu::TensorDescType>::value,
+//                   "Unsupported type for reconciliation");
+//   }
+//   return orig;
+// }
+
 static SmallVector<NamedAttribute>
 filterTemporaryLayoutAttributes(ArrayRef<NamedAttribute> attrs) {
   SmallVector<NamedAttribute> newAttrs;
@@ -951,7 +990,7 @@ struct MoveFuncBodyToWarpExecuteOnLane0
 ///                                 -> !xegpu.tensor_desc<4x8xf32>
 ///
 /// ```
-struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
+struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
                                 PatternRewriter &rewriter) const override {
@@ -993,8 +1032,11 @@ struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
       newDescOperands.push_back(newWarpOp.getResult(i));
     }
     rewriter.setInsertionPointAfter(newWarpOp);
+    auto distributedTensorDescTy =
+        dropLayouts(descOp.getType()); /// Distributed tensor descriptor type
+                                       /// does not contain layout info.
     auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
-        newWarpOp.getLoc(), descOp.getType(), newDescOperands,
+        newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
         descOp->getAttrs());
 
     Value distributedVal = newWarpOp.getResult(operandIdx);
@@ -1027,7 +1069,7 @@ struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
 ///     !xegpu.tensor_desc<4x8xf32>
 ///
 /// ```
-struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
+struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
                                 PatternRewriter &rewriter) const override {
@@ -1065,19 +1107,24 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
     rewriter.setInsertionPointAfter(newWarpOp);
     SmallVector<Value> newStoreOperands;
 
-    /// For the value operand, there can be a conflict between the vector type
+    /// For the value operand, there can be a mismatch between the vector type
     /// distributed by the warp op and (xegpu-specific) distributed type
-    /// supported by the store op. We reconcile these mismatches by inserting
-    /// a cast. These gets cancelled out later.
+    /// supported by the store op. Type mismatch must be resolved using
+    /// appropriate cast op.
     auto storeNdDistributedValueTyOrFailure =
         storeOp.getTensorDescType().getDistributedVectorType();
     if (failed(storeNdDistributedValueTyOrFailure))
       return rewriter.notifyMatchFailure(
           storeOp, "Failed to get distributed vector type for the store op");
-    newStoreOperands.push_back(reshapeDistributedVecType(
+    newStoreOperands.push_back(resolveDistributedTy(
         newWarpOp.getResult(newRetIndices[0]),
         storeNdDistributedValueTyOrFailure.value(), rewriter));
-    newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[1]));
+    /// For the tensor descriptor operand, the layout attribute is dropped after
+    /// distribution. Types need to be resolved in this case as well.
+    auto distributedTensorDescTy = dropLayouts(storeOp.getTensorDescType());
+    newStoreOperands.push_back(
+        resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
+                             distributedTensorDescTy, rewriter));
 
     rewriter.create<xegpu::StoreNdOp>(
         newWarpOp.getLoc(), TypeRange{}, newStoreOperands,
@@ -1117,7 +1164,7 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
 ///   %ld = xegpu.load_nd %r#0: !xegpu.tensor_desc<4x8xf32> -> vector<4x1xf32>
 ///
 /// ```
-struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
+struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
                                 PatternRewriter &rewriter) const override {
@@ -1161,13 +1208,13 @@ struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
     /// warp op and (xegpu-specific) distributed type supported by the load
     /// op. We reconcile these mismatches by inserting a cast.
     newLoadOp =
-        reshapeDistributedVecType(newLoadOp, distributedTypeByWarpOp, rewriter);
+        resolveDistributedTy(newLoadOp, distributedTypeByWarpOp, rewriter);
     rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
     return success();
   }
 };
 
-struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
+struct DpasDistribution final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
                                 PatternRewriter &rewriter) const override {
@@ -1179,15 +1226,21 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
 
     auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
     unsigned operandIdx = operand->getOperandNumber();
-    xegpu::LayoutAttr layoutA = dpasOp.getALayoutAttr();
-    xegpu::LayoutAttr layoutB = dpasOp.getBLayoutAttr();
+    auto layoutAName =
+        llvm::formatv("{0}{1}", operandLayoutNamePrefix, 0).str();
+    auto layoutBName =
+        llvm::formatv("{0}{1}", operandLayoutNamePrefix, 1).str();
     auto layoutCName = llvm::formatv("{0}{1}", resultLayoutNamePrefix, 0).str();
+    xegpu::LayoutAttr layoutA =
+        dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutAName);
+    xegpu::LayoutAttr layoutB =
+        dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutBName);
     xegpu::LayoutAttr layoutOut =
         dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutCName);
     if (!layoutA || !layoutB || !layoutOut)
       return rewriter.notifyMatchFailure(
           dpasOp,
-          "the xegpu::Dpas op lacks sg_map attribute for A, B or output");
+          "the xegpu::Dpas op lacks layout attribute for A, B or output");
 
     auto distLhsTypeByWarpOpOrFailure =
         getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
@@ -1232,7 +1285,7 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
     }
 
     for (auto i : newRetIndices) {
-      newDpasOperands.push_back(reshapeDistributedVecType(
+      newDpasOperands.push_back(resolveDistributedTy(
           newWarpOp.getResult(i),
           newDpasOperandExpectedTypes[newDpasOperands.size()], rewriter));
     }
@@ -1241,7 +1294,7 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
         newDpasOperands, dpasOp->getAttrs());
     Value disributedVal = newWarpOp.getResult(operandIdx);
     /// Reconile the output type.
-    disributedVal = reshapeDistributedVecType(
+    disributedVal = resolveDistributedTy(
         disributedVal,
         getDistributedVectorType(layoutOut, dpasOp.getResultType()), rewriter);
     rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
@@ -1266,8 +1319,8 @@ struct XeGPUSubgroupDistributePass final
 
 void xegpu::populateXeGPUSubgroupDistributePatterns(
     RewritePatternSet &patterns) {
-  patterns.add<SubgroupOpTensorDescOp, SubgroupOpStoreNd, SubgroupOpLoadNd,
-               SubgroupOpDpas>(patterns.getContext());
+  patterns.add<CreateNdDescDistribution, StoreNdDistribution,
+               LoadNdDistribution, DpasDistribution>(patterns.getContext());
 }
 
 void XeGPUSubgroupDistributePass::runOnOperation() {
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
index 6369eb7dd035e..7197ddfb286eb 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
@@ -8,63 +8,63 @@ gpu.func @test_store_nd_1d(%arg0: memref<16xf32>){
 }
 }
 
-// -----
-gpu.module @test {
-gpu.func @test_store_nd_2d(%arg0: memref<16x16xf16>){
-  %c0 = arith.constant 0 : index
-  %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16>
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-  gpu.return
-}
-}
+// // -----
+// gpu.module @test {
+// gpu.func @test_store_nd_2d(%arg0: memref<16x16xf16>){
+//   %c0 = arith.constant 0 : index
+//   %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16>
+//   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+//   xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+//   gpu.return
+// }
+// }
 
 
 
-// -----
-gpu.module @test {
-gpu.func @test_load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
-  %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-  xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
-  gpu.return
-}
-}
+// // -----
+// gpu.module @test {
+// gpu.func @test_load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
+//   %c0 = arith.constant 0 : index
+//   %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+//   %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+//   %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+//   xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+//   gpu.return
+// }
+// }
 
-// -----
-gpu.module @test {
-gpu.func @test_load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-  %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-  gpu.return
-}
-}
+// // -----
+// gpu.module @test {
+// gpu.func @test_load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+//   %c0 = arith.constant 0 : index
+//   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+//   %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+//   %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+//   xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+//   gpu.return
+// }
+// }
 
-// -----
-gpu.module @test {
-gpu.func @test_load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
-  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
-  %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
-  %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-  gpu.return
-}
-}
+// // -----
+// gpu.module @test {
+// gpu.func @test_load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+//   %c0 = arith.constant 0 : index
+//   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+//   %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
+//   %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
+//   %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+//   xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+//   gpu.return
+// }
+// }
 
-// -----
-gpu.module @test {
-gpu.func @test_dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-  %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-  xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  gpu.return
-}
-}
+// // -----
+// gpu.module @test {
+// gpu.func @test_dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
+//   %c0 = arith.constant 0 : index
+//   %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+//   %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+//   xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+//   gpu.return
+// }
+// }
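
The resolveDistributedTy helper introduced above only acts when the type produced by the warp op differs from the xegpu-preferred distributed type: vectors are reconciled with a vector.shape_cast, tensor descriptors (whose layout is dropped by dropLayouts) with an unrealized conversion cast, and matching types pass through untouched. The following toy model shows only that control flow; the struct and function names are made up, and strings stand in for MLIR types.

#include <iostream>
#include <string>

struct ToyValue {
  std::string type;
  bool casted = false;
};

// If the value already has the expected type it is returned as-is; otherwise
// a "cast" is recorded (standing in for shape_cast / unrealized conversion).
static ToyValue resolveToExpected(ToyValue v, const std::string &expected) {
  if (v.type == expected)
    return v; // nothing to reconcile
  return ToyValue{expected, true};
}

int main() {
  ToyValue same = resolveToExpected({"vector<16x1xf32>"}, "vector<16x1xf32>");
  ToyValue cast = resolveToExpected({"vector<16xf32>"}, "vector<16x1xf32>");
  std::cout << same.casted << " " << cast.casted << "\n"; // 0 1
}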

>From 2ae3543e7a56f8fb37f5ee86c23b980b451e1aac Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 14 Apr 2025 21:24:31 +0000
Subject: [PATCH 45/45] fix issues

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    |  49 ++++----
 .../Dialect/XeGPU/subgroup-distribution.mlir  | 108 +++++++++---------
 2 files changed, 83 insertions(+), 74 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index f64f0b5235705..05d15a7c71e58 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -717,6 +717,11 @@ static LogicalResult attachLayoutAttributes(
     /// If no results, move on.
     if (op->getNumResults() == 0)
       return WalkResult::advance();
+    /// If all the results are scalars, move on.
+    if (llvm::all_of(op->getResultTypes(),
+                     [](Type t) { return t.isIntOrIndexOrFloat(); }))
+      return WalkResult::advance();
+
     if (auto tensorDescTy =
             dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
       auto layoutInfo = getLayoutInfoForResult(op->getResult(0));
@@ -738,7 +743,7 @@ static LogicalResult attachLayoutAttributes(
       op->erase();
       return WalkResult::advance();
     }
-    /// Otherwise simply attach the sg_map to the op itself.
+    /// Otherwise simply attach the layout to the op itself.
     for (auto [i, r] : llvm::enumerate(op->getResults())) {
       auto layoutInfo = getLayoutInfoForResult(r);
       if (layoutInfo) {
@@ -1199,14 +1204,19 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
     if (failed(loadNdDistValueTyOrFailure))
       return rewriter.notifyMatchFailure(
           loadOp, "Failed to get distributed vector type for the load op");
+    auto distributedTensorDescTy =
+        dropLayouts(loadOp.getTensorDescType()); /// Distributed tensor
+                                                 /// descriptor type does not
+                                                 /// contain layout info.
     Value newLoadOp = rewriter.create<xegpu::LoadNdOp>(
         newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
-        newWarpOp->getResult(newRetIndices[0]),
+        resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]),
+                             distributedTensorDescTy, rewriter),
         filterTemporaryLayoutAttributes(loadOp->getAttrs()));
     Value distributedVal = newWarpOp.getResult(operandIdx);
     /// There can be a conflict between the vector type distributed by the
     /// warp op and (xegpu-specific) distributed type supported by the load
-    /// op. We reconcile these mismatches by inserting a cast.
+    /// op. Resolve these mismatches by inserting a cast.
     newLoadOp =
         resolveDistributedTy(newLoadOp, distributedTypeByWarpOp, rewriter);
     rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
@@ -1274,29 +1284,28 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
     rewriter.setInsertionPointAfter(newWarpOp);
     SmallVector<Value> newDpasOperands;
     SmallVector<VectorType> newDpasOperandExpectedTypes;
-    /// Reconcile the distributed types with the original types.
+    /// Resolve the distributed types with the original types.
     newDpasOperandExpectedTypes.push_back(
         getDistributedVectorType(layoutA, dpasOp.getLhsType()));
     newDpasOperandExpectedTypes.push_back(
         getDistributedVectorType(layoutB, dpasOp.getRhsType()));
-    if (dpasOp.getAcc()) {
-      newDpasOperandExpectedTypes.push_back(
-          getDistributedVectorType(layoutOut, dpasOp.getResultType()));
-    }
-
-    for (auto i : newRetIndices) {
-      newDpasOperands.push_back(resolveDistributedTy(
-          newWarpOp.getResult(i),
-          newDpasOperandExpectedTypes[newDpasOperands.size()], rewriter));
+    auto distributedResultTy =
+        getDistributedVectorType(layoutOut, dpasOp.getResultType());
+    if (dpasOp.getAcc())
+      newDpasOperandExpectedTypes.push_back(distributedResultTy);
+
+    for (unsigned i = 0; i < newRetIndices.size(); i++) {
+      newDpasOperands.push_back(
+          resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
+                               newDpasOperandExpectedTypes[i], rewriter));
     }
-    auto newDpasOp = rewriter.create<xegpu::DpasOp>(
-        newWarpOp->getLoc(), distResultTypeByWarpOpOrFailure.value(),
-        newDpasOperands, dpasOp->getAttrs());
+    Value newDpasOp = rewriter.create<xegpu::DpasOp>(
+        newWarpOp->getLoc(), distributedResultTy, newDpasOperands,
+        filterTemporaryLayoutAttributes(dpasOp->getAttrs()));
     Value disributedVal = newWarpOp.getResult(operandIdx);
-    /// Reconile the output type.
-    disributedVal = resolveDistributedTy(
-        disributedVal,
-        getDistributedVectorType(layoutOut, dpasOp.getResultType()), rewriter);
+    /// Resolve the output type.
+    newDpasOp = resolveDistributedTy(
+        newDpasOp, distResultTypeByWarpOpOrFailure.value(), rewriter);
     rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
     return success();
   }
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
index 7197ddfb286eb..6369eb7dd035e 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
@@ -8,63 +8,63 @@ gpu.func @test_store_nd_1d(%arg0: memref<16xf32>){
 }
 }
 
-// // -----
-// gpu.module @test {
-// gpu.func @test_store_nd_2d(%arg0: memref<16x16xf16>){
-//   %c0 = arith.constant 0 : index
-//   %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16>
-//   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-//   xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-//   gpu.return
-// }
-// }
+// -----
+gpu.module @test {
+gpu.func @test_store_nd_2d(%arg0: memref<16x16xf16>){
+  %c0 = arith.constant 0 : index
+  %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16>
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  gpu.return
+}
+}
 
 
 
-// // -----
-// gpu.module @test {
-// gpu.func @test_load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
-//   %c0 = arith.constant 0 : index
-//   %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-//   %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
-//   %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-//   xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
-//   gpu.return
-// }
-// }
+// -----
+gpu.module @test {
+gpu.func @test_load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+  %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+  xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+  gpu.return
+}
+}
 
-// // -----
-// gpu.module @test {
-// gpu.func @test_load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
-//   %c0 = arith.constant 0 : index
-//   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-//   %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-//   %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-//   xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-//   gpu.return
-// }
-// }
+// -----
+gpu.module @test {
+gpu.func @test_load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  gpu.return
+}
+}
 
-// // -----
-// gpu.module @test {
-// gpu.func @test_load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
-//   %c0 = arith.constant 0 : index
-//   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
-//   %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
-//   %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
-//   %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-//   xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-//   gpu.return
-// }
-// }
+// -----
+gpu.module @test {
+gpu.func @test_load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
+  %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
+  %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  gpu.return
+}
+}
 
-// // -----
-// gpu.module @test {
-// gpu.func @test_dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
-//   %c0 = arith.constant 0 : index
-//   %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-//   %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-//   xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-//   gpu.return
-// }
-// }
+// -----
+gpu.module @test {
+gpu.func @test_dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+  %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+}
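
In the reworked DpasDistribution pattern above, the i-th value yielded out of the new warp op (newWarpOp.getResult(newRetIndices[i])) is resolved against the i-th expected distributed operand type, so the operand list (A, B, then the optional accumulator) and the expected-type list must stay in lockstep. A toy illustration of that pairing follows; strings stand in for MLIR types and all names are hypothetical.

#include <cassert>
#include <iostream>
#include <string>
#include <vector>

// Pair each yielded warp-op result with the expected distributed operand
// type at the same index, recording a cast only on mismatch.
static std::vector<std::string>
pairYieldedWithExpected(const std::vector<std::string> &yielded,
                        const std::vector<std::string> &expected) {
  assert(yielded.size() == expected.size() &&
         "one expected distributed type per yielded warp-op result");
  std::vector<std::string> resolved;
  for (size_t i = 0; i < yielded.size(); ++i)
    resolved.push_back(yielded[i] == expected[i]
                           ? yielded[i]
                           : expected[i] + " (via cast from " + yielded[i] + ")");
  return resolved;
}

int main() {
  auto ops = pairYieldedWithExpected({"tyA_from_warp", "tyB_from_warp"},
                                     {"tyA_from_warp", "tyB_xegpu_preferred"});
  for (const auto &o : ops)
    std::cout << o << "\n";
}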


