[Mlir-commits] [mlir] d30554b - [mlir][xegpu] SIMT distribution patterns for XeGPU CreateNdTdesc, LoadNd, StoreNd and Dpas Ops. (#135271)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Wed Apr 30 12:16:50 PDT 2025


Author: Charitha Saumya
Date: 2025-04-30T12:16:47-07:00
New Revision: d30554b19edc27bc9ca3475b888c1b3e4eda87c4

URL: https://github.com/llvm/llvm-project/commit/d30554b19edc27bc9ca3475b888c1b3e4eda87c4
DIFF: https://github.com/llvm/llvm-project/commit/d30554b19edc27bc9ca3475b888c1b3e4eda87c4.diff

LOG: [mlir][xegpu] SIMT distribution patterns for XeGPU CreateNdTdesc, LoadNd, StoreNd and Dpas Ops. (#135271)

This PR adds the SIMT distribution patterns for create_nd_tdesc, load_nd, store_nd and dpas XeGPU ops.

Added: 
    mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
    mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt
    mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
    mlir/test/Dialect/XeGPU/subgroup-distribution.mlir

Modified: 
    mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
    mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
    mlir/lib/Dialect/XeGPU/CMakeLists.txt
    mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
    mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
    mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
    mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir

Removed: 
    


################################################################################
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 3cb71788a15ef..ecab280b76f55 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -189,11 +189,6 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
         return scatter_attr.getChunkSize().getInt();
       return 1;
     }
-
-    // This returns a vector type that represents the fragment of data owned by
-    // a work item in SIMT mode if this tensor descriptor is used in a XeGPU
-    // load/store operation.
-    FailureOr<VectorType> getDistributedVectorType();
   }];
 
   let hasCustomAssemblyFormat = true;

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 63ea26df06937..3e94021c7a1ea 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -16,6 +16,8 @@ namespace xegpu {
 
 /// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`.
 void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
+/// Appends patterns for XeGPU SIMT distribution into `patterns`.
+void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns);
 
 } // namespace xegpu
 } // namespace mlir
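
For reference, here is a minimal usage sketch of the new pattern-population entry point. This is not part of the commit; the pass boilerplate and the exact greedy-driver call are assumptions.

    // Inside a hypothetical pass: collect the XeGPU SIMT distribution patterns
    // and apply them greedily to the current operation.
    RewritePatternSet patterns(&getContext());
    xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
      signalPassFailure();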

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
new file mode 100644
index 0000000000000..3616fa614e7f9
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -0,0 +1,57 @@
+//===- XeGPUUtils.h - XeGPU Utilities ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_
+#define MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_
+
+#include "mlir/IR/BuiltinTypes.h"
+namespace mlir {
+
+class VectorType;
+namespace xegpu {
+class LayoutAttr;
+class TensorDescType;
+} // namespace xegpu
+
+namespace xegpu {
+
+/// If the tensor descriptor has a layout attribute, it is used in SIMT mode.
+/// In this mode, the distributed vector shape is determined as follows:
+/// Definitions:
+///        lane_data_size = lane_data[0] × lane_data[1]
+///        subgroup_size = lane_layout[0] × lane_layout[1]
+///        distribution_unit_size = subgroup_size × lane_data_size
+///
+/// Case 1: Regular loads/stores.
+/// The following conditions must be met:
+///        * tensor_desc[0] == lane_layout[0]
+/// Distributed vector is a 1D vector with shape:
+///        [chunk_size]
+///
+/// Case 2: Block loads/stores
+/// Additional definitions:
+///        tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length
+///        n_distribution_units = tensor_size / distribution_unit_size
+///        fragment_size = n_distribution_units * lane_data_size
+/// Given above definitions, the following conditions must be met:
+///        * tensor_desc[0] % (lane_layout[0] × lane_data[0]) == 0
+///        * tensor_desc[1] % (lane_layout[1] × lane_data[1]) == 0
+/// Distributed vector is a 1D vector with shape:
+///        [fragment_size]
+FailureOr<VectorType> getDistributedVectorType(xegpu::TensorDescType tdescTy);
+
+/// Helper to get the distributed vector type for a given vector type according
+/// to a given LayoutAttr.
+FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
+                                               LayoutAttr layout);
+
+} // namespace xegpu
+
+} // namespace mlir
+
+#endif // MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_
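
As a worked illustration of the block load/store rule documented above (the concrete values are chosen only for illustration): for a tensor_desc of shape 8x16 with f32 elements, array_length = 1, lane_layout = [1, 16] and lane_data = [1, 1]:

    lane_data_size         = 1 * 1      = 1
    subgroup_size          = 1 * 16     = 16
    distribution_unit_size = 16 * 1     = 16
    tensor_size            = 8 * 16 * 1 = 128
    n_distribution_units   = 128 / 16   = 8
    fragment_size          = 8 * 1      = 8

so getDistributedVectorType returns vector<8xf32>. For the regular (scattered) case the result is simply a 1D vector of chunk_size elements.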

diff --git a/mlir/lib/Dialect/XeGPU/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/CMakeLists.txt
index 9f57627c321fb..31167e6af908b 100644
--- a/mlir/lib/Dialect/XeGPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(IR)
 add_subdirectory(Transforms)
+add_subdirectory(Utils)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index b2d217d192934..6790c5e3af2c0 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -376,74 +376,6 @@ LogicalResult TensorDescType::verify(
   return success();
 }
 
-// If tensor descriptor has a layout attribute it is used in SIMT mode.
-// In this mode, the distributed vector shape is determined as follows:
-// Definitions:
-//        lane_data_size = lane_data[0] × lane_data[1]
-//        subgroup_size = lane_layout[0] × lane_layout[1]
-//        distribution_unit_size = subgroup_size × lane_data_size
-// ---------------------------------------------------------------------
-// Case 1: Regular loads/stores.
-// ---------------------------------------------------------------------
-// The following conditions must be met:
-//        * tensor_desc[0] == lane_layout[0]
-// Distributed vector is a 1D vector with shape:
-//        [chunk_size]
-// ---------------------------------------------------------------------
-// Case 2: Block loads/stores
-// ---------------------------------------------------------------------
-// Additional definitions:
-//        tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length
-//        n_distribution_units = tensor_size / distribution_unit_size
-//        fragment_size = n_distribution_units * lane_data_size
-// Given above definitions, the following conditions must be met:
-//        * tensor_desc[0] % (lane_layout[0] × lane_data[0]) == 0
-//        * tensor_desc[1] % (lane_layout[1] × lane_data[1]) == 0
-// Distributed vector is a 1D vector with shape:
-//        [fragment_size]
-FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
-  auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
-  // It only works for subgroup level layout, which only has lane_layout
-  // and lane_data, and is to distribute a SIMD code into SIMT code.
-  if (!layout || !layout.isSgLayout())
-    return failure();
-
-  SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
-  SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
-  auto tdescShape = getShape();
-
-  // compute sgSize by multiply elements of laneLayout
-  // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1]
-  // e.g. for 1D layout, sgSize = laneLayout[0]
-  auto sgSize = std::accumulate(laneLayout.begin(), laneLayout.end(), 1,
-                                std::multiplies<int64_t>());
-
-  // Case 1: regular loads/stores
-  auto scatterAttr = getEncodingAsScatterTensorDescAttr();
-  if (scatterAttr) {
-    auto chunkSize = scatterAttr.getChunkSize().getInt();
-    // Verify if the first dimension of the tensor descriptor shape is
-    // distributable.
-    assert(tdescShape[0] == laneLayout[0] &&
-           "tensor descriptor shape is not distributable");
-    return VectorType::get({chunkSize}, getElementType());
-  }
-
-  // Case 2: block loads/stores
-  // Check if the tensor descriptor shape is distributable.
-  int64_t tensorSize = 1;
-  for (auto [tdescDim, laneDim, laneDataDim] :
-       llvm::zip_equal(tdescShape, laneLayout, laneData)) {
-    assert((tdescDim % (laneDim * laneDataDim) == 0) &&
-           "tensor descriptor shape is not distributable");
-    tensorSize *= tdescDim;
-  }
-  // tensorSize must be adjusted for array_length.
-  tensorSize *= getArrayLength();
-
-  return VectorType::get({tensorSize / sgSize}, getElementType());
-}
-
 } // namespace xegpu
 } // namespace mlir
 

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 9f041aae511df..901e02d3c9cf5 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -16,4 +16,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
   MLIRPass
   MLIRTransforms
   MLIRGPUDialect
+  MLIRXeGPUUtils
+  MLIRGPUUtils
+  MLIRVectorTransforms
 )

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 721a815cf76b9..019032f7743bf 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -10,14 +10,33 @@
 #include "mlir/Analysis/DataFlow/SparseAnalysis.h"
 #include "mlir/Analysis/DataFlowFramework.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/IR/AffineMap.h"
+#include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeRange.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/Visitors.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/InliningUtils.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/InterleavedRange.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -28,27 +47,32 @@ namespace xegpu {
 } // namespace xegpu
 } // namespace mlir
 
+#define DEBUG_TYPE "xegpu-subgroup-distribute"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
 using namespace mlir;
 using namespace mlir::dataflow;
 
 /// HW dependent constants.
 /// TODO: These constants should be queried from the target information.
-constexpr unsigned subgroupSize = 16; // How many work items in a subgroup.
+constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup.
 /// If DPAS A or B operands have low precision element types they must be packed
 /// according to the following sizes.
 constexpr unsigned packedSizeInBitsForDefault =
     16; // Minimum packing size per register for DPAS A.
 constexpr unsigned packedSizeInBitsForDpasB =
     32; // Minimum packing size per register for DPAS B.
+static const char *const operandLayoutNamePrefix = "layout_operand_";
+static const char *const resultLayoutNamePrefix = "layout_result_";
 
 namespace {
 
-///===----------------------------------------------------------------------===///
-/// Layout
-///===----------------------------------------------------------------------===///
+//===----------------------------------------------------------------------===//
+// Layout
+//===----------------------------------------------------------------------===//
 
-/// Helper class to store the ND layout of work items within a subgroup and data
-/// owned by each work item.
+/// Helper class to store the ND layout of lanes within a subgroup and data
+/// owned by each lane.
 struct Layout {
   SmallVector<int64_t, 3> layout;
   Layout() = default;
@@ -67,235 +91,248 @@ int64_t Layout::operator[](size_t idx) const {
   return layout[idx];
 }
 
-/// WiLayout represents the layout of work items within a subgroup when it
-/// accesses some value. WiData represents the layout of data owned by each work
-/// item.
-using WiLayout = Layout;
-using WiData = Layout;
+/// LaneLayout represents the logical layout of lanes within a subgroup when it
+/// accesses some value. LaneData represents the logical layout of data owned by
+/// each work item.
+using LaneLayout = Layout;
+using LaneData = Layout;
 
-///===----------------------------------------------------------------------===///
-/// SGMap
-///===----------------------------------------------------------------------===///
+//===----------------------------------------------------------------------===//
+// LayoutInfo
+//===----------------------------------------------------------------------===//
 
-/// Helper class for tracking the analysis state of a value. For SGPropagation,
-/// the analysis state is simply the wi_layout and wi_data of each value.
-/// Purpose of this analysis to propagate some unique layout for each value in
-/// the program starting from some known values (like DPAS, StoreNd, etc.).
+/// Helper class for tracking the analysis state of an MLIR value. For layout
+/// propagation, the analysis state is simply the lane_layout and lane_data of
+/// each value. Purpose of this analysis is to propagate some unique layout for
+/// each value in the program starting from a set of anchor operations (like
+/// DPAS, StoreNd, etc.).
 ///
-/// Given this, SGMap satisifies the following properties:
-///  1) SGMap is a lattice with two states - assigned and not assigned.
-///  2) Two SGMap values are equal if they are both assigned or both not
-///  assigned. The concrete value of assigned state does not matter.
+/// Given this, LayoutInfo satisfies the following properties:
+///  1) A LayoutInfo value can be in one of two states - `assigned` or `not
+///  assigned`.
+///  2) Two LayoutInfo values are equal if they are both assigned or
+///  both not assigned. The concrete value of assigned state does not matter.
 ///  3) The meet operator works as follows:
 ///     - If current state is assigned, return the current state. (already
 ///     a unique layout is assigned. don't change it)
 ///     - Otherwise, return the other state.
 
-struct SGMap {
+struct LayoutInfo {
 private:
-  WiLayout wiLayout;
-  WiData wiData;
+  LaneLayout laneLayout;
+  LaneData laneData;
 
 public:
-  SGMap() = default;
-  SGMap(const WiLayout &layout, const WiData &data)
-      : wiLayout(layout), wiData(data) {}
+  LayoutInfo() = default;
+  LayoutInfo(const LaneLayout &layout, const LaneData &data)
+      : laneLayout(layout), laneData(data) {}
 
-  /// Two lattice values are equal if they have `some` layout. The actual
-  /// content of the layout does not matter.
-  bool operator==(const SGMap &other) const {
+  // Two lattice values are equal if they have `some` layout. The actual
+  // content of the layout does not matter.
+  bool operator==(const LayoutInfo &other) const {
     return this->isAssigned() == other.isAssigned();
   }
 
-  static SGMap meet(const SGMap &lhs, const SGMap &rhs);
+  static LayoutInfo meet(const LayoutInfo &lhs, const LayoutInfo &rhs);
 
-  static SGMap join(const SGMap &lhs, const SGMap &rhs);
+  static LayoutInfo join(const LayoutInfo &lhs, const LayoutInfo &rhs);
 
   void print(raw_ostream &os) const;
 
-  bool isAssigned() const { return wiLayout.size() > 0 && wiData.size() > 0; }
+  bool isAssigned() const {
+    return laneLayout.size() > 0 && laneData.size() > 0;
+  }
 
-  SGMap getTransposedLayout(ArrayRef<int64_t> permutation) const;
+  LayoutInfo getTransposedLayout(ArrayRef<int64_t> permutation) const;
 
-  const WiLayout &getLayout() const { return wiLayout; }
-  const WiData &getData() const { return wiData; }
+  const LaneLayout &getLayout() const { return laneLayout; }
+  const LaneData &getData() const { return laneData; }
+  ArrayRef<int64_t> getLayoutAsArrayRef() const { return laneLayout.layout; }
+  ArrayRef<int64_t> getDataAsArrayRef() const { return laneData.layout; }
 };
 
-void SGMap::print(raw_ostream &os) const {
+void LayoutInfo::print(raw_ostream &os) const {
   if (isAssigned()) {
-    os << "wi_layout: ";
-    wiLayout.print(os);
-    os << ", wi_data: ";
-    wiData.print(os);
+    os << "lane_layout: ";
+    laneLayout.print(os);
+    os << ", lane_data: ";
+    laneData.print(os);
   } else
     os << "Not assigned.";
 }
 
-SGMap SGMap::meet(const SGMap &lhs, const SGMap &rhs) {
+LayoutInfo LayoutInfo::meet(const LayoutInfo &lhs, const LayoutInfo &rhs) {
   if (!lhs.isAssigned())
     return rhs;
   return lhs;
 }
 
 /// Since this is a backward analysis, join method is not used.
-SGMap SGMap::join(const SGMap &lhs, const SGMap &rhs) {
-  llvm_unreachable("Join should not be triggered by SGMapPropagation.");
+LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) {
+  llvm_unreachable("Join should not be triggered by layout propagation.");
 }
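// Illustration of the lattice semantics above (editorial example based on the
// meet implementation, not part of the commit):
//   LayoutInfo unset;                                         // not assigned
//   LayoutInfo fixed(LaneLayout({1, 16}), LaneData({1, 1}));  // assigned
//   LayoutInfo::meet(unset, fixed) -> fixed   // picks up the assigned state
//   LayoutInfo::meet(fixed, unset) -> fixed   // an assigned layout is sticky
//   LayoutInfo::meet(unset, unset) -> unset   // still not assigned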
 
 /// Get the transposed layout according to the given permutation.
-SGMap SGMap::getTransposedLayout(ArrayRef<int64_t> permutation) const {
+LayoutInfo
+LayoutInfo::getTransposedLayout(ArrayRef<int64_t> permutation) const {
   if (!isAssigned())
     return {};
-  WiLayout newLayout;
-  WiData newData;
-  for (auto idx : permutation) {
-    newLayout.layout.push_back(wiLayout.layout[idx]);
-    newData.layout.push_back(wiData.layout[idx]);
+  LaneLayout newLayout;
+  LaneData newData;
+  for (int64_t idx : permutation) {
+    newLayout.layout.push_back(laneLayout.layout[idx]);
+    newData.layout.push_back(laneData.layout[idx]);
   }
-  return SGMap(newLayout, newData);
+  return LayoutInfo(newLayout, newData);
 }
 
-///===----------------------------------------------------------------------===///
-/// SGMapLattice
-///===----------------------------------------------------------------------===///
+//===----------------------------------------------------------------------===//
+// LayoutInfoLattice
+//===----------------------------------------------------------------------===//
 
-/// Lattice holding the SGMap for each value.
-struct SGMapLattice : public Lattice<SGMap> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SGMapLattice)
+/// Lattice holding the LayoutInfo for each value.
+struct LayoutInfoLattice : public Lattice<LayoutInfo> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LayoutInfoLattice)
   using Lattice::Lattice;
 };
 
 /// Helper Functions to get default layouts. A `default layout` is a layout that
 /// is assigned to a value when the layout is not fixed by some anchor operation
-/// (like DPAS). This is the natural layout work items are arranged in a
-/// subgroup.
+/// (like DPAS).
 
 /// Helper Function to get the default layout for uniform values like constants.
-/// For 1D vector, wi_layout is [subgroupSize] and wi_data is [1].
-/// For 2D vector, wi_layout is [1, subgroupSize] and wi_data is [1, 1].
-static SGMap getDefaultSgMap(unsigned rank) {
+/// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1].
+/// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1].
+static LayoutInfo getDefaultLayoutInfo(unsigned rank) {
   assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
   if (rank == 1)
-    return SGMap(WiLayout({subgroupSize}), WiData({1}));
-  return SGMap(WiLayout({1, subgroupSize}), WiData({1, 1}));
+    return LayoutInfo(LaneLayout({subgroupSize}), LaneData({1}));
+  return LayoutInfo(LaneLayout({1, subgroupSize}), LaneData({1, 1}));
 }
 
 /// Helper to get the default layout for a vector type.
-static SGMap getDefaultSgMap(VectorType vectorTy) {
-  /// Expecting a 1D or 2D vector.
+static LayoutInfo getDefaultLayoutInfo(VectorType vectorTy) {
+  // Expecting a 1D or 2D vector.
   assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
          "Expected 1D or 2D vector.");
-  /// Expecting int or float element type.
+  // Expecting int or float element type.
   assert(vectorTy.getElementType().isIntOrFloat() &&
          "Expected int or float element type.");
-  /// If the rank is 1, then return default layout for 1D vector.
+  // If the rank is 1, then return default layout for 1D vector.
   if (vectorTy.getRank() == 1)
-    return getDefaultSgMap(1);
-  /// Packing factor is determined by the element type bitwidth.
+    return getDefaultLayoutInfo(1);
+  // Packing factor is determined by the element type bitwidth.
   int packingFactor = 1;
-  auto bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
+  unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
   if (bitwidth < packedSizeInBitsForDefault)
     packingFactor = packedSizeInBitsForDefault / bitwidth;
-  return SGMap(WiLayout({1, subgroupSize}), WiData({1, packingFactor}));
+  return LayoutInfo(LaneLayout({1, subgroupSize}),
+                    LaneData({1, packingFactor}));
 }
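// Resulting default layouts for a few element types (editorial illustration):
//   vector<16x16xf32> : bitwidth 32, no packing   -> lane_layout [1, 16], lane_data [1, 1]
//   vector<16x16xf16> : bitwidth 16, no packing   -> lane_layout [1, 16], lane_data [1, 1]
//   vector<16x32xi8>  : bitwidth 8 < 16, factor 2 -> lane_layout [1, 16], lane_data [1, 2]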
 
-/// Helper Function to get the expected layouts for DPAS operands. `wi_data` is
-/// set according to the following criteria:
+/// Helper Function to get the expected layouts for DPAS operands. `lane_data`
+/// is set according to the following criteria:
 /// * For A operand, the data must be packed in minimum
 /// `packedSizeInBitsForDefault`
 /// * For B operand, the data must be packed in minimum
 /// `packedSizeInBitsForDpasB`
-static SGMap getSGMapForDPASOperand(VectorType vectorTy, unsigned operandNum) {
-  auto elementTy = vectorTy.getElementType();
+static LayoutInfo getLayoutInfoForDPASOperand(VectorType vectorTy,
+                                              unsigned operandNum) {
+  Type elementTy = vectorTy.getElementType();
   assert(elementTy.isIntOrFloat() &&
          "Expected int or float type in DPAS operands");
-  WiLayout layout({1, subgroupSize});
-  /// For B operand, data must be packed in minimum `packedDpasBSizeInBits` and
-  /// must have the VNNI format.
+  LaneLayout layout({1, subgroupSize});
+  // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and
+  // must have the VNNI format.
   if (operandNum == 1 &&
       elementTy.getIntOrFloatBitWidth() < packedSizeInBitsForDpasB) {
-    WiData data(
+    LaneData data(
         {packedSizeInBitsForDpasB / elementTy.getIntOrFloatBitWidth(), 1});
-    return SGMap(layout, data);
+    return LayoutInfo(layout, data);
   }
-  /// Otherwise, return the default layout for the vector type.
-  return getDefaultSgMap(vectorTy);
+  // Otherwise, return the default layout for the vector type.
+  return getDefaultLayoutInfo(vectorTy);
 }
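// Example (editorial illustration) for a DPAS with f16 operands:
//   A operand (operandNum 0), vector<8x16xf16>  -> lane_layout [1, 16], lane_data [1, 1]
//   B operand (operandNum 1), vector<16x16xf16> -> lane_layout [1, 16],
//                             lane_data [32 / 16, 1] = [2, 1] (VNNI packing)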
 
-///===----------------------------------------------------------------------===///
-/// SGMapPropagation
-///===----------------------------------------------------------------------===///
+//===----------------------------------------------------------------------===//
+// LayoutInfoPropagation
+//===----------------------------------------------------------------------===//
 
-/// Backward data flow analysis to propagate the wi_layout and wi_data of each
-/// value in the program. Currently, the layouts for operands DPAS, StoreNd, and
-/// StoreScatter are fixed (known before propagation). Purpose of this analysis
-/// is to propagate those known layouts to all their producers and (other)
-/// consumers.
-class SGMapPropagation : public SparseBackwardDataFlowAnalysis<SGMapLattice> {
+/// Backward data flow analysis to propagate the lane_layout and lane_data of
+/// each value in the program. Currently, the layouts for operands DPAS,
+/// StoreNd, and StoreScatter are fixed (known before propagation). Purpose of
+/// this analysis is to propagate those known layouts to all their producers and
+/// (other) consumers.
+class LayoutInfoPropagation
+    : public SparseBackwardDataFlowAnalysis<LayoutInfoLattice> {
 private:
-  void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<SGMapLattice *> operands,
-                   ArrayRef<const SGMapLattice *> results);
+  void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
+                   ArrayRef<const LayoutInfoLattice *> results);
 
-  void visitStoreNdOp(xegpu::StoreNdOp store, ArrayRef<SGMapLattice *> operands,
-                      ArrayRef<const SGMapLattice *> results);
+  void visitStoreNdOp(xegpu::StoreNdOp store,
+                      ArrayRef<LayoutInfoLattice *> operands,
+                      ArrayRef<const LayoutInfoLattice *> results);
 
   void visitStoreScatterOp(xegpu::StoreScatterOp storeScatter,
-                           ArrayRef<SGMapLattice *> operands,
-                           ArrayRef<const SGMapLattice *> results);
+                           ArrayRef<LayoutInfoLattice *> operands,
+                           ArrayRef<const LayoutInfoLattice *> results);
 
-  void visitLoadNdOp(xegpu::LoadNdOp load, ArrayRef<SGMapLattice *> operands,
-                     ArrayRef<const SGMapLattice *> results);
+  void visitLoadNdOp(xegpu::LoadNdOp load,
+                     ArrayRef<LayoutInfoLattice *> operands,
+                     ArrayRef<const LayoutInfoLattice *> results);
 
   void visitLoadGatherOp(xegpu::LoadGatherOp load,
-                         ArrayRef<SGMapLattice *> operands,
-                         ArrayRef<const SGMapLattice *> results);
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results);
 
   void visitTransposeOp(vector::TransposeOp transpose,
-                        ArrayRef<SGMapLattice *> operands,
-                        ArrayRef<const SGMapLattice *> results);
+                        ArrayRef<LayoutInfoLattice *> operands,
+                        ArrayRef<const LayoutInfoLattice *> results);
 
   void visitVectorBitcastOp(vector::BitCastOp bitcast,
-                            ArrayRef<SGMapLattice *> operands,
-                            ArrayRef<const SGMapLattice *> results);
+                            ArrayRef<LayoutInfoLattice *> operands,
+                            ArrayRef<const LayoutInfoLattice *> results);
 
   void visitCreateDescOp(xegpu::CreateDescOp createDesc,
-                         ArrayRef<SGMapLattice *> operands,
-                         ArrayRef<const SGMapLattice *> results);
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results);
 
   void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset,
-                             ArrayRef<SGMapLattice *> operands,
-                             ArrayRef<const SGMapLattice *> results);
+                             ArrayRef<LayoutInfoLattice *> operands,
+                             ArrayRef<const LayoutInfoLattice *> results);
 
   void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction,
-                                   ArrayRef<SGMapLattice *> operands,
-                                   ArrayRef<const SGMapLattice *> results);
+                                   ArrayRef<LayoutInfoLattice *> operands,
+                                   ArrayRef<const LayoutInfoLattice *> results);
 
 public:
-  SGMapPropagation(DataFlowSolver &solver, SymbolTableCollection &symbolTable)
+  LayoutInfoPropagation(DataFlowSolver &solver,
+                        SymbolTableCollection &symbolTable)
       : SparseBackwardDataFlowAnalysis(solver, symbolTable) {}
   using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis;
 
-  LogicalResult visitOperation(Operation *op, ArrayRef<SGMapLattice *> operands,
-                               ArrayRef<const SGMapLattice *> results) override;
+  LogicalResult
+  visitOperation(Operation *op, ArrayRef<LayoutInfoLattice *> operands,
+                 ArrayRef<const LayoutInfoLattice *> results) override;
 
   void visitBranchOperand(OpOperand &operand) override {};
 
   void visitCallOperand(OpOperand &operand) override {};
 
   void visitExternalCall(CallOpInterface call,
-                         ArrayRef<SGMapLattice *> operands,
-                         ArrayRef<const SGMapLattice *> results) override {};
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results) override {
+  };
 
-  void setToExitState(SGMapLattice *lattice) override {
-    (void)lattice->meet(SGMap());
+  void setToExitState(LayoutInfoLattice *lattice) override {
+    (void)lattice->meet(LayoutInfo());
   }
 };
 } // namespace
 
-LogicalResult
-SGMapPropagation::visitOperation(Operation *op,
-                                 ArrayRef<SGMapLattice *> operands,
-                                 ArrayRef<const SGMapLattice *> results) {
+LogicalResult LayoutInfoPropagation::visitOperation(
+    Operation *op, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
   TypeSwitch<Operation *>(op)
       .Case<xegpu::DpasOp>(
           [&](auto dpasOp) { visitDpasOp(dpasOp, operands, results); })
@@ -315,8 +352,8 @@ SGMapPropagation::visitOperation(Operation *op,
       .Case<xegpu::UpdateNdOffsetOp>([&](auto updateNdOffsetOp) {
         visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results);
       })
-      /// No need to propagate the layout to operands in CreateNdDescOp because
-      /// they are scalars (offsets, sizes, etc.).
+      // No need to propagate the layout to operands in CreateNdDescOp because
+      // they are scalars (offsets, sizes, etc.).
       .Case<xegpu::CreateNdDescOp>([&](auto createNdDescOp) {})
       .Case<vector::TransposeOp>([&](auto transposeOp) {
         visitTransposeOp(transposeOp, operands, results);
@@ -327,245 +364,251 @@ SGMapPropagation::visitOperation(Operation *op,
       .Case<vector::MultiDimReductionOp>([&](auto reductionOp) {
         visitVectorMultiReductionOp(reductionOp, operands, results);
       })
-      /// All other ops.
+      // All other ops.
       .Default([&](Operation *op) {
-        for (const SGMapLattice *r : results) {
-          for (SGMapLattice *operand : operands) {
-            /// Propagate the layout of the result to the operand.
+        for (const LayoutInfoLattice *r : results) {
+          for (LayoutInfoLattice *operand : operands) {
+            // Propagate the layout of the result to the operand.
             if (r->getValue().isAssigned())
               meet(operand, *r);
           }
         }
       });
-  /// Add a dependency from each result to program point after the operation.
-  for (const SGMapLattice *r : results) {
-    addDependency(const_cast<SGMapLattice *>(r), getProgramPointAfter(op));
+  // Add a dependency from each result to program point after the operation.
+  for (const LayoutInfoLattice *r : results) {
+    addDependency(const_cast<LayoutInfoLattice *>(r), getProgramPointAfter(op));
   }
   return success();
 }
 
-void SGMapPropagation::visitVectorMultiReductionOp(
-    vector::MultiDimReductionOp reduction, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
-  /// The layout of the result must be present.
-  auto resultLayout = results[0]->getValue();
+void LayoutInfoPropagation::visitVectorMultiReductionOp(
+    vector::MultiDimReductionOp reduction,
+    ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // The layout of the result must be present.
+  LayoutInfo resultLayout = results[0]->getValue();
   if (!resultLayout.isAssigned())
     return;
-  /// We only consider 2D -> 1D reductions at this point.
+  // We only consider 2D -> 1D reductions at this point.
   assert(resultLayout.getLayout().size() == 1 &&
          "Expected 1D layout for reduction result.");
-  /// Given that the result is 1D, the layout of the operand should be 2D with
-  /// default layout.
-  auto operandLayout = getDefaultSgMap(2);
+  // Given that the result is 1D, the layout of the operand should be 2D with
+  // default layout.
+  LayoutInfo operandLayout = getDefaultLayoutInfo(2);
   propagateIfChanged(operands[0], operands[0]->meet(operandLayout));
-  /// Accumulator should have the same layout as the result.
+  // Accumulator should have the same layout as the result.
   propagateIfChanged(operands[1], operands[1]->meet(resultLayout));
 }
 
 /// Propagate the layout of the result tensor to the source tensor descriptor in
 /// UpdateNdOffsetOp.
-void SGMapPropagation::visitUpdateNdOffsetOp(
-    xegpu::UpdateNdOffsetOp updateNdOffset, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
-  /// The layout of the result must be present.
-  auto resultLayout = results[0]->getValue();
+void LayoutInfoPropagation::visitUpdateNdOffsetOp(
+    xegpu::UpdateNdOffsetOp updateNdOffset,
+    ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // The layout of the result must be present.
+  LayoutInfo resultLayout = results[0]->getValue();
   if (!resultLayout.isAssigned())
     return;
-  /// Propagate the layout to the source operand.
+  // Propagate the layout to the source operand.
   propagateIfChanged(operands[0], operands[0]->meet(resultLayout));
 }
 
 /// Set the layouts for DPAS A, B, and C operands.
-void SGMapPropagation::visitDpasOp(xegpu::DpasOp dpas,
-                                   ArrayRef<SGMapLattice *> operands,
-                                   ArrayRef<const SGMapLattice *> results) {
-  auto aTy = dpas.getLhsType();
-  auto bTy = dpas.getRhsType();
+void LayoutInfoPropagation::visitDpasOp(
+    xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  VectorType aTy = dpas.getLhsType();
+  VectorType bTy = dpas.getRhsType();
   propagateIfChanged(operands[0],
-                     operands[0]->meet(getSGMapForDPASOperand(aTy, 0)));
+                     operands[0]->meet(getLayoutInfoForDPASOperand(aTy, 0)));
   propagateIfChanged(operands[1],
-                     operands[1]->meet(getSGMapForDPASOperand(bTy, 1)));
+                     operands[1]->meet(getLayoutInfoForDPASOperand(bTy, 1)));
   if (operands.size() > 2) {
-    auto cTy = dpas.getAccType();
+    VectorType cTy = dpas.getAccType();
     propagateIfChanged(operands[2],
-                       operands[2]->meet(getSGMapForDPASOperand(cTy, 2)));
+                       operands[2]->meet(getLayoutInfoForDPASOperand(cTy, 2)));
   }
 }
 
 /// Set the layout for the value and tensor descriptor operands in StoreNdOp.
-void SGMapPropagation::visitStoreNdOp(xegpu::StoreNdOp store,
-                                      ArrayRef<SGMapLattice *> operands,
-                                      ArrayRef<const SGMapLattice *> results) {
-  auto storeLayout = getDefaultSgMap(store.getValueType());
-  /// Both operands should have the same layout
-  for (SGMapLattice *operand : operands) {
+void LayoutInfoPropagation::visitStoreNdOp(
+    xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  LayoutInfo storeLayout = getDefaultLayoutInfo(store.getValueType());
+  // Both operands should have the same layout
+  for (LayoutInfoLattice *operand : operands) {
     propagateIfChanged(operand, operand->meet(storeLayout));
   }
 }
 
 /// Propagate the layout of the value to the tensor descriptor operand in
 /// LoadNdOp.
-void SGMapPropagation::visitLoadNdOp(xegpu::LoadNdOp load,
-                                     ArrayRef<SGMapLattice *> operands,
-                                     ArrayRef<const SGMapLattice *> results) {
-  auto valueLayout = results[0]->getValue();
-  /// Need the layout of the value to propagate to the tensor descriptor.
+void LayoutInfoPropagation::visitLoadNdOp(
+    xegpu::LoadNdOp load, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  LayoutInfo valueLayout = results[0]->getValue();
+  // Need the layout of the value to propagate to the tensor descriptor.
   if (!valueLayout.isAssigned())
     return;
-  SGMap tensorDescLayout = valueLayout;
-  /// LoadNdOp has the transpose effect. However, at the stage of this analysis
-  /// this effect is not expected and should be abstracted away. Emit a warning.
+  LayoutInfo tensorDescLayout = valueLayout;
+  // LoadNdOp has the transpose effect. However, at the stage of this analysis
+  // this effect is not expected and should be abstracted away. Emit a warning.
   if (auto transpose = load.getTranspose()) {
     load.emitWarning("Transpose effect is not expected for LoadNdOp at "
-                     "SGMapPropagation stage.");
+                     "LayoutInfoPropagation stage.");
     tensorDescLayout = valueLayout.getTransposedLayout(transpose.value());
   }
-  /// Propagate the new layout to the tensor descriptor operand.
+  // Propagate the new layout to the tensor descriptor operand.
   propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
 }
 
 /// For vector::TransposeOp, the layout of the result is transposed and
 /// propagated to the operand.
-void SGMapPropagation::visitTransposeOp(
-    vector::TransposeOp transpose, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
-  /// Need the layout of transpose result to propagate to the operands.
-  auto resultLayout = results[0]->getValue();
+void LayoutInfoPropagation::visitTransposeOp(
+    vector::TransposeOp transpose, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // Need the layout of transpose result to propagate to the operands.
+  LayoutInfo resultLayout = results[0]->getValue();
   if (!resultLayout.isAssigned())
     return;
-  auto newLayout = resultLayout.getTransposedLayout(transpose.getPermutation());
-  /// Propagate the new layout to the vector operand.
+  LayoutInfo newLayout =
+      resultLayout.getTransposedLayout(transpose.getPermutation());
+  // Propagate the new layout to the vector operand.
   propagateIfChanged(operands[0], operands[0]->meet(newLayout));
 }
 
-/// For vector::BitCastOp, the wi_data of the source layout is changed based on
-/// the bit width of the source and result types.
-void SGMapPropagation::visitVectorBitcastOp(
-    vector::BitCastOp bitcast, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
-  /// Need the layout of bitcast result to propagate to the operands.
-  auto resultLayout = results[0]->getValue();
+/// For vector::BitCastOp, the lane_data of the source layout is changed based
+/// on the bit width of the source and result types.
+void LayoutInfoPropagation::visitVectorBitcastOp(
+    vector::BitCastOp bitcast, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // Need the layout of bitcast result to propagate to the operands.
+  LayoutInfo resultLayout = results[0]->getValue();
   if (!resultLayout.isAssigned())
     return;
-  auto inElemTyBitWidth =
+  int inElemTyBitWidth =
       bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth();
-  auto outElemTyBitWidth =
+  int outElemTyBitWidth =
       bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
 
-  /// WiLayout does not change.
-  const WiLayout &newWiLayout = resultLayout.getLayout();
-  const WiData &currData = resultLayout.getData();
-  WiData newWiData;
-  /// It's a widening bitcast
+  // LaneLayout does not change.
+  const LaneLayout &newLaneLayout = resultLayout.getLayout();
+  const LaneData &currData = resultLayout.getData();
+  LaneData newLaneData;
+  // It's a widening bitcast
   if (inElemTyBitWidth < outElemTyBitWidth) {
-    auto ratio = outElemTyBitWidth / inElemTyBitWidth;
-    newWiData = resultLayout.getData()[0] == 1
-                    ? WiData({1, currData[1] * ratio})
-                    : WiData({currData[0] * ratio, 1});
+    int ratio = outElemTyBitWidth / inElemTyBitWidth;
+    newLaneData = resultLayout.getData()[0] == 1
+                      ? LaneData({1, currData[1] * ratio})
+                      : LaneData({currData[0] * ratio, 1});
   } else {
-    /// It's a narrowing bitcast
-    auto ratio = inElemTyBitWidth / outElemTyBitWidth;
-    newWiData = resultLayout.getData()[0] == 1
-                    ? WiData({1, currData[1] / ratio})
-                    : WiData({currData[0] / ratio, 1});
+    // It's a narrowing bitcast
+    int ratio = inElemTyBitWidth / outElemTyBitWidth;
+    newLaneData = resultLayout.getData()[0] == 1
+                      ? LaneData({1, currData[1] / ratio})
+                      : LaneData({currData[0] / ratio, 1});
   }
 
   propagateIfChanged(operands[0],
-                     operands[0]->meet(SGMap(newWiLayout, newWiData)));
+                     operands[0]->meet(LayoutInfo(newLaneLayout, newLaneData)));
 }
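// Example (editorial illustration): for a widening bitcast
//   vector<16x32xi16> -> vector<16x16xi32>
// whose result has lane_data [1, 1], ratio = 32 / 16 = 2, so the source
// operand is assigned lane_data [1, 2] (each i32 owned by a lane covers two
// i16 source elements).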
 
 /// Propagate the layout of the result to the tensor descriptor and mask
 /// operands in LoadGatherOp.
-void SGMapPropagation::visitLoadGatherOp(
-    xegpu::LoadGatherOp load, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
-  auto valueLayout = results[0]->getValue();
-  /// Need the layout of the value to propagate to the tensor descriptor.
+void LayoutInfoPropagation::visitLoadGatherOp(
+    xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  LayoutInfo valueLayout = results[0]->getValue();
+  // Need the layout of the value to propagate to the tensor descriptor.
   if (!valueLayout.isAssigned())
     return;
 
-  SGMap tensorDescLayout = valueLayout;
+  LayoutInfo tensorDescLayout = valueLayout;
   if (load.getTranspose()) {
-    /// LoadGatherOp has the transpose effect. However, at the stage of this
-    /// analyis this effect is not expected and should be abstracted away. Emit
-    /// a warning.
+    // LoadGatherOp has the transpose effect. However, at the stage of this
+    // analysis this effect is not expected and should be abstracted away. Emit
+    // a warning.
     load.emitWarning("Transpose effect is not expected for LoadGatherOp at "
-                     "SGMapPropagation stage.");
+                     "LayoutInfoPropagation stage.");
     tensorDescLayout = valueLayout.getTransposedLayout({1, 0});
   }
-  /// Mask operand should have 1D default layout.
-  auto maskLayout = getDefaultSgMap(1);
-  /// Propagate the new layout to the tensor descriptor operand.
+  // Mask operand should have 1D default layout.
+  LayoutInfo maskLayout = getDefaultLayoutInfo(1);
+  // Propagate the new layout to the tensor descriptor operand.
   propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
-  /// Propagate the new layout to the mask operand.
+  // Propagate the new layout to the mask operand.
   propagateIfChanged(operands[1], operands[1]->meet(maskLayout));
 }
 
 /// Propagate the layout of the descriptor to the vector offset operand in
 /// CreateDescOp.
-void SGMapPropagation::visitCreateDescOp(
-    xegpu::CreateDescOp createDesc, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
-  auto descLayout = results[0]->getValue();
-  /// Need the layout of the descriptor to propagate to the operands.
+void LayoutInfoPropagation::visitCreateDescOp(
+    xegpu::CreateDescOp createDesc, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  LayoutInfo descLayout = results[0]->getValue();
+  // Need the layout of the descriptor to propagate to the operands.
   if (!descLayout.isAssigned())
     return;
-  /// For offset operand propagate 1D default layout.
-  SGMap layout = getDefaultSgMap(1);
+  // For offset operand propagate 1D default layout.
+  LayoutInfo layout = getDefaultLayoutInfo(1);
   propagateIfChanged(operands[1], operands[1]->meet(layout));
 }
 
 /// Set the layout for the value, tensor descriptor, and mask operands in the
 /// StoreScatterOp.
-void SGMapPropagation::visitStoreScatterOp(
-    xegpu::StoreScatterOp storeScatter, ArrayRef<SGMapLattice *> operands,
-    ArrayRef<const SGMapLattice *> results) {
-  /// Currently, for 2D StoreScatterOp we expect that the height dimension of
-  /// the tensor descriptor is evenly divisible by the subgroup size.
-  /// TODO: Add support for other 2D shapes.
-  auto tdescShape = storeScatter.getTensorDescType().getShape();
-  if (tdescShape.size() > 1 && tdescShape[0] % subgroupSize != 0) {
-    storeScatter.emitError("Height dimension of the tensor descriptor should "
-                           "be evenly divisible by the subgroup size.");
-    return;
-  }
-  auto valueLayout = getDefaultSgMap(storeScatter.getValueType());
-  SGMap storeScatterLayout = valueLayout;
+void LayoutInfoPropagation::visitStoreScatterOp(
+    xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // Currently, for 2D StoreScatterOp we expect that the height dimension of
+  // the tensor descriptor is equal to the subgroup size. This is ensured by
+  // the op verifier.
+  ArrayRef<int64_t> tdescShape = storeScatter.getTensorDescType().getShape();
+  if (tdescShape.size() > 1)
+    assert(
+        tdescShape[0] == subgroupSize &&
+        "Expected the first dimension of 2D tensor descriptor to be equal to "
+        "subgroup size.");
+
+  LayoutInfo valueLayout = getDefaultLayoutInfo(storeScatter.getValueType());
+  LayoutInfo storeScatterLayout = valueLayout;
   if (storeScatter.getTranspose()) {
-    /// StoreScatteOp allows transpose effect. However, at the stage of this
-    /// analyis this effect is not expected and should be abstracted away. Emit
-    /// a warning.
+    // StoreScatterOp allows a transpose effect. However, at the stage of this
+    // analysis this effect is not expected and should be abstracted away. Emit
+    // a warning.
     storeScatter.emitWarning("Transpose effect is not expected for "
-                             "StoreScatterOp at SGMapPropagation stage.");
+                             "StoreScatterOp at LayoutInfoPropagation stage.");
     storeScatterLayout = valueLayout.getTransposedLayout({1, 0});
   }
-  /// Propagate the value layout.
+  // Propagate the value layout.
   propagateIfChanged(operands[0], operands[0]->meet(valueLayout));
-  /// Propagate the tensor descriptor layout.
+  // Propagate the tensor descriptor layout.
   propagateIfChanged(operands[1], operands[1]->meet(storeScatterLayout));
-  /// Use default 1D layout for mask operand.
-  auto maskLayout = getDefaultSgMap(1);
+  // Use default 1D layout for mask operand.
+  LayoutInfo maskLayout = getDefaultLayoutInfo(1);
   propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
 }
 
 namespace {
 
-///===----------------------------------------------------------------------===///
-/// RunSGMapPropagation
-///===----------------------------------------------------------------------===///
+//===----------------------------------------------------------------------===//
+// RunLayoutInfoPropagation
+//===----------------------------------------------------------------------===//
 
-/// Driver class for running the SGMapPropagation analysis.
-class RunSGMapPropagation {
+/// Driver class for running the LayoutInfoPropagation analysis.
+class RunLayoutInfoPropagation {
 public:
-  RunSGMapPropagation(Operation *op) : target(op) {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RunLayoutInfoPropagation)
+
+  RunLayoutInfoPropagation(Operation *op) : target(op) {
     SymbolTableCollection symbolTable;
     solver.load<DeadCodeAnalysis>();
     solver.load<SparseConstantPropagation>();
-    solver.load<SGMapPropagation>(symbolTable);
+    solver.load<LayoutInfoPropagation>(symbolTable);
     (void)solver.initializeAndRun(op);
   }
 
-  SGMap getSGMap(Value val);
+  LayoutInfo getLayoutInfo(Value val);
 
   void printAnalysisResult(llvm::raw_ostream &os);
 
@@ -575,21 +618,21 @@ class RunSGMapPropagation {
 };
 } // namespace
 
-SGMap RunSGMapPropagation::getSGMap(Value val) {
-  auto *state = solver.lookupState<SGMapLattice>(val);
+LayoutInfo RunLayoutInfoPropagation::getLayoutInfo(Value val) {
+  auto *state = solver.lookupState<LayoutInfoLattice>(val);
   if (!state)
     return {};
   return state->getValue();
 }
 
-void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
+void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
   auto printFunctionResult = [&](FunctionOpInterface funcOp) {
     os << "function: " << funcOp.getName() << ":\n";
     // Function arguments
-    for (auto arg : funcOp.getArguments()) {
-      auto layout = getSGMap(arg);
+    for (BlockArgument arg : funcOp.getArguments()) {
+      LayoutInfo layout = getLayoutInfo(arg);
       os << "argument: " << arg << "\n";
-      os << "sg_map  : ";
+      os << "layout  : ";
       layout.print(os);
       os << "\n";
     }
@@ -599,16 +642,16 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
       if (op->getResults().empty())
         return;
       os << "op    : ";
-      /// For control-flow ops, print the op name only.
+      // For control-flow ops, print the op name only.
       if (isa<BranchOpInterface>(op) || isa<RegionBranchOpInterface>(op))
         os << op->getName();
       else
         op->print(os);
       os << "\n";
-      /// Print the sg_map for each result.
+      // Print the layout for each result.
       for (auto [i, r] : llvm::enumerate(op->getResults())) {
-        auto layout = getSGMap(r);
-        os << "sg_map for result #" << i << ": ";
+        LayoutInfo layout = getLayoutInfo(r);
+        os << "layout for result #" << i << ": ";
         layout.print(os);
         os << "\n";
       }
@@ -620,19 +663,757 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
     for (auto funcOp : modOp.getOps<FunctionOpInterface>()) {
       funcOps.push_back(funcOp);
     }
-    /// Collect all GpuFuncOps in the module.
+    // Collect all GpuFuncOps in the module.
     for (auto gpuModOp : modOp.getOps<gpu::GPUModuleOp>()) {
       for (auto gpuFuncOp : gpuModOp.getOps<FunctionOpInterface>()) {
         funcOps.push_back(gpuFuncOp);
       }
     }
   }
-  /// Print the analysis result for each function.
-  for (auto funcOp : funcOps) {
+  // Print the analysis result for each function.
+  for (FunctionOpInterface funcOp : funcOps) {
     printFunctionResult(funcOp);
   }
 }
 
+namespace {
+
+//===----------------------------------------------------------------------===//
+// LayoutAttrAssignment
+//===----------------------------------------------------------------------===//
+
+/// This class is responsible for assigning the layout attributes to the ops and
+/// their users based on the layout propagation analysis result.
+class LayoutAttrAssignment {
+public:
+  LayoutAttrAssignment(Operation *top,
+                       function_ref<LayoutInfo(Value)> getLayout)
+      : getAnalysisResult(getLayout), top(top) {}
+
+  LogicalResult run();
+
+private:
+  LogicalResult assign(Operation *op);
+  void assignToUsers(Value v, xegpu::LayoutAttr layout);
+  xegpu::LayoutAttr getLayoutAttrForValue(Value v);
+  LogicalResult resolveConflicts();
+  // Callable to get the layout of a value based on the layout propagation
+  // analysis.
+  function_ref<LayoutInfo(Value)> getAnalysisResult;
+  Operation *top;
+};
+
+} // namespace
+
+/// Helper to assign the layout attribute to the users of the value.
+void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) {
+  for (OpOperand &user : v.getUses()) {
+    Operation *owner = user.getOwner();
+    unsigned operandNumber = user.getOperandNumber();
+    // Use a generic name for ease of querying the layout attribute later.
+    std::string attrName =
+        operandLayoutNamePrefix + std::to_string(operandNumber);
+    owner->setAttr(attrName, layout);
+  }
+}
+
+/// Convert the layout assigned to a value to xegpu::LayoutAttr.
+xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) {
+  LayoutInfo layout = getAnalysisResult(v);
+  if (!layout.isAssigned())
+    return {};
+  SmallVector<int, 2> laneLayout, laneData;
+  for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
+                                             layout.getDataAsArrayRef())) {
+    laneLayout.push_back(static_cast<int>(layout));
+    laneData.push_back(static_cast<int>(data));
+  }
+  return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData);
+}
+
+/// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned
+/// based on the layout propagation analysis result.
+LogicalResult LayoutAttrAssignment::assign(Operation *op) {
+  // For function ops, propagate the function argument layout to the users.
+  if (auto func = dyn_cast<FunctionOpInterface>(op)) {
+    for (BlockArgument arg : func.getArguments()) {
+      xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(arg);
+      if (layoutInfo) {
+        assignToUsers(arg, layoutInfo);
+      }
+    }
+    return success();
+  }
+  // If no results, move on.
+  if (op->getNumResults() == 0)
+    return success();
+  // If all the results are scalars, move on.
+  if (llvm::all_of(op->getResultTypes(),
+                   [](Type t) { return t.isIntOrIndexOrFloat(); }))
+    return success();
+  // If the op has more than one result and at least one result is a tensor
+  // descriptor, exit. This case is not supported yet.
+  // TODO: Support this case.
+  if (op->getNumResults() > 1 && llvm::any_of(op->getResultTypes(), [](Type t) {
+        return isa<xegpu::TensorDescType>(t);
+      })) {
+    LLVM_DEBUG(
+        DBGS() << op->getName()
+               << " op has more than one result and at least one is a tensor "
+                  "descriptor. This case is not handled.\n");
+    return failure();
+  }
+  // If the result is a tensor descriptor, attach the layout to the tensor
+  // descriptor itself.
+  if (auto tensorDescTy =
+          dyn_cast<xegpu::TensorDescType>(op->getResultTypes()[0])) {
+    xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(op->getResult(0));
+    if (!layoutInfo) {
+      LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
+      return failure();
+    }
+
+    // Clone the op, attach the layout to the result tensor descriptor, and
+    // remove the original op.
+    OpBuilder builder(op);
+    Operation *newOp = builder.clone(*op);
+    auto newTensorDescTy = xegpu::TensorDescType::get(
+        tensorDescTy.getContext(), tensorDescTy.getShape(),
+        tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layoutInfo);
+    newOp->getResult(0).setType(newTensorDescTy);
+    op->replaceAllUsesWith(newOp->getResults());
+    op->erase();
+    return success();
+  }
+  // Otherwise simply attach the layout to the op itself.
+  for (auto [i, r] : llvm::enumerate(op->getResults())) {
+    xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(r);
+    if (layoutInfo) {
+      std::string attrName = resultLayoutNamePrefix + std::to_string(i);
+      op->setAttr(attrName, layoutInfo);
+      // Attach the layout attribute to the users of the result.
+      assignToUsers(r, layoutInfo);
+    }
+  }
+  return success();
+}
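// Illustration of the resulting attribute naming (the printed form of the
// layout attribute is assumed; editorial example, not from the commit): a
// value with lane_layout [1, 16] and lane_data [1, 1] results in
//   layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
// on its defining op, and each user op gets the same layout attached as
//   layout_operand_<operand number>
// on the operand that consumes the value.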
+
+/// Walk the IR and attach xegpu::LayoutAttr to all ops and their users.
+LogicalResult LayoutAttrAssignment::run() {
+  auto walkResult = top->walk([&](Operation *op) {
+    if (failed(assign(op)))
+      return WalkResult::interrupt();
+    return WalkResult::advance();
+  });
+
+  if (walkResult.wasInterrupted())
+    return failure();
+
+  return resolveConflicts();
+}
+
+/// TODO: Implement layout conflict resolution. This must mainly ensure two
+/// things:
+/// 1) Is a given layout supported by the op? (This requires querying the
+///    target HW info.) If not, can we achieve this layout using a layout
+///    conversion?
+/// 2) Do all the operands have the required layout? If not, can the mismatch
+///    be resolved using a layout conversion?
+LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); }
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// SIMT Distribution Patterns
+//===----------------------------------------------------------------------===//
+
+/// Helper function to get the distributed vector type for a source vector
+/// type according to the lane_layout. We simply divide each dimension of the
+/// tensor descriptor shape by the corresponding lane_layout dimension. If
+/// array_length > 1, it is appended to the front of the distributed shape.
+/// NOTE: This is the vector type that will be returned by the
+/// gpu.warp_execute_on_lane0 op.
+///
+/// Examples:
+/// | original vector shape | lane_layout | distributed vector shape |
+/// |-----------------------|-------------|--------------------------|
+/// | 32x16                 | [1, 16]     | 32x1                     |
+/// | 32x16                 | [2, 8]      | 16x2                     |
+/// | 2x32x16               | [1, 16]     | 2x32x1                   |
+static FailureOr<VectorType>
+getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
+                                VectorType originalType) {
+  if (!layout)
+    return failure();
+
+  auto laneLayout = layout.getLaneLayout().asArrayRef();
+  assert(originalType.getShape().size() >= laneLayout.size() &&
+         "Rank of the original vector type should be greater than or equal "
+         "to the size of the lane layout to distribute the vector type.");
+  SmallVector<int64_t> distributedShape(originalType.getShape());
+  // Only distribute the last `laneLayout.size()` dimensions. The remaining
+  // dimensions are not distributed.
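+  // E.g., for a 2x32x16 vector and lane_layout [1, 16], only the trailing two
+  // dimensions are divided, yielding a distributed shape of 2x32x1.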
+  unsigned distributionStart = originalType.getRank() - laneLayout.size();
+  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+    if (i < distributionStart) {
+      continue;
+    }
+    // Check if the dimension can be distributed evenly.
+    if (dim % laneLayout[i - distributionStart] != 0)
+      return failure();
+    distributedShape[i] = dim / laneLayout[i - distributionStart];
+  }
+  return VectorType::get(distributedShape, originalType.getElementType());
+}
+
+// Drop the layout attribute from the tensor descriptor type if layout is
+// present.
+static xegpu::TensorDescType dropLayouts(xegpu::TensorDescType tensorDesc) {
+  if (tensorDesc.getLayoutAttr() == xegpu::LayoutAttr())
+    return tensorDesc;
+
+  return xegpu::TensorDescType::get(
+      tensorDesc.getContext(), tensorDesc.getShape(),
+      tensorDesc.getElementType(), tensorDesc.getEncoding(),
+      xegpu::LayoutAttr());
+}
+
+/// Helper function to resolve types if the distributed type out of
+/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
+/// Example 1:
+///   distributed type: vector<8x1xf32>
+///   expected type: vector<8xf32>
+///   resolved using,
+///   %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
+/// Example 2:
+///   distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
+///   expected type: xegpu.tensor_desc<8x16xf32>
+///   resolved using,
+///   %0 = unrealized_conversion_cast %1 :
+///      xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
+///      xegpu.tensor_desc<8x16xf32>
+template <typename T>
+static Value resolveDistributedTy(Value orig, T expected,
+                                  PatternRewriter &rewriter) {
+  // If orig and expected types are the same, return orig.
+  if (orig.getType() == expected)
+    return orig;
+  // If orig is a vector type, create a shape cast op to reconcile the types.
+  if (auto origVecType = isa<VectorType>(orig.getType())) {
+    auto castOp =
+        rewriter.create<vector::ShapeCastOp>(orig.getLoc(), expected, orig);
+    return castOp.getResult();
+  }
+  // If orig is a tensor descriptor type, create an unrealized conversion cast
+  // op to reconcile the types.
+  if (auto origTensorDescTy = isa<xegpu::TensorDescType>(orig.getType())) {
+    auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
+                                                              expected, orig);
+    return castOp.getResult(0);
+  }
+  llvm_unreachable("Unsupported type for reconciliation");
+  return orig;
+}
+
+/// Helper function to filter out the temporary layout attributes attached
+/// during the layout assignment process. These are not needed after going to
+/// SIMT.
+static SmallVector<NamedAttribute>
+removeTemporaryLayoutAttributes(ArrayRef<NamedAttribute> attrs) {
+  SmallVector<NamedAttribute> newAttrs;
+  for (NamedAttribute attr : attrs) {
+    if (attr.getName().strref().contains(operandLayoutNamePrefix) ||
+        attr.getName().strref().contains(resultLayoutNamePrefix)) {
+      continue;
+    }
+    newAttrs.push_back(attr);
+  }
+  return newAttrs;
+}
+
+/// Helper function to check if the layout is packed. A layout is packed if it
+/// is 2D and lane_data[0] != 1 (i.e., data is packed from the column
+/// dimension).
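+/// E.g., lane_data = [2, 1] is a packed layout, whereas lane_data = [1, 1] is
+/// not.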
+static bool hasPackedLayout(xegpu::LayoutAttr layout) {
+  if (layout == xegpu::LayoutAttr())
+    return false;
+  DenseI32ArrayAttr laneData = layout.getLaneData();
+  if (!laneData || laneData.size() != 2)
+    return false;
+  return laneData.asArrayRef()[0] != 1;
+}
+
+/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
+/// of the original GPUFuncOp to the new GPUFuncOp such that the entire body is
+/// contained within a WarpExecuteOnLane0Op.
+/// Example:
+///
+/// ```
+///   gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+///     ...
+///     ...
+///     gpu.return %result: vector<8x16xf32>
+///   }
+/// ```
+/// To
+/// ```
+///   gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+///     %laneid = gpu.lane_id : index
+///     %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
+///       ...
+///       ...
+///       gpu.yield %result: vector<8x16xf32>
+///     }
+///     gpu.return %0 : vector<8x16xf32>
+///   }
+/// ```
+struct MoveFuncBodyToWarpExecuteOnLane0
+    : public OpRewritePattern<gpu::GPUFuncOp> {
+  using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
+                                PatternRewriter &rewriter) const override {
+    // If the function only contains a single void return, skip.
+    if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+          return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
+        }))
+      return failure();
+    // If the function body already contains a warp_execute_on_lane0, skip.
+    if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+          return isa<gpu::WarpExecuteOnLane0Op>(op);
+        }))
+      return failure();
+    // Create a new function with the same signature.
+    auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
+        gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
+    // Create a WarpExecuteOnLane0Op with the same arguments and results as
+    // the original gpuFuncOp.
+    rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
+    auto laneId = rewriter.create<gpu::LaneIdOp>(
+        newGpuFunc.getLoc(), rewriter.getIndexType(),
+        /** upperBound = **/ mlir::IntegerAttr());
+    ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
+    auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
+        laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
+        newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
+    Block &warpBodyBlock = warpOp.getBodyRegion().front();
+    // Replace the ReturnOp of the original gpu function with a YieldOp.
+    auto origReturnOp =
+        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
+    rewriter.setInsertionPointAfter(origReturnOp);
+    rewriter.create<gpu::YieldOp>(origReturnOp.getLoc(),
+                                  origReturnOp.getOperands());
+    rewriter.eraseOp(origReturnOp);
+    // Move the original function body to the WarpExecuteOnLane0Op body.
+    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
+                                warpOp.getBodyRegion().begin());
+    rewriter.eraseBlock(&warpBodyBlock);
+    // Insert a new ReturnOp after the WarpExecuteOnLane0Op.
+    rewriter.setInsertionPointAfter(warpOp);
+    rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
+    rewriter.replaceOp(gpuFuncOp, newGpuFunc);
+    return success();
+  }
+};
+
+/// Distribute a create_nd_tdesc feeding into vector.yield op of the enclosing
+/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will
+/// still contain the original op that will not be used by the yield op (and
+/// should be cleaned up later). The yield op will bypass the create_nd_tdesc's
+/// arguments. Tensor descriptor shape is not distributed because it is a
+/// uniform value across all work items within the subgroup. However, the
+/// layout information is dropped in the new tensor descriptor type.
+///
+/// Example:
+///
+/// ```
+///   #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
+///                   (!xegpu.tensor_desc<4x8xf32, #lo0>) {
+///     ...
+///     %td = xegpu.create_nd_tdesc %arg0[0, 0]
+///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #lo0>
+///     vector.yield %td
+///   }
+/// ```
+/// To
+/// ```
+///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
+///     ...
+///     %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #lo0>
+///     vector.yield %arg0, %dead
+///   }
+///   %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
+///                                 -> !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override {
+    OpOperand *operand =
+        getWarpResult(subgroupOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
+    if (!operand)
+      return rewriter.notifyMatchFailure(
+          subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
+    auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+    unsigned operandIdx = operand->getOperandNumber();
+
+    xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
+    if (!layout)
+      return rewriter.notifyMatchFailure(
+          descOp, "the tensor descriptor lacks layout attribute");
+
+    SmallVector<size_t> newRetIndices;
+    SmallVector<Value> newYieldValues;
+    SmallVector<Type> newYieldTypes;
+
+    for (Value operand : descOp->getOperands()) {
+      newYieldValues.push_back(operand);
+      newYieldTypes.push_back(operand.getType());
+    }
+    rewriter.setInsertionPoint(subgroupOp);
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp, /* new yielded values = */ newYieldValues,
+        /* new yielded types = */ newYieldTypes, newRetIndices);
+
+    SmallVector<Value> newDescOperands;
+    for (size_t i : newRetIndices) {
+      newDescOperands.push_back(newWarpOp.getResult(i));
+    }
+    rewriter.setInsertionPointAfter(newWarpOp);
+    xegpu::TensorDescType distributedTensorDescTy =
+        dropLayouts(descOp.getType()); // Distributed tensor descriptor type
+                                       // does not contain layout info.
+    auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+        newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
+        descOp->getAttrs());
+
+    Value distributedVal = newWarpOp.getResult(operandIdx);
+    rewriter.replaceAllUsesWith(distributedVal, newDescOp);
+    return success();
+  }
+};
+
+/// Distribute a store_nd op at the end of the enclosing
+/// `gpu.warp_execute_on_lane_0` region. Arguments of the store that are
+/// passed through the warp op interface are propagated as returned values.
+/// The source vector is distributed based on the lane layout. Appropriate
+/// cast ops are inserted if the distributed types do not match the expected
+/// xegpu SIMT types.
+///
+/// Example:
+///
+/// ```
+///   #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+///   gpu.warp_execute_on_lane_0(%laneid) -> () {
+///     ...
+///     xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+///                                 !xegpu.tensor_desc<4x8xf32, #lo0>
+///   }
+/// ```
+/// To
+/// ```
+///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
+///   !xegpu.tensor_desc<4x8xf32, #lo0>) {
+///     gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32,
+///     #lo0>
+///   }
+///   %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
+///   %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
+///   #lo0>
+///     -> !xegpu.tensor_desc<4x8xf32>
+///   xegpu.store_nd %0, %1: vector<4xf32>,
+///     !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override {
+    auto yield = cast<gpu::YieldOp>(
+        subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
+    Operation *lastNode = yield->getPrevNode();
+    auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
+    if (!storeOp)
+      return failure();
+
+    xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
+    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+    if (!layout)
+      return rewriter.notifyMatchFailure(
+          storeOp, "the source tensor descriptor lacks layout attribute");
+
+    FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
+        getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
+    if (failed(distributedTypeByWarpOpOrFailure))
+      return rewriter.notifyMatchFailure(storeOp,
+                                         "Failed to distribute the type");
+    VectorType distributedTypeByWarpOp =
+        distributedTypeByWarpOpOrFailure.value();
+
+    SmallVector<size_t> newRetIndices;
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp,
+        /* new yielded values = */
+        ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
+        /* new yielded types = */
+        TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
+        newRetIndices);
+    // Create a new store op outside the warp op with the distributed vector
+    // type. Tensor descriptor is not distributed.
+    rewriter.setInsertionPointAfter(newWarpOp);
+    SmallVector<Value> newStoreOperands;
+
+    // For the value operand, there can be a mismatch between the vector type
+    // distributed by the warp op and the (xegpu-specific) distributed type
+    // supported by the store op. Such a mismatch must be resolved using an
+    // appropriate cast op.
+    FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
+        xegpu::getDistributedVectorType(storeOp.getTensorDescType());
+    if (failed(storeNdDistributedValueTyOrFailure))
+      return rewriter.notifyMatchFailure(
+          storeOp, "Failed to get distributed vector type for the store op");
+    newStoreOperands.push_back(resolveDistributedTy(
+        newWarpOp.getResult(newRetIndices[0]),
+        storeNdDistributedValueTyOrFailure.value(), rewriter));
+    // For the tensor descriptor operand, the layout attribute is dropped after
+    // distribution. Types need to be resolved in this case as well.
+    xegpu::TensorDescType distributedTensorDescTy =
+        dropLayouts(storeOp.getTensorDescType());
+    newStoreOperands.push_back(
+        resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
+                             distributedTensorDescTy, rewriter));
+
+    rewriter.create<xegpu::StoreNdOp>(
+        newWarpOp.getLoc(), TypeRange{}, newStoreOperands,
+        removeTemporaryLayoutAttributes(storeOp->getAttrs()));
+    rewriter.eraseOp(storeOp);
+    return success();
+  }
+};
+
+/// Distribute a load_nd op feeding into the vector.yield op of the enclosing
+/// `gpu.warp_execute_on_lane_0` region and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later). The yield op will
+/// bypass the load's arguments. Only the loaded vector is distributed
+/// according to the lane layout; the tensor descriptor type is not
+/// distributed. Appropriate cast ops are inserted if the distributed types
+/// do not match the expected xegpu SIMT types.
+///
+/// Example:
+///
+/// ```
+///   #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
+///                   (vector<4x1xf32>) {
+///     ...
+///     %ld = xegpu.load_nd %arg0 : !xegpu.tensor_desc<4x8xf32, #lo0> ->
+///       vector<4x8xf32>
+///     gpu.yield %ld
+///   }
+/// ```
+/// To
+/// ```
+///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
+///   !xegpu.tensor_desc<4x8xf32, #lo0>) {
+///     ...
+///     %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #lo0> ->
+///       vector<4x8xf32>
+///     gpu.yield %dead, %arg0
+///   }
+///   %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
+///        #lo0> -> !xegpu.tensor_desc<4x8xf32>
+///   %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
+///   %2 = vector.shape_cast %1: vector<4xf32> to vector<4x1xf32>
+///
+/// ```
+struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override {
+    OpOperand *operand =
+        getWarpResult(subgroupOp, llvm::IsaPred<xegpu::LoadNdOp>);
+    if (!operand)
+      return rewriter.notifyMatchFailure(
+          subgroupOp, "warp result is not a xegpu::LoadNd op");
+
+    auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+    xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
+    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+    if (!layout)
+      return rewriter.notifyMatchFailure(
+          loadOp, "the source tensor descriptor lacks layout attribute");
+
+    unsigned operandIdx = operand->getOperandNumber();
+    VectorType distributedTypeByWarpOp =
+        cast<VectorType>(subgroupOp.getResult(operandIdx).getType());
+
+    SmallVector<size_t> newRetIndices;
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp,
+        /* new yielded values = */ loadOp.getTensorDesc(),
+        /* new yielded types = */ tensorDescTy, newRetIndices);
+
+    // Create a new load op outside the warp op with the distributed vector
+    // type.
+    rewriter.setInsertionPointAfter(newWarpOp);
+    FailureOr<VectorType> loadNdDistValueTyOrFailure =
+        xegpu::getDistributedVectorType(loadOp.getTensorDescType());
+    if (failed(loadNdDistValueTyOrFailure))
+      return rewriter.notifyMatchFailure(
+          loadOp, "Failed to get distributed vector type for the load op");
+    xegpu::TensorDescType distributedTensorDescTy =
+        dropLayouts(loadOp.getTensorDescType()); // Distributed tensor
+                                                 // descriptor type does not
+                                                 // contain layout info.
+    auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+        newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
+        resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]),
+                             distributedTensorDescTy, rewriter),
+        removeTemporaryLayoutAttributes(loadOp->getAttrs()));
+    // Set the packed attribute if the layout requires it.
+    newLoadOp.setPacked(hasPackedLayout(layout));
+    Value distributedVal = newWarpOp.getResult(operandIdx);
+    // There can be a conflict between the vector type distributed by the
+    // warp op and the (xegpu-specific) distributed type supported by the load
+    // op. Resolve these mismatches by inserting a cast.
+    Value tyResolvedVal = resolveDistributedTy(
+        newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
+    rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
+    return success();
+  }
+};
+
+/// Distribute a dpas op feeding into the vector.yield op of the enclosing
+/// `gpu.warp_execute_on_lane_0` region and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later). The yield op will
+/// bypass the dpas's arguments. Appropriate cast ops are inserted if the
+/// distributed types do not match the expected xegpu SIMT types.
+/// Example:
+/// ```
+///   #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
+///   #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]>
+///   #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
+///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
+///                   (vector<8x1xf32>) {
+///     ...
+///     %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> ->
+///       vector<8x16xf32>
+///     gpu.yield %dpas
+///   }
+/// ```
+/// To
+/// ```
+///   %r:3 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>,
+///   vector<8x1xf16>, vector<16x1xf16>) {
+///     ...
+///     %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16>
+///       -> vector<8x16xf32>
+///     gpu.yield %dead, %arg0, %arg1
+///   }
+///   %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16>
+///   %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16>
+///   %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> ->
+///     vector<8xf32>
+///   %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32>
+/// ```
+struct DpasDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override {
+    OpOperand *operand =
+        getWarpResult(subgroupOp, llvm::IsaPred<xegpu::DpasOp>);
+    if (!operand)
+      return rewriter.notifyMatchFailure(subgroupOp,
+                                         "warp result is not a xegpu::Dpas op");
+
+    auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
+    unsigned operandIdx = operand->getOperandNumber();
+    std::string layoutAName =
+        llvm::formatv("{0}{1}", operandLayoutNamePrefix, 0).str();
+    std::string layoutBName =
+        llvm::formatv("{0}{1}", operandLayoutNamePrefix, 1).str();
+    auto layoutCName = llvm::formatv("{0}{1}", resultLayoutNamePrefix, 0).str();
+    xegpu::LayoutAttr layoutA =
+        dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutAName);
+    xegpu::LayoutAttr layoutB =
+        dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutBName);
+    xegpu::LayoutAttr layoutOut =
+        dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutCName);
+    if (!layoutA || !layoutB || !layoutOut)
+      return rewriter.notifyMatchFailure(
+          dpasOp,
+          "the xegpu::Dpas op lacks layout attribute for A, B or output");
+
+    FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
+        getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
+    FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
+        getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
+    FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
+        getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
+    if (failed(distLhsTypeByWarpOpOrFailure) ||
+        failed(distRhsTypeByWarpOpOrFailure) ||
+        failed(distResultTypeByWarpOpOrFailure))
+      return rewriter.notifyMatchFailure(
+          dpasOp,
+          "Failed to distribute the A, B or output types in xegpu::Dpas op");
+
+    llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
+                                               dpasOp.getRhs()};
+    llvm::SmallVector<Type, 3> newYieldTypes{
+        distLhsTypeByWarpOpOrFailure.value(),
+        distRhsTypeByWarpOpOrFailure.value()};
+    // Dpas acc operand is optional.
+    if (dpasOp.getAcc()) {
+      newYieldValues.push_back(dpasOp.getAcc());
+      newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
+    }
+    // Create a new warp op without the dpas.
+    SmallVector<size_t> newRetIndices;
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
+
+    FailureOr<VectorType> expectedDistLhsTyOrFailure =
+        xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
+    FailureOr<VectorType> expectedDistRhsTyOrFailure =
+        xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
+    FailureOr<VectorType> expectedDistResultTyOrFailure =
+        xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
+    if (failed(expectedDistLhsTyOrFailure) ||
+        failed(expectedDistRhsTyOrFailure) ||
+        failed(expectedDistResultTyOrFailure))
+      return rewriter.notifyMatchFailure(
+          dpasOp,
+          "Failed to get distributed vector type for the dpas operands.");
+    // Create a new dpas op outside the warp op.
+    rewriter.setInsertionPointAfter(newWarpOp);
+    SmallVector<Value> newDpasOperands;
+    SmallVector<VectorType> newDpasOperandExpectedTypes;
+
+    // Resolve the distributed types with the original types.
+    newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
+    newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
+    VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
+    if (dpasOp.getAcc())
+      newDpasOperandExpectedTypes.push_back(distributedResultTy);
+
+    for (unsigned i = 0; i < newRetIndices.size(); i++) {
+      newDpasOperands.push_back(
+          resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
+                               newDpasOperandExpectedTypes[i], rewriter));
+    }
+    Value newDpasOp = rewriter.create<xegpu::DpasOp>(
+        newWarpOp->getLoc(), distributedResultTy, newDpasOperands,
+        removeTemporaryLayoutAttributes(dpasOp->getAttrs()));
+    Value distributedVal = newWarpOp.getResult(operandIdx);
+    // Resolve the output type.
+    newDpasOp = resolveDistributedTy(
+        newDpasOp, distResultTypeByWarpOpOrFailure.value(), rewriter);
+    rewriter.replaceAllUsesWith(distributedVal, newDpasOp);
+    return success();
+  }
+};
+
+} // namespace
+
 namespace {
 struct XeGPUSubgroupDistributePass final
     : public xegpu::impl::XeGPUSubgroupDistributeBase<
@@ -646,14 +1427,61 @@ struct XeGPUSubgroupDistributePass final
 };
 } // namespace
 
-void XeGPUSubgroupDistributePass::runOnOperation() {
-  Operation *op = getOperation();
-  RunSGMapPropagation solver(op);
+void xegpu::populateXeGPUSubgroupDistributePatterns(
+    RewritePatternSet &patterns) {
+  patterns.add<CreateNdDescDistribution, StoreNdDistribution,
+               LoadNdDistribution, DpasDistribution>(patterns.getContext());
+}
 
-  // Print the analysis result and exit.
+void XeGPUSubgroupDistributePass::runOnOperation() {
+  auto &analysis = getAnalysis<RunLayoutInfoPropagation>();
+  // Print the analysis result and exit (for testing purposes).
   if (printOnly) {
     auto &os = llvm::outs();
-    solver.printAnalysisResult(os);
+    analysis.printAnalysisResult(os);
+    return;
+  }
+  auto getPropagatedLayout = [&](Value val) {
+    return analysis.getLayoutInfo(val);
+  };
+
+  // Assign xegpu::LayoutAttr to all ops and their users based on the layout
+  // propagation analysis result.
+  LayoutAttrAssignment layoutAssignment(getOperation(), getPropagatedLayout);
+  if (failed(layoutAssignment.run())) {
+    signalPassFailure();
+    return;
+  }
+
+  // Move all operations of a GPU function inside a gpu.warp_execute_on_lane_0
+  // operation.
+  {
+    RewritePatternSet patterns(&getContext());
+    patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+      signalPassFailure();
+      return;
+    }
+  }
+  // Finally, do the SIMD to SIMT distribution.
+  RewritePatternSet patterns(&getContext());
+  xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
+  // TODO: distributionFn and shuffleFn are not used at this point.
+  auto distributionFn = [](Value val) {
+    VectorType vecType = dyn_cast<VectorType>(val.getType());
+    int64_t vecRank = vecType ? vecType.getRank() : 0;
+    OpBuilder builder(val.getContext());
+    if (vecRank == 0)
+      return AffineMap::get(val.getContext());
+    return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
+  };
+  auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
+                      int64_t warpSz) { return Value(); };
+  vector::populatePropagateWarpVectorDistributionPatterns(
+      patterns, distributionFn, shuffleFn);
+  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+    signalPassFailure();
     return;
   }
 }

diff  --git a/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt
new file mode 100644
index 0000000000000..afd8e2d5c4df3
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_mlir_dialect_library(MLIRXeGPUUtils
+  XeGPUUtils.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU/Utils
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRXeGPUDialect
+  )

diff  --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
new file mode 100644
index 0000000000000..6b45ed0ae4ced
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -0,0 +1,85 @@
+//===---- XeGPUUtils.cpp - MLIR Utilities for XeGPUOps   ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utility methods for working with the XeGPU dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include <cstdint>
+#include <numeric>
+
+using namespace mlir;
+
+FailureOr<VectorType>
+mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
+  auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
+  // This only works for subgroup-level layouts, which only have lane_layout
+  // and lane_data and are used to distribute SIMD code into SIMT code.
+  if (!layout || !layout.isSgLayout())
+    return failure();
+
+  SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
+  SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
+  auto tdescShape = tdescTy.getShape();
+  auto elementType = tdescTy.getElementType();
+
+  // Compute sgSize by multiplying the elements of laneLayout.
+  // E.g., for a 2D layout, sgSize = laneLayout[0] * laneLayout[1];
+  // for a 1D layout, sgSize = laneLayout[0].
+  auto sgSize = std::accumulate(laneLayout.begin(), laneLayout.end(), 1,
+                                std::multiplies<int64_t>());
+
+  // Case 1: regular (scattered) loads/stores
+  auto scatterAttr = tdescTy.getEncodingAsScatterTensorDescAttr();
+  if (scatterAttr) {
+    auto chunkSize = scatterAttr.getChunkSize().getInt();
+    // Verify if the first dimension of the tensor descriptor shape is
+    // distributable.
+    assert(tdescShape[0] == laneLayout[0] &&
+           "tensor descriptor shape is not distributable");
+    return VectorType::get({chunkSize}, elementType);
+  }
+
+  // Case 2: block loads/stores
+  // Check if the tensor descriptor shape is distributable.
+  int64_t tensorSize = 1;
+  for (auto [tdescDim, laneDim, laneDataDim] :
+       llvm::zip_equal(tdescShape, laneLayout, laneData)) {
+    assert((tdescDim % (laneDim * laneDataDim) == 0) &&
+           "tensor descriptor shape is not distributable");
+    tensorSize *= tdescDim;
+  }
+  // tensorSize must be adjusted for array_length.
+  tensorSize *= tdescTy.getArrayLength();
+
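+  // For example, a 16x16 block descriptor with lane_layout = [1, 16],
+  // lane_data = [1, 1] and array_length = 1 gives tensorSize = 256 and
+  // sgSize = 16, i.e. each lane gets a 16-element vector fragment.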
+  return VectorType::get({tensorSize / sgSize}, elementType);
+}
+
+FailureOr<VectorType>
+mlir::xegpu::getDistributedVectorType(VectorType originalType,
+                                      xegpu::LayoutAttr layout) {
+  int64_t rank = originalType.getRank();
+  // Distributed vector type is only supported for 1D, 2D and 3D vectors.
+  if (rank < 1 || rank > 3)
+    return failure();
+  ArrayRef<int64_t> shape = originalType.getShape();
+  // arrayLength is 1 for 1D and 2D vectors, and equal to the first dimension
+  // of the 3D vector.
+  int arrayLength = 1;
+  if (rank == 3) {
+    arrayLength = shape[0];
+    shape = shape.drop_front();
+  }
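+  // E.g., a vector<2x16x16xf16> with lane_layout = [1, 16] maps to a 16x16
+  // helper descriptor with array_length = 2, which distributes to
+  // vector<32xf16> per lane.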
+  auto helperTdescTy = xegpu::TensorDescType::get(
+      shape, originalType.getElementType(), arrayLength,
+      /*boundary_check=*/true,
+      /*memory_space=*/xegpu::MemorySpace::Global, layout);
+  return xegpu::getDistributedVectorType(helperTdescTy);
+}

diff  --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
new file mode 100644
index 0000000000000..f8f2cd55c28d0
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
@@ -0,0 +1,162 @@
+// RUN: mlir-opt -xegpu-subgroup-distribute -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: gpu.func @store_nd_1d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) {
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32>
+// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK: xegpu.store_nd %[[CST]], %[[T0]]  : vector<1xf32>, !xegpu.tensor_desc<16xf32>
+// CHECK: gpu.return
+gpu.module @test {
+gpu.func @store_nd_1d(%arg0: memref<16xf32>){
+  %c0 = arith.constant 0 : index
+  %1 = arith.constant dense<1.000000e+00> : vector<16xf32>
+  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+  xegpu.store_nd %1, %0 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+  gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @store_nd_2d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16>
+// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: xegpu.store_nd %[[CST]], %[[T0]]  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+gpu.module @test {
+gpu.func @store_nd_2d(%arg0: memref<16x16xf16>){
+  %c0 = arith.constant 0 : index
+  %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16>
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  gpu.return
+}
+}
+
+
+
+// -----
+// CHECK-LABEL: gpu.func @load_nd_1d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<16xf32> -> vector<1xf32>
+// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK: xegpu.store_nd %[[T1]], %[[T2]]  : vector<1xf32>, !xegpu.tensor_desc<16xf32>
+gpu.module @test {
+gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+  %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+  xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+  gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @load_nd_2d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: xegpu.store_nd %[[T1]], %[[T2]]  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+gpu.module @test {
+gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @load_nd_array_length
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
+// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16>
+// CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16>
+// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16>
+// CHECK: xegpu.store_nd %[[T5]], %[[T4]]  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+gpu.module @test {
+gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
+  %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
+  %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @dpas
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG3:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[T1:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]
+// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>, memref<8x16xf32>) -> (vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) {
+// CHECK: ^bb0(%[[ARG4:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG5:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG6:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG7:[0-9a-zA-Z]+]]: memref<8x16xf32>):
+// CHECK:  gpu.yield %[[ARG4]], %[[ARG5]], %[[ARG6]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>
+// CHECK: }
+// CHECK-DAG: %[[T2:.*]] = vector.shape_cast %[[T1]]#0 : vector<8x1xf16> to vector<8xf16>
+// CHECK-DAG: %[[T3:.*]] = vector.shape_cast %[[T1]]#1 : vector<16x1xf16> to vector<16xf16>
+// CHECK-DAG: %[[T4:.*]] = vector.shape_cast %[[T1]]#2 : vector<8x1xf32> to vector<8xf32>
+// CHECK: %[[T5:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[T4]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
+// CHECK: %[[T6:.*]] = xegpu.create_nd_tdesc %[[ARG3]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: xegpu.store_nd %[[T5]], %[[T6]]  : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+gpu.module @test {
+gpu.func @dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+  %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @load_dpas_store
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]]  : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]]  : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+gpu.module @test {
+gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg3: memref<8x16xf32>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+  %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %4 = xegpu.dpas %1, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+  %5 = xegpu.create_nd_tdesc %arg3[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index,
+// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index,
+// CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: xegpu.store_nd %[[T1]], %[[T2]]  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64,
+  %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) {
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0 [%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16>
+  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16>
+  xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  gpu.return
+}
+}

diff  --git a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
index 1ae4348af33e6..a5468681e68dc 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
@@ -2,27 +2,27 @@
 
 // CHECK: function: test_dpas_f16:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
 // CHECK-NEXT: op    : %{{.*}} = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T2:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %[[T1]]  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -40,17 +40,17 @@ func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg
 // -----
 // CHECK: function: test_dpas_i8:
 // CHECK-NEXT: argument: <block argument> of type 'vector<8x32xi8>' at index: 0
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 2]
 // CHECK-NEXT: argument: <block argument> of type 'vector<32x16xi8>' at index: 1
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [4, 1]
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi32>' at index: 2
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.dpas %{{.*}} : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
@@ -62,27 +62,27 @@ func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2:
 // -----
 // CHECK: function: test_load_with_transpose_effect:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
 // CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
 // CHECK-NEXT: op    : %[[T2:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %[[T1]] <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -99,29 +99,29 @@ func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memre
 // -----
 // CHECK: function: test_vector_transpose:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
 // CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
 // CHECK-NEXT: op    : %[[T2:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %[[T1]]  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
 // CHECK-NEXT: op    : %[[T4:.*]] = vector.transpose %[[T3]], [1, 0] : vector<16x16xf16> to vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T5:.*]] = xegpu.dpas %[[T2]], %[[T4]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -139,19 +139,19 @@ func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf1
 // -----
 // CHECK: function: test_extf_truncf:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T1:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T2:.*]] = arith.extf %[[T1]] : vector<16x16xf16> to vector<16x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = arith.truncf %[[T2]] : vector<16x16xf32> to vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
 func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -164,29 +164,29 @@ func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t
 // -----
 // CHECK: function: test_load_gather_with_transpose_effect:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<256xf16>' at index: 1
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[CST0:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T2:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load %[[T2]], %[[CST0]] <{transpose}> : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>, vector<16xi1> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -204,17 +204,17 @@ func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1
 // -----
 // CHECK: function: test_load_gather_1d:
 // CHECK: argument: <block argument> of type 'memref<256xf32>' at index: 0
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: sg_map  : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout  : lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[CST0:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T1]] = xegpu.load %[[T0]], %[[CST0]]  : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
   %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
   %cst_0 = arith.constant dense<true> : vector<16xi1>
@@ -227,15 +227,15 @@ func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc
 // -----
 // CHECK: function: test_store_scatter_with_transpose_effect:
 // CHECK-NEXT: argument: <block argument> of type 'memref<128xf32>' at index: 0
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[CST0:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[CST1:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST1]] : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 1]
 func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
   %cst = arith.constant dense<1.000000e+00> : vector<8x16xf32>
   %cst_0 = arith.constant dense<true> : vector<16xi1>
@@ -248,15 +248,15 @@ func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
 // -----
 // CHECK: function: test_store_scatter_1d:
 // CHECK-NEXT: argument: <block argument> of type 'vector<16xf32>' at index: 0
-// CHECK-NEXT: sg_map  : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout  : lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: argument: <block argument> of type 'memref<256xf32>' at index: 1
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[CST1:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
   %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
   %cst_0 = arith.constant dense<true> : vector<16xi1>
@@ -268,27 +268,27 @@ func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>)
 // -----
 // CHECK: function: test_vector_bitcast_i16_to_i8:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi16>' at index: 0
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<32x16xi8>' at index: 1
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi32>' at index: 2
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
 // CHECK-NEXT: op    : %[[T2:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %[[T1]]  : !xegpu.tensor_desc<32x16xi8> -> vector<32x16xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x16xi16> to vector<8x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
 // CHECK-NEXT: op    : %[[T5:.*]] = xegpu.dpas %[[T4]], %[[T3]] : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
@@ -305,29 +305,29 @@ func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<
 // -----
 // CHECK: function: test_vector_bitcast_i8_to_f16:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x32xi8>' at index: 0
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<16x32xi8>' at index: 1
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
 // CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
 // CHECK-NEXT: op    : %[[T2:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<8x32xi8> -> vector<8x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %[[T1]]  : !xegpu.tensor_desc<16x32xi8> -> vector<16x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x32xi8> to vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T5:.*]] = vector.bitcast %[[T3]] : vector<16x32xi8> to vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T7:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8>
@@ -345,21 +345,21 @@ func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<1
 // -----
 // CHECK: function: test_binary_op_one_use:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T1:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T2:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = arith.addf %[[T1]], %[[T2]] : vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -373,23 +373,23 @@ func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !x
 // -----
 // CHECK: function: test_binary_op_multiple_uses:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 3
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T1:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T2:.*]] = arith.addf %[[T1]], %[[CST]] : vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.dpas %[[T0]], %[[T2]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -404,39 +404,39 @@ func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %ar
 // -----
 // CHECK: function: test_for_op:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x128xf16>' at index: 0
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<128x16xf16>' at index: 1
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
 // CHECK-NEXT: op    : %{{.*}} = arith.constant 128 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
 // CHECK-NEXT: op    : %{{.*}} = arith.constant 16 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T5:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T7:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T8:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : scf.for
-// CHECK-NEXT: sg_map for result #0: Not assigned.
-// CHECK-NEXT: sg_map for result #1: Not assigned.
-// CHECK-NEXT: sg_map for result #2: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: Not assigned.
+// CHECK-NEXT: layout for result #1: Not assigned.
+// CHECK-NEXT: layout for result #2: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
@@ -460,23 +460,23 @@ func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg
 // -----
 // CHECK: function: test_if_single_use:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: argument: <block argument> of type 'i1' at index: 2
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 3
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : scf.if
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = scf.if %arg2 -> (vector<16x16xf16>) {
@@ -494,25 +494,25 @@ func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu
 // -----
 // CHECK: function: test_if_multiple_uses:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type 'i1' at index: 2
-// CHECK-NEXT: sg_map  : Not assigned.
+// CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 3
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 4
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : scf.if
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = scf.if %arg2 -> (vector<16x16xf16>) {
@@ -531,13 +531,13 @@ func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xe
 // -----
 // CHECK: function: test_vector_outer_reduction:
 // CHECK-NEXT: argument: <block argument> of type 'vector<16x16xf32>' at index: 0
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: sg_map  : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout  : lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [0] : vector<16x16xf32> to vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
   %cst = arith.constant dense<0.000000e+00> : vector<16xf32>
   %0 = vector.multi_reduction <add>, %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32>
@@ -548,13 +548,13 @@ func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t
 // -----
 // CHECK: function: test_vector_inner_reduction:
 // CHECK-NEXT: argument: <block argument> of type 'vector<16x16xf32>' at index: 0
-// CHECK-NEXT: sg_map  : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: sg_map  : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout  : lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [1] : vector<16x16xf32> to vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 func.func @test_vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
   %cst = arith.constant dense<0.000000e+00> : vector<16xf32>
   %0 = vector.multi_reduction <add>, %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32>
