[Mlir-commits] [mlir] [MLIR][XeGPU] Extend SGMapAttr (PR #132425)
Chao Chen
llvmlistbot at llvm.org
Fri Mar 21 10:09:24 PDT 2025
https://github.com/chencha3 created https://github.com/llvm/llvm-project/pull/132425
This PR extends the SGMapAttr to support workgroup-level programming. It is the first step in extending the XeGPU dialect from the subgroup level to the workgroup level. Because the attribute is no longer used only for subgroup distribution, it is renamed to LayoutAttr. Technically, it adds four fields to the old attribute definition: `scope`, `sg_layout`, `sg_data`, and `order`. A `convert_layout` operation is also added for conversion between two different LayoutAttrs.
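For illustration, a tensor descriptor annotated with the extended attribute could look like the following (a sketch assembled from the examples in this patch; the concrete shapes are illustrative only):

    !xegpu.tensor_desc<32x64xf32,
        #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], order = [1, 0],
                      wi_layout = [1, 16], wi_data = [1, 1]>>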
From 4838b524a635e566175aa087440283b909555402 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 16:54:15 +0000
Subject: [PATCH] extend sg_map from subgroup to workgroup
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 76 ++++--
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 63 +++--
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 16 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 126 ++++------
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 157 ++++++++----
mlir/test/Dialect/XeGPU/invalid.mlir | 110 +++++----
mlir/test/Dialect/XeGPU/ops.mlir | 230 ++++++++++--------
7 files changed, 457 insertions(+), 321 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 0136b18ccfa94..7adb9df3c6b25 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -154,33 +154,81 @@ def XeGPU_FenceScopeAttr:
let assemblyFormat = "$value";
}
-def XeGPU_SGMapAttr : XeGPUAttr<"SGMap", "sg_map"> {
+def XeGPU_ScopeWG: I32EnumAttrCase<"WG", 0, "wg">; // workgroup level code
+def XeGPU_ScopeSG: I32EnumAttrCase<"SG", 1, "sg">; // subgroup level code
+def XeGPU_ScopeWI: I32EnumAttrCase<"WI", 2, "wi">;  // work-item (SIMT) level code
+
def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumeration of scope",
+  [XeGPU_ScopeWG, XeGPU_ScopeSG, XeGPU_ScopeWI]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_ScopeAttr
+ : EnumAttr<XeGPU_Dialect,XeGPU_ScopeEnums, "Stage"> {
+  let summary = [{Describes the stage of the lowering progress}];
+ let assemblyFormat = "``$value";
+}
+
+def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
let summary = [{
Describes the mapping between work item (WI) and the 2D tensor specified by the tensor descriptor.
}];
let description = [{
- To distribute the XeGPU operation to work items, the tensor_desc must be specified with the sg_map
- attribute at the tensor description creation time.
- Within the `sg_map`, `wi_layout` specifies the layout of work items,
- describing the mapping of work items to the tensor.
- wi_layout[0] x wi_layout[1] must be equal to the total number of work items within a subgroup.
- `wi_data` specifies the minimum number of data elements assigned to each work item for a single distribution.
-
- E.g., #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
- In this example, the subgroup has 16 work items in wi_layout=[1, 16],
- each accessing 1 element as specified by wi_data=[1, 1].
+    XeGPU operations leverage LayoutAttr to distribute data across work-items. It is specified on tensor_descs
+    at tensor description creation time. LayoutAttr contains the following parameters.
+
+    * scope: specifies the scope of the current code. It can be wg (workgroup), sg (subgroup), or wi (work-item).
+           It is required at the subgroup level but optional at the workgroup and work-item levels. By default, a LayoutAttr
+           containing sg_layout and sg_data is treated as workgroup code, and one containing only
+           wi_layout and wi_data is treated as work-item level code.
+ * sg_layout: [optional] specifies the total number of subgroups and their layout in a workgroup.
+ * sg_data: [optional] specifies the data size accessed per subgroup.
+    * order: [optional] specifies the dimension order used to linearize n-D subgroup ids to 1-D.
+           The first dimension in the order list is the fastest-changing dimension.
+    * wi_layout: [required] specifies the total number of work-items and their layout in a subgroup.
+    * wi_data: [required] specifies the data size accessed per work-item for a single distribution.
`wi_data[0] * wi_data[1]` can be greater than 1, meaning that each work item operates on multiple elements,
which is eventually lowered to "SIMT-flavor" vector, like SPIR-V vector or llvm vector, or packed to a storage data type.
The multiple elements indicated by `wi_data` can only be from one dimension and must be contiguous in the memory along either dimension.
+
+ E.g., #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
+ In this example, the subgroup has 16 work items in wi_layout=[1, 16], each accessing 1 element as specified by wi_data=[1, 1].
+
+ E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>
+    In this example, the layout represents a workgroup-level distribution. The workgroup has 8 subgroups organized in a 2x4 layout,
+    and each subgroup accesses a 16x16 block per instruction, which is further distributed to 16 work items as described above.
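+
+    E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], order = [1, 0], wi_layout = [1, 16], wi_data = [1, 1]>
+    This sketch (added here for illustration) extends the previous example with order = [1, 0]: dimension 1 is the
+    fastest-changing one, so subgroup id (i, j) in the 2x4 sg_layout linearizes to i * 4 + j.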
+
}];
let parameters = (ins
- ArrayRefParameter<"uint32_t">:$wi_layout,
- ArrayRefParameter<"uint32_t">:$wi_data
+ OptionalParameter<"ScopeAttr">: $scope,
+ OptionalParameter<"DenseI32ArrayAttr">: $sg_layout,
+ OptionalParameter<"DenseI32ArrayAttr">: $sg_data,
+ OptionalParameter<"DenseI32ArrayAttr">: $order,
+ "DenseI32ArrayAttr": $wi_layout,
+ "DenseI32ArrayAttr": $wi_data
);
+ let extraClassDeclaration = [{
+ bool isForWorkgroupLevel() {
+ if (!getScope())
+ return getSgLayout() && getSgData();
+ return getScope() == ScopeAttr::get(getContext(), Scope::WG);
+ }
+
+ bool isForSubgroupLevel() {
+ return getScope() == ScopeAttr::get(getContext(), Scope::SG);
+ }
+
+ bool isForWorkItemLevel() {
+ if (!getScope())
+ return !getSgLayout() && !getSgData() && !getOrder();
+ return getScope() == ScopeAttr::get(getContext(), Scope::WI);
+ }
+ }];
- let hasCustomAssemblyFormat = 1;
+ let assemblyFormat = "`<` struct(params) `>`";
let genVerifyDecl = 1;
}
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 56b836d707a7d..6b27ae3b2754c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -80,7 +80,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
information e.g., memref<?x?xf16>, the strides information has to be explicitly
passed via the "strides" and "const_strides" argument.
- In SIMT mode, tensor descriptor is augmented with `SGMapAttr` which describes the
+ In SIMT mode, tensor descriptor is augmented with `LayoutAttr` which describes the
mapping of the tensor descriptor to the work items.
Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
@@ -113,7 +113,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
%c0 = arith.constant 0 : index
%c1 = arith.constant 8 : index
%1 = xegpu.create_nd_tdesc %0[%c0, %c0] : memref<1024x1024xf32>
- -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
```
}];
@@ -306,7 +306,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
fp32 or fp64. It implies that vnni and transpose cannot exit at the
same time.
- In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, result
vector represents the data to be loaded by each work-item.
@@ -323,7 +323,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<8x16xf32,
- #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
```
@@ -364,7 +364,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
of cache, L1, L2 and L3. If hardware does not have a correspoding cache,
Corresponding cache hint attribute will be masked.
- In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, input
vector represents the data to be stored by each work-item.
@@ -381,7 +381,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x1xf16>, !xegpu.tensor_desc<8x16xf16,
- #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
```
@@ -422,7 +422,7 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
Example 2 (SIMT mode):
```
%2 = xegpu.update_nd_offset %1, [0, 16]:
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
```
}];
@@ -482,7 +482,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
the chunk_size if the chunk size is larger than 1.
In SIMT mode, similar to `create_nd_tdesc` the resulting tensor descriptor is augmented
- with `SGMapAttr` which describes the mapping of the tensor descriptor to the work items.
+ with `LayoutAttr` which describes the mapping of the tensor descriptor to the work items.
In this case, the first dimension of the tensor descriptor represents the work-items, and
the second dimension represents the chunk size.
@@ -517,7 +517,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
%off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
%1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
-> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>,
- #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ #xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>
```
}];
@@ -571,7 +571,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
let hasVerifier = 1;
}
-def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [MemoryEffects<[MemRead]>]> {
let summary = "prefetches a set of scattered data points to cache";
let description = [{
@@ -623,7 +623,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
The mask operand masks out memory access so that it is safe to pass out-of-boundary
addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
- In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, result vector
represents the data to be loaded by each work-item. Each work-item recieves a `chunk_size`
number of elements.
@@ -653,7 +653,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
l2_hint = #xegpu.cache_hint<uncached>,
l3_hint = #xegpu.cache_hint<uncached>}
: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>,
- !xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
+ !xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>>
vector<16xi1> -> vector<8x1xf32>
```
@@ -704,7 +704,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
introduced on purpose, making sure users are aware of this implicit transformation.
- In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, input vector
represents the data to be stored by each work-item. Each work-item recieves a `chunk_size`
number of elements.
@@ -732,7 +732,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x1xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>,
- !xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> vector<16xi1>
+ !xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>> vector<16xi1>
```
}];
@@ -790,7 +790,7 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
%off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
%2 = xegpu.update_offset %1, %off :
!xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>,
- #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ #xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
```
}];
@@ -840,9 +840,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
can be represented as `B: vector<8x16x2xf16>`.
- In SIMT mode, DpasOp expects attributes `sg_map_a`, `sg_map_b`, and `sg_map_c`
- which descibes the data fragment owned by each work-item w.r.t. the tensor
- descriptor these data are loaded from.
+    In SIMT mode, DpasOp expects layout attributes `a_layout`, `b_layout`, and `c_layout` (the latter only if acc is used),
+    which describe the data fragment owned by each work-item w.r.t. the tensor descriptor
+    these data are loaded from.
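+
+    For example (an illustrative sketch; the shapes follow the SIMT distribution described
+    above and are not taken verbatim from this patch):
+    ```
+    %d = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
+                            b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
+                            c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
+         : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+    ```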
Note: on PVC, the hardware can perform load with VNNI transformation when data
element type is 16-bit or lower precision, taking 2 or 4 elements from
@@ -853,9 +853,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
XeGPU_DpasOpType : $lhs,
XeGPU_DpasOpType : $rhs,
Optional<XeGPU_Vector2DType>: $acc,
- OptionalAttr<XeGPU_SGMapAttr>:$sg_map_a,
- OptionalAttr<XeGPU_SGMapAttr>:$sg_map_b,
- OptionalAttr<XeGPU_SGMapAttr>:$sg_map_c);
+ OptionalAttr<XeGPU_LayoutAttr>:$a_layout,
+ OptionalAttr<XeGPU_LayoutAttr>:$b_layout,
+ OptionalAttr<XeGPU_LayoutAttr>:$c_layout);
let results = (outs XeGPU_Vector2DType: $result);
let extraClassDeclaration = [{
@@ -876,6 +876,10 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
VectorType getResultType() {
return getResult().getType();
}
+
+ bool hasAcc() {
+ return getAcc() != nullptr;
+ }
}];
let assemblyFormat = [{
@@ -979,4 +983,21 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
let extraClassDeclaration = extraBaseClassDeclaration;
}
+def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
+  let summary = "Convert the layout of the input operand";
+ let description = [{
+    convert_layout remaps the distribution of data across work-items by updating the LayoutAttr.
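+
+    Example (an illustrative sketch; the concrete shapes are assumptions, not taken from this patch):
+    ```
+    %1 = xegpu.convert_layout %0 {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
+                                  resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+    ```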
+ }];
+ let arguments = (ins XeGPU_Vector2DType: $source,
+ XeGPU_LayoutAttr: $srcMap,
+ XeGPU_LayoutAttr: $resMap
+ );
+ let results = (outs XeGPU_Vector2DType: $result);
+ let assemblyFormat = [{
+ $source attr-dict `:` type($source)
+ }];
+
+ let hasVerifier = 1;
+}
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index ccd91a928e1dd..c92ea42efde3b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -63,7 +63,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
element-type ::= float-type | integer-type | index-type
dim-list := (static-dim-list `x`)?
static-dim-list ::= decimal-literal `x` decimal-literal
- attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? (, sg_map `<` wi_layout = value, wi_data = value `>`)?
+ attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? (, layout `<` wi_layout = value, wi_data = value `>`)?
```
Examples:
@@ -78,15 +78,15 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
// A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
- // A TensorDesc with a sg_map
- xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // A TensorDesc with a layout
+ xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
```
}];
let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
"mlir::Type": $elementType,
OptionalParameter<"mlir::Attribute">: $encoding,
- OptionalParameter<"mlir::Attribute">: $sg_map);
+ OptionalParameter<"mlir::Attribute">: $layout);
let builders = [
TypeBuilderWithInferredContext<(ins
@@ -95,13 +95,13 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
CArg<"int", "1">: $array_length,
CArg<"bool", "true">: $boundary_check,
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
- CArg<"mlir::Attribute", "mlir::Attribute()">:$sg_map)>,
+ CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)>,
TypeBuilderWithInferredContext<(ins
"llvm::ArrayRef<int64_t>": $shape,
"mlir::Type": $elementType,
CArg<"int", "1">: $chunk_size,
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
- CArg<"mlir::Attribute", "mlir::Attribute()">:$sg_map)>
+ CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)>
];
let extraClassDeclaration = [{
@@ -127,8 +127,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
return llvm::dyn_cast_if_present<ScatterTensorDescAttr>(getEncoding());
}
- SGMapAttr getSGMapAttr() const {
- return llvm::dyn_cast_if_present<SGMapAttr>(getSgMap());
+ LayoutAttr getLayoutAttr() const {
+ return llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
}
xegpu::MemorySpace getMemorySpace() const {
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 78c242571935c..52b9f2c192b3f 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -68,73 +68,39 @@ LogicalResult ScatterTensorDescAttr::verify(
}
//===----------------------------------------------------------------------===//
-// XeGPU_SGMapAttr
+// XeGPU_LayoutAttr
//===----------------------------------------------------------------------===//
-namespace {
-template <typename T, unsigned N>
-LogicalResult parseIntArrayField(::mlir::AsmParser &parser,
- llvm::SmallVector<T, N> &result,
- llvm::StringRef fieldName) {
- if (failed(parser.parseKeyword(fieldName))) {
- parser.emitError(parser.getCurrentLocation(),
- "unexpected field name. Expected " + fieldName + ".");
- return failure();
+LogicalResult
+LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
+ ScopeAttr scope,
+ DenseI32ArrayAttr sg_layout,
+ DenseI32ArrayAttr sg_data,
+ DenseI32ArrayAttr order,
+ DenseI32ArrayAttr wi_layout,
+ DenseI32ArrayAttr wi_data) {
+
+  if (scope && scope.getValue() != Scope::WG && (sg_layout || sg_data || order)) {
+    return emitError() << "expected sg_layout, sg_data, and order to be used at workgroup level only.";
}
- if (failed(parser.parseEqual())) {
- parser.emitError(parser.getCurrentLocation(), "expected '=' sign.");
- return failure();
+  if ((sg_layout != nullptr) ^ (sg_data != nullptr)) {
+    return emitError() << "expected sg_layout and sg_data to be either both present or both absent";
}
- auto elemParser = [&]() -> llvm::ParseResult {
- uint32_t elem = 0;
- auto res = parser.parseInteger(elem);
- result.push_back(elem);
- return res;
- };
-
- return parser.parseCommaSeparatedList(AsmParser::Delimiter::Square,
- elemParser, fieldName);
-}
-} // namespace
-
-mlir::Attribute SGMapAttr::parse(::mlir::AsmParser &parser,
- ::mlir::Type attrType) {
- if (failed(parser.parseLess()))
- return {};
-
- llvm::SmallVector<uint32_t, 2> wi_layout, wi_data;
- if (failed(parseIntArrayField(parser, wi_layout, "wi_layout")))
- return {};
-
- if (failed(parser.parseComma()))
- return {};
-
- if (failed(parseIntArrayField(parser, wi_data, "wi_data")))
- return {};
+ if (order) {
+ if (!sg_layout)
+ return emitError() << "expected order being used with sg_layout and sg_data.";
+ if (order.size() != sg_layout.size())
+ return emitError() << "expected order having the same rank as sg_layout and sg_data";
+ }
- return SGMapAttr::getChecked(
- [&]() { return parser.emitError(parser.getNameLoc()); },
- parser.getContext(), wi_layout, wi_data);
-}
+  if (sg_layout && (sg_layout.size() != sg_data.size() || sg_layout.size() > 2)) {
+    return emitError() << "expected sg_layout and sg_data to have the same rank, which must not be larger than 2";
+ }
-void SGMapAttr::print(::mlir::AsmPrinter &printer) const {
- printer << "<";
- printer.printKeywordOrString("wi_layout");
- printer << " = [" << getWiLayout() << "], ";
- printer.printKeywordOrString("wi_data");
- printer << " = [" << getWiData() << "]";
- printer << ">";
-}
+  if (wi_layout.size() != wi_data.size() || wi_layout.size() > 2)
+    return emitError() << "expected wi_layout and wi_data to have the same rank, which must not be larger than 2";
-LogicalResult
-SGMapAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
- llvm::ArrayRef<uint32_t> wi_layout,
- llvm::ArrayRef<uint32_t> wi_data) {
- if (wi_layout.size() != 2)
- return emitError() << "expected wi_layout of size 2";
- if (wi_data.size() != 2)
- return emitError() << "expected wi_data of size 2";
return success();
}
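// Illustrative examples (not part of the patch): the checks above accept
//   #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], order = [1, 0],
//                 wi_layout = [1, 16], wi_data = [1, 1]>
// and reject #xegpu.layout<sg_layout = [2, 4], wi_layout = [1, 16], wi_data = [1, 1]>,
// since sg_data is absent while sg_layout is present.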
@@ -146,7 +112,7 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
llvm::SmallVector<int64_t> shape;
mlir::Type elementType;
mlir::FailureOr<mlir::Attribute> encoding;
- mlir::FailureOr<mlir::Attribute> sg_map;
+ mlir::FailureOr<mlir::Attribute> layout;
// Parse literal '<'
if (parser.parseLess())
@@ -169,8 +135,8 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
mlir::Attribute attr;
ParseResult res = parser.parseAttribute(attr);
if (mlir::succeeded(res)) {
- if (mlir::isa<SGMapAttr>(attr)) {
- sg_map = attr;
+ if (mlir::isa<LayoutAttr>(attr)) {
+ layout = attr;
continue;
}
if (mlir::isa<BlockTensorDescAttr, ScatterTensorDescAttr>(attr)) {
@@ -188,7 +154,7 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
return TensorDescType::getChecked(
[&]() { return parser.emitError(parser.getNameLoc()); },
parser.getContext(), shape, elementType,
- encoding.value_or(mlir::Attribute()), sg_map.value_or(mlir::Attribute()));
+ encoding.value_or(mlir::Attribute()), layout.value_or(mlir::Attribute()));
}
void TensorDescType::print(::mlir::AsmPrinter &printer) const {
@@ -208,8 +174,8 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const {
if (auto encoding = getEncoding())
printer << ", " << encoding;
- if (auto sg_map = getSgMap())
- printer << ", " << sg_map;
+ if (auto layout = getLayout())
+ printer << ", " << layout;
printer << ">";
}
@@ -218,29 +184,29 @@ TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
mlir::Type elementType, int array_length,
bool boundary_check,
MemorySpace memory_space,
- mlir::Attribute sg_map) {
+ mlir::Attribute layout) {
auto context = elementType.getContext();
auto attr = BlockTensorDescAttr::get(context, memory_space, array_length,
boundary_check);
- return Base::get(context, shape, elementType, attr, sg_map);
+ return Base::get(context, shape, elementType, attr, layout);
}
TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
mlir::Type elementType, int chunk_size,
MemorySpace memory_space,
- mlir::Attribute sg_map) {
+ mlir::Attribute layout) {
auto context = elementType.getContext();
auto attr = ScatterTensorDescAttr::get(context, memory_space, chunk_size);
- return Base::get(context, shape, elementType, attr, sg_map);
+ return Base::get(context, shape, elementType, attr, layout);
}
LogicalResult TensorDescType::verify(
llvm::function_ref<::mlir::InFlightDiagnostic()> emitError,
llvm::ArrayRef<int64_t> shape, mlir::Type elementType,
- mlir::Attribute encoding, mlir::Attribute sg_map) {
+ mlir::Attribute encoding, mlir::Attribute layout) {
size_t rank = shape.size();
// Low-pressure types are packed in 32-bit units.
- unsigned packingFactor = 32 / elementType.getIntOrFloatBitWidth();
+ int32_t packingFactor = 32 / elementType.getIntOrFloatBitWidth();
if (rank != 1 && rank != 2)
return emitError() << "expected 1D or 2D tensor";
@@ -274,9 +240,9 @@ LogicalResult TensorDescType::verify(
return emitError() << "SLM is not supported for 2D block tensor";
}
- if (auto sgMapAttr = llvm::dyn_cast_if_present<SGMapAttr>(sg_map)) {
- ArrayRef<uint32_t> wiLayout = sgMapAttr.getWiLayout();
- ArrayRef<uint32_t> wiData = sgMapAttr.getWiData();
+ if (auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout)) {
+ ArrayRef<int32_t> wiLayout = layoutAttr.getWiLayout().asArrayRef();
+ ArrayRef<int32_t> wiData = layoutAttr.getWiData().asArrayRef();
if (rank == 1) {
if (wiLayout[0] != 1 || wiData[0] != 1)
@@ -318,7 +284,7 @@ LogicalResult TensorDescType::verify(
return success();
}
-// If tensor descriptor has a sg_map attribute it is used in SIMT mode.
+// If tensor descriptor has a layout attribute it is used in SIMT mode.
// In this mode, the distributed vector shape is determined as follows:
// Definitions:
// wi_data_size = wi_data[0] × wi_data[1]
@@ -343,13 +309,13 @@ LogicalResult TensorDescType::verify(
// Distributed vector shape must be:
// [n_distribution_units, wi_data_size]
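// Example (illustrative, derived from the definitions above): for
//   tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>,
//   wi_data_size = 1, subgroup_size = 16, tensor_size = 128, and
//   n_distribution_units = 128 / (1 * 16) = 8, giving vector<8x1xf32>.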
FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
- auto sgMap = llvm::dyn_cast_if_present<SGMapAttr>(getSgMap());
- // If no sg_map is provided, tensor desc is not used in SIMT mode.
- if (!sgMap)
+ auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
+ // If no layout is provided, tensor desc is not used in SIMT mode.
+ if (!layout || !layout.isForWorkItemLevel())
return failure();
- SmallVector<int64_t> wiData(sgMap.getWiData());
- SmallVector<int64_t> wiLayout(sgMap.getWiLayout());
+ SmallVector<int64_t> wiData(layout.getWiData().asArrayRef());
+ SmallVector<int64_t> wiLayout(layout.getWiLayout().asArrayRef());
auto tdescShape = getShape();
auto wiDataSize = 1, sgSize = 1;
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 3bdf3fb218b45..c7e863256f235 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -78,18 +78,18 @@ static LogicalResult
isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
ArrayRef<int64_t> adjustedTdescShape,
function_ref<InFlightDiagnostic()> emitError) {
- auto sgMap = tdescTy.getSGMapAttr();
+ auto layout = tdescTy.getLayoutAttr();
auto valueShape = valueTy.getShape();
- // sg_map not present means IR is in SIMD mode. In this case value shape must
+  // If no work-item level layout is present, the IR is in SIMD mode. In this case value shape must
// match adjusted tensor descriptor shape.
- if (!sgMap)
+ if (!layout || !layout.isForWorkItemLevel())
return valueShape == adjustedTdescShape
? success()
: emitError()
<< "Value shape " << makeString(valueShape)
<< " is not consistent with tensor descriptor " << tdescTy;
- // sg_map present means IR is in SIMT mode. In this case sg_map determines the
+  // A work-item level layout means the IR is in SIMT mode. In this case the layout determines the
// value shape.
auto expectedValueShapeOrFailure = tdescTy.getDistributedVectorType();
assert(succeeded(expectedValueShapeOrFailure) &&
@@ -105,6 +105,25 @@ isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
<< " for tensor descriptor " << tdescTy;
}
+static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
+ xegpu::LayoutAttr attr) {
+  assert(attr && "layout attribute is missing.");
+ llvm::ArrayRef<int32_t> layout, data;
+ if (attr.getSgLayout()) {
+ data = attr.getSgData().asArrayRef();
+ layout = attr.getSgLayout().asArrayRef();
+ } else {
+ data = attr.getWiData().asArrayRef();
+ layout = attr.getWiLayout().asArrayRef();
+ }
+ for (auto [s, d, l] : llvm::zip_equal(shape, data, layout)) {
+    // reject shapes that are not evenly divisible, i.e. s % (d * l) != 0
+ if (s % d != 0 || (s / d) % l != 0)
+ return false;
+ }
+ return true;
+}
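+// Illustrative example (not part of the patch): for shape [32, 64] with
+// sg_layout = [2, 4] and sg_data = [16, 16], both dimensions satisfy
+// s % d == 0 and (s / d) % l == 0 (32 = 16 * 2, 64 = 16 * 4), so the data is
+// evenly distributed; shape [32, 48] would fail on the second dimension.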
+
//===----------------------------------------------------------------------===//
// XeGPU_CreateNdDescOp
//===----------------------------------------------------------------------===//
@@ -541,7 +560,7 @@ LogicalResult StoreScatterOp::verify() {
[&]() { return emitOpError(); });
}
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
// XeGPU_UpdateOffsetOp
//===----------------------------------------------------------------------===//
void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
@@ -569,61 +588,107 @@ void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
LogicalResult DpasOp::verify() {
int64_t lhsRank = getLhsType().getRank();
int64_t rhsRank = getRhsType().getRank();
- int64_t resultRank = getResultType().getRank();
+ int64_t resRank = getResultType().getRank();
auto lhsShape = getLhsType().getShape();
auto rhsShape = getRhsType().getShape();
- auto resultShape = getResultType().getShape();
-
- auto sgMapA = getSgMapAAttr();
- auto sgMapB = getSgMapBAttr();
- auto sgMapC = getSgMapCAttr();
+ auto resShape = getResultType().getShape();
+
+ auto layoutA = getALayoutAttr();
+ auto layoutB = getBLayoutAttr();
+ auto layoutC = getCLayoutAttr();
+
+ // make sure the layout attribute is either set for every available
+ // operand or simply not set at all. C is special, since ACC is optional.
+ // If they are all set, they also should be in the same scope.
+ auto isValidSet = [&]() {
+ bool result = (layoutA != nullptr) ^ (layoutB != nullptr);
+ if (hasAcc()) {
+ result |= (layoutA != nullptr) ^ (layoutC != nullptr);
+ }
+ result = !result;
- // If sg_maps are not present, then the operation is in SIMD mode.
- if (!sgMapA && !sgMapB && !sgMapC) {
- if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resultRank != 2)
+ if (layoutA) {
+ auto scope = layoutA.getScope();
+ result &= layoutB ? scope == layoutB.getScope() : false;
+ if (hasAcc())
+ result &= layoutC ? scope == layoutC.getScope() : false;
+ }
+ return result;
+ };
+
+ if (!isValidSet())
+ return emitOpError("layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code).");
+
+ // query the scope from layoutA (a valid setting).
+ if (layoutA && layoutA.isForWorkItemLevel()) {
+    // In SIMT mode, all data fragments must be 2D.
+ if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
+ return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
+
+ auto wiLayoutA = layoutA.getWiLayout();
+ auto wiLayoutB = layoutB.getWiLayout();
+ auto wiLayoutC = layoutC.getWiLayout();
+ // Obtain the expanded shapes of the operands and result using wi_layout.
+ // NOTE: For B, get rid of the packed dimension for the expanded shape.
+ SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
+ lhsShape[1] * wiLayoutA[1]};
+ SmallVector<int64_t> expandedShapeB = {
+ rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
+ SmallVector<int64_t> expandedShapeC = {resShape[0] * wiLayoutC[0],
+ resShape[1] * wiLayoutC[1]};
+ auto bK = expandedShapeB[0];
+ if (bK != expandedShapeA[1])
+ return emitOpError("K-dimension mismatch.");
+ if (expandedShapeA[0] != expandedShapeC[0])
+ return emitOpError("M-dimension mismatch.");
+ if (expandedShapeB[1] != expandedShapeC[1])
+ return emitOpError("N-dimension mismatch.");
+  } else { // For other scopes, the operand shapes should match the MxKxN semantics.
+ if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2)
return emitOpError(
"expecting lhs and result to be a 2D vector, and rhs to be either "
"2D or 3D (packed) vector.");
auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
if (bK != lhsShape[1])
return emitOpError("K-dimension mismatch.");
- if (lhsShape[0] != resultShape[0])
+ if (lhsShape[0] != resShape[0])
return emitOpError("M-dimension mismatch.");
- if (rhsShape[1] != resultShape[1])
+ if (rhsShape[1] != resShape[1])
return emitOpError("N-dimension mismatch.");
- return success();
}
- // Otherwise, in SIMT mode we expect sg_map attributes for all operands and
- // result of DPAS operation.
- if (!sgMapA || !sgMapB || !sgMapC)
- return emitOpError("sg_map attributes for all operands and outputs are "
- "expected in SIMT xegpu::Dpas operation");
-
- // In SIMT mode, All data fragments must be 2D
- if (lhsRank != 2 || rhsRank != 2 || resultRank != 2)
- return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
-
- auto wiLayoutA = sgMapA.getWiLayout();
- auto wiLayoutB = sgMapB.getWiLayout();
- auto wiLayoutC = sgMapC.getWiLayout();
- // Obtain the expanded shapes of the operands and result using wi_layout.
- // NOTE: For B, get rid of the packed dimension for the expanded shape.
- SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
- lhsShape[1] * wiLayoutA[1]};
- SmallVector<int64_t> expandedShapeB = {
- rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
- SmallVector<int64_t> expandedShapeC = {resultShape[0] * wiLayoutC[0],
- resultShape[1] * wiLayoutC[1]};
- auto bK = expandedShapeB[0];
- if (bK != expandedShapeA[1])
- return emitOpError("K-dimension mismatch.");
- if (expandedShapeA[0] != expandedShapeC[0])
- return emitOpError("M-dimension mismatch.");
- if (expandedShapeB[1] != expandedShapeC[1])
- return emitOpError("N-dimension mismatch.");
-
return success();
}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_ConvertLayoutOp
+//===----------------------------------------------------------------------===//
+LogicalResult ConvertLayoutOp::verify() {
+ auto srcMap = getSrcMapAttr();
+ auto resMap = getResMapAttr();
+ if (!srcMap)
+ return emitOpError("expected srcMap.");
+ if (!resMap)
+ return emitOpError("expected resMap.");
+
+ if (srcMap.getScope() != resMap.getScope())
+ return emitOpError("expected srcMap and resMap be in the same scope.");
+
+ if (srcMap == resMap)
+ return emitOpError("expected different srcMap and resMap.");
+
+ if (srcMap.isForWorkItemLevel())
+ return emitOpError("doesn't work on SIMT code.");
+
+ auto shape = getSource().getType().getShape();
+ if (!isEvenDistributed(shape, srcMap))
+ return emitOpError("invalid srcMap, data cannot be evenly distributed.");
+
+ if (!isEvenDistributed(shape, resMap))
+ return emitOpError("invalid resMap, data cannot be evenly distributed.");
+
+ return mlir::success();
+}
+
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 88e9bbf78945b..c4958d920a89f 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -78,25 +78,25 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
}
// -----
-func.func @test_load_nd_sg_map(%src: memref<24x32xf32>) {
+func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
// expected-error@+1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1]}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-> vector<8x2xf32>
return
}
// -----
-func.func @test_load_nd_sg_map(%src: memref<24x32xf32>) {
+func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
// expected-error@+1 {{Result shape [8] is not consistent with distributed vector shape [1, 1]}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ : !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-> vector<8xf32>
return
}
@@ -134,22 +134,22 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
}
// -----
-func.func @test_store_nd_sg_map(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
+func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
// expected-error@+1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1] for tensor descriptor}}
xegpu.store_nd %data, %1
- : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
return
}
// -----
-func.func @test_store_nd_sg_map(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
+func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
// expected-error@+1 {{Result shape [2] is not consistent with distributed vector shape [1, 1] for tensor descriptor}}
xegpu.store_nd %data, %1
- : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
return
}
@@ -245,69 +245,69 @@ func.func @test_prefetch_vc_2(%src: ui64) {
}
// -----
-func.func @test_create_tdesc_sg_map_1(%src: ui64) {
+func.func @test_create_tdesc_layout_1(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error@+1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
return
}
// -----
-func.func @test_create_tdesc_sg_map_2(%src: ui64) {
+func.func @test_create_tdesc_layout_2(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error@+1 {{cannot map over non-contiguous scattered row elements}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [2, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [2, 1]>>
return
}
// -----
-func.func @test_create_tdesc_sg_map_3(%src: ui64) {
+func.func @test_create_tdesc_layout_3(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error@+1 {{work item data mapping must match the number of contiguous elements}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
return
}
// -----
-func.func @test_load_gather_sg_map_1(%src: ui64) {
+func.func @test_load_gather_layout_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
// expected-error@+1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
return
}
// -----
-func.func @test_load_gather_sg_map_2(%src: ui64) {
+func.func @test_load_gather_layout_2(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
// expected-error@+1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
return
}
// -----
-func.func @test_store_scatter_sg_map_1(%src: ui64) {
+func.func @test_store_scatter_layout_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%val = arith.constant dense<2.9>: vector<1x2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
// expected-error@+1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
return
}
// -----
-func.func @test_store_scatter_sg_map_2(%src: ui64) {
+func.func @test_store_scatter_layout_2(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%val = arith.constant dense<2.9>: vector<2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
// expected-error@+1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
return
}
@@ -394,18 +394,18 @@ func.func @test_dpas_4(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
}
// -----
-func.func @test_dpas_sg_map_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
-  // expected-error@+1 {{sg_map attributes for all operands and outputs are expected in SIMT xegpu::Dpas operation}}
- %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+func.func @test_dpas_layout_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
+  // expected-error@+1 {{layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code)}}
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
return
}
// -----
-func.func @test_dpas_sg_map_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
+func.func @test_dpas_layout_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
// expected-error@+1 {{K-dimension mismatch}}
- %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
- sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
- sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>}
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
+ b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
+ c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
: vector<8x1xf16>, vector<4x2xf16> -> vector<8x1xf32>
return
}
@@ -439,7 +439,7 @@ func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) {
func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error@+1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [2, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [2, 16], wi_data = [1, 1]>>
return
}
@@ -447,7 +447,7 @@ func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error@+1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
return
}
@@ -455,7 +455,7 @@ func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error@+1 {{cannot distribute 8 over 16 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
return
}
@@ -463,7 +463,7 @@ func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error@+1 {{cannot distribute 4 over 8 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [8, 2], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [8, 2], wi_data = [1, 1]>>
return
}
@@ -471,7 +471,7 @@ func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error@+1 {{cannot distribute 4 over 2 work items with 4 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [4, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [2, 8], wi_data = [4, 1]>>
return
}
@@ -479,7 +479,7 @@ func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error@+1 {{cannot distribute 4 over 8 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [8, 2], wi_data = [1, 2]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [8, 2], wi_data = [1, 2]>>
return
}
@@ -490,7 +490,7 @@ func.func @tensor_desc_scatter_invalid_map_data(%src: ui64) {
// expected-error@+1 {{cannot map over non-contiguous scattered row elements}}
!xegpu.tensor_desc<4x2xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 2>,
- #xegpu.sg_map<wi_layout = [1, 1], wi_data = [2, 1]>>
+ #xegpu.layout<scope = wi, wi_layout = [1, 1], wi_data = [2, 1]>>
return
}
@@ -500,7 +500,7 @@ func.func @tensor_desc_scatter_invalid_map_data_1(%src: ui64, %offsets: vector<1
// expected-error@+1 {{work item data mapping must match the number of contiguous elements}}
!xegpu.tensor_desc<16xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 1>,
- #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 2]>>
+ #xegpu.layout<scope = wi, wi_layout = [1, 8], wi_data = [1, 2]>>
return
}
@@ -510,7 +510,7 @@ func.func @tensor_desc_scatter_invalid_chunk_size_1D(%src: ui64, %offsets: vecto
// expected-error@+1 {{expected non-contiguous elements for 1D tensor}}
!xegpu.tensor_desc<16xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 2>,
- #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 2]>>
+ #xegpu.layout<scope = wi, wi_layout = [1, 8], wi_data = [1, 2]>>
return
}
@@ -520,6 +520,22 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
// expected-error@+1 {{expected chunk blocks for 2D tensor}}
!xegpu.tensor_desc<16x2xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 1>,
- #xegpu.sg_map<wi_layout = [8, 1], wi_data = [1, 2]>>
+ #xegpu.layout<scope = wi, wi_layout = [8, 1], wi_data = [1, 2]>>
return
}
+
+// -----
+func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) {
+  // expected-error@+1 {{expected different srcMap and resMap}}
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>,
+ resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+  return
+}
+
+// -----
+func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) {
+  // expected-error@+1 {{expected srcMap and resMap to be in the same scope}}
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
+ resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+  return
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index c32f1905454b6..6a29a73a20612 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -15,9 +15,9 @@ gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_1(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_1(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -34,8 +34,8 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind
gpu.func @test_create_nd_tdesc_simt_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
//CHECK: %[[C:.*]] = arith.constant 1 : index
%c1 = arith.constant 1 : index
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -48,8 +48,8 @@ gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_3(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -62,8 +62,8 @@ gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_4(%src: memref<2x24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -76,8 +76,8 @@ gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
gpu.func @test_create_nd_tdesc_simt_5(%src: memref<2x24x32xf32, 3>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -90,8 +90,8 @@ gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_6(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_6(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -106,10 +106,10 @@ gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
// CHECK: gpu.func @test_prefetch_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_prefetch_nd_simt(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -125,11 +125,11 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
// CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
%2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+ : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
gpu.return
}
@@ -144,10 +144,10 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
// CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
gpu.return
}
@@ -162,11 +162,11 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
gpu.return
}
@@ -181,11 +181,11 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
gpu.return
}
@@ -200,11 +200,11 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
+ !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
gpu.return
}
@@ -219,11 +219,11 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
- !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
gpu.return
}
@@ -238,11 +238,11 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
- !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
gpu.return
}
@@ -257,10 +257,10 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
gpu.return
}
@@ -279,11 +279,11 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48x1xf16>
%1 = arith.constant dense<1.0>: vector<48x1xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -305,11 +305,11 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2x1xf16>
%1 = arith.constant dense<1.0>: vector<2x1xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -324,10 +324,10 @@ gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_update_nd_tdesc_simt(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_update_nd_tdesc_simt(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -344,8 +344,8 @@ gpu.func @test_create_tdesc_vc(%src: ui64) {
gpu.func @test_create_tdesc_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
gpu.return
}
@@ -363,8 +363,8 @@ gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
gpu.func @test_create_tdesc_simt_1(%src: memref<?xf32, 3>) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
gpu.return
}
@@ -383,7 +383,7 @@ gpu.func @test_create_tdesc_simt_2(%src: memref<?xf32>) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
gpu.return
}
@@ -401,8 +401,8 @@ gpu.func @test_create_tdesc_vc_3(%src: ui64) {
gpu.func @test_create_tdesc_simt_3(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
gpu.return
}
@@ -425,10 +425,10 @@ gpu.func @test_load_simt(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
gpu.return
}
@@ -451,10 +451,10 @@ gpu.func @test_load_simt_2(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
gpu.return
}
@@ -477,10 +477,10 @@ gpu.func @test_load_simt_3(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
gpu.return
}
@@ -509,10 +509,10 @@ gpu.func @test_store_simt(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x1xf32>
%2 = arith.constant dense<2.9>: vector<2x1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
gpu.return
}
@@ -541,10 +541,10 @@ gpu.func @test_store_simt_2(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<1x2xf16>
%2 = arith.constant dense<2.9>: vector<1x2xf16>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
gpu.return
}
@@ -572,10 +572,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
%2 = arith.constant dense<2.9>: vector<1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
gpu.return
}
@@ -583,10 +583,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
gpu.func @test_prefetch_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
gpu.return
}
@@ -605,13 +605,13 @@ gpu.func @test_prefetch_vc(%src: ui64) {
// CHECK: gpu.func @test_create_update_tdesc_simt(%[[arg0:.*]]: ui64) {
gpu.func @test_create_update_tdesc_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
//CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
- //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
%s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
- %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
gpu.return
}
@@ -637,12 +637,12 @@ gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8x1xf16>, %[[arg1:.*]]: vector<8x2xf16>)
gpu.func @test_dpas_simt(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
- // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
- // CHECK: sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
- // CHECK: sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
- %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
- sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
- sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>}
+ // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
+ // CHECK: b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
+ // CHECK: c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
+ b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
+ c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
: vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
gpu.return
}
@@ -704,4 +704,24 @@ gpu.func @fence() {
gpu.return
}
+// CHECK: gpu.func @test_create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], wi_layout = [1, 16], wi_data = [8, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], wi_layout = [1, 16], wi_data = [8, 1]>>
+ gpu.return
+}
+
+gpu.func @test_convert_layout(%a: vector<32x64xf16>) {
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [2, 1]>,
+ resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ gpu.return
+}
+
+gpu.func @test_convert_layout_wg(%a: vector<32x64xf16>) {
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
+ resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ gpu.return
+}
+
}