[Mlir-commits] [mlir] [mlir][xegpu] Add definitons of MatrixDescType and related ops. (PR #153273)

Chao Chen llvmlistbot at llvm.org
Wed Aug 13 17:06:19 PDT 2025


https://github.com/chencha3 updated https://github.com/llvm/llvm-project/pull/153273

From cce8abaa92703dea562536c02fee3a8fd00ef9e6 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 8 Aug 2025 15:57:16 +0000
Subject: [PATCH 01/10] init

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 96 +++++++++++--------
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       | 23 +++++
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 56 +++++++++++
 3 files changed, 134 insertions(+), 41 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 75b16a87e03c6..3b074a35e9cbd 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -29,7 +29,7 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
     void printProperties(::mlir::MLIRContext *ctx,
             ::mlir::OpAsmPrinter &p, const Properties &prop,
             ::mlir::ArrayRef<::llvm::StringRef> elidedProps) {
-      
+
       DictionaryAttr propAttr = dyn_cast_if_present<mlir::DictionaryAttr>(getPropertiesAsAttr(ctx, prop));
 
       // filter out the elidedProps from propAttr, and get the resultAttr
@@ -43,7 +43,7 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
       }
 
       if (!filteredAttrs.empty()) {
-        p << "<" << DictionaryAttr::get(ctx, filteredAttrs) << ">"; 
+        p << "<" << DictionaryAttr::get(ctx, filteredAttrs) << ">";
       }
     }
 
@@ -189,11 +189,11 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
     ArrayRef<int64_t> getStaticOffsets(){
       auto attr = getConstOffsetsAttr();
 
-      if (attr) 
+      if (attr)
         return attr;
 
       int64_t rank = getMixedSizes().size();
-      
+
       setConstOffsets(llvm::SmallVector<int64_t, 4>(rank, 0));
 
       attr = getConstOffsetsAttr();
@@ -233,7 +233,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
       auto attr = getConstStridesAttr();
       if (attr)
         return attr;
-      
+
       if (llvm::isa<IntegerType>(getSourceType()))
         return emptyStrides;
 
@@ -314,15 +314,15 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
   }];
 
   let assemblyFormat = [{
-    $TensorDesc `` 
-    custom<OptionalDynamicIndexList>($offsets, $const_offsets) 
+    $TensorDesc ``
+    custom<OptionalDynamicIndexList>($offsets, $const_offsets)
     prop-dict attr-dict `:` qualified(type($TensorDesc))
   }];
 
   let builders = [
-    OpBuilder<(ins "Value": $TensorDesc, 
-                   "xegpu::CachePolicyAttr": $l1_hint, 
-                   "xegpu::CachePolicyAttr": $l2_hint, 
+    OpBuilder<(ins "Value": $TensorDesc,
+                   "xegpu::CachePolicyAttr": $l1_hint,
+                   "xegpu::CachePolicyAttr": $l2_hint,
                    "xegpu::CachePolicyAttr": $l3_hint)>
   ];
 
@@ -370,7 +370,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
 
   let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
                        Variadic<Index>: $offsets,
-                       OptionalAttr<DenseI64ArrayAttr>: $const_offsets,  
+                       OptionalAttr<DenseI64ArrayAttr>: $const_offsets,
                        OptionalAttr<UnitAttr>: $packed,
                        OptionalAttr<DenseI64ArrayAttr>: $transpose,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
@@ -390,16 +390,16 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
   }];
 
   let assemblyFormat = [{
-    $TensorDesc `` 
-    custom<OptionalDynamicIndexList>($offsets, $const_offsets) 
+    $TensorDesc ``
+    custom<OptionalDynamicIndexList>($offsets, $const_offsets)
     prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)
   }];
 
   let builders = [
-    OpBuilder<(ins "Type": $value, "Value": $TensorDesc, 
+    OpBuilder<(ins "Type": $value, "Value": $TensorDesc,
                     "UnitAttr": $packed, "DenseI64ArrayAttr": $transpose,
-                    "xegpu::CachePolicyAttr": $l1_hint, 
-                    "xegpu::CachePolicyAttr": $l2_hint, 
+                    "xegpu::CachePolicyAttr": $l1_hint,
+                    "xegpu::CachePolicyAttr": $l2_hint,
                     "xegpu::CachePolicyAttr": $l3_hint)>
   ];
 
@@ -442,7 +442,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
   let arguments = (ins XeGPU_ValueType: $value,
                        XeGPU_TensorDesc: $TensorDesc,
                        Variadic<Index>: $offsets,
-                       OptionalAttr<DenseI64ArrayAttr>: $const_offsets,  
+                       OptionalAttr<DenseI64ArrayAttr>: $const_offsets,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
@@ -458,16 +458,16 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
   }];
 
    let assemblyFormat = [{
-    $value `,` 
-    $TensorDesc `` 
-    custom<OptionalDynamicIndexList>($offsets, $const_offsets) 
+    $value `,`
+    $TensorDesc ``
+    custom<OptionalDynamicIndexList>($offsets, $const_offsets)
     prop-dict attr-dict `:`  type($value) `,` qualified(type($TensorDesc))
   }];
 
   let builders = [
-    OpBuilder<(ins "Value": $value, "Value": $TensorDesc, 
-                   "xegpu::CachePolicyAttr": $l1_hint, 
-                   "xegpu::CachePolicyAttr": $l2_hint, 
+    OpBuilder<(ins "Value": $value, "Value": $TensorDesc,
+                   "xegpu::CachePolicyAttr": $l1_hint,
+                   "xegpu::CachePolicyAttr": $l2_hint,
                    "xegpu::CachePolicyAttr": $l3_hint)>
   ];
 
@@ -635,12 +635,12 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
                              l3_hint = #xegpu.cache_hint<cached>}
         : !xegpu.tensor_desc<16xf16>
     ```
-    
+
     Example 2:
     A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
     It combines "create scattered TensorTdesc" and "prefetch with scattered TensorTdesc".
     The source operand could be a raw pointer (uint64_t).
-    Please refer to create_tdesc for the restriction of memref. 
+    Please refer to create_tdesc for the restriction of memref.
     ```mlir
       %a = memref.alloc() : memref<1024xf32>
       %0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
@@ -676,16 +676,16 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
   }];
 
   let assemblyFormat = [{
-    $source 
+    $source
     (`[` $offsets^ `]`)?
     prop-dict
-    attr-dict `:` type(operands) 
+    attr-dict `:` type(operands)
   }];
-    
+
   let builders = [
     OpBuilder<(ins "Value": $source,
-                    "xegpu::CachePolicyAttr": $l1_hint, 
-                    "xegpu::CachePolicyAttr": $l2_hint, 
+                    "xegpu::CachePolicyAttr": $l1_hint,
+                    "xegpu::CachePolicyAttr": $l2_hint,
                     "xegpu::CachePolicyAttr": $l3_hint)>
   ];
 
@@ -723,7 +723,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
           : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
             vector<16xi1> -> vector<16x8xf32>
   ```
-  
+
   Example 3 (SIMT mode):
   ```mlir
     %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
@@ -732,12 +732,12 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
           : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>
             vector<16xi1> -> vector<8xf32>
   ```
-  
+
   Example 4:
   A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
   It combines "create scattered TensorTdesc" and "load with scattered TensorTdesc".
   The source operand could be a raw pointer (uint64_t). Please refer to create_tdesc
-  for the restriction of memref. 
+  for the restriction of memref.
   ```mlir
     %a = memref.alloc() : memref<1024xf32>
     %offsets = vector.step : vector<16xindex>
@@ -794,14 +794,14 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
   let assemblyFormat = [{
     $source
     (`[` $offsets^ `]`)? `,`
-    $mask prop-dict 
+    $mask prop-dict
     attr-dict `:` type(operands) `->` type($value)
   }];
 
   let builders = [
     OpBuilder<(ins "Type": $value, "Value": $source, "Value": $mask,
-                    "xegpu::CachePolicyAttr": $l1_hint, 
-                    "xegpu::CachePolicyAttr": $l2_hint, 
+                    "xegpu::CachePolicyAttr": $l1_hint,
+                    "xegpu::CachePolicyAttr": $l2_hint,
                     "xegpu::CachePolicyAttr": $l3_hint)>
    ];
 
@@ -848,7 +848,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
   A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
   It combines "create scattered TensorTdesc" and "store with scattered TensorTdesc".
   The dest operand could be a raw pointer (uint64_t).
-  Please refer to create_tdesc for the restriction of memref. 
+  Please refer to create_tdesc for the restriction of memref.
   ```mlir
     %a = memref.alloc() : memref<1024xf32>
     %val = arith.constant dense<0.0> : vector<16xf32>
@@ -901,15 +901,15 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
     $value `,`
     $dest
     (`[` $offsets^ `]`)? `,`
-    $mask 
-    prop-dict 
+    $mask
+    prop-dict
     attr-dict `:`  type(operands)
   }];
 
   let builders = [
     OpBuilder<(ins "Value": $value, "Value": $dest, "Value": $mask,
-                    "xegpu::CachePolicyAttr": $l1_hint, 
-                    "xegpu::CachePolicyAttr": $l2_hint, 
+                    "xegpu::CachePolicyAttr": $l1_hint,
+                    "xegpu::CachePolicyAttr": $l2_hint,
                     "xegpu::CachePolicyAttr": $l3_hint)>
    ];
 
@@ -1146,4 +1146,18 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
     let hasCanonicalizer = 1;
 }
 
+def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc"> {
+  let summary = "Create a matrix descriptor.";
+  let description = [{
+    Matrices are treated as 2D units.
+    In case the ROI rank is >2, the two fastest changing dimensions
+    represent a 2D unit and other dimensions specify the multiple
+    of these units that are stacked vertically.
+    Results:
+     - `matrix_desc` : a descriptor for SLM allocation.
+  }];
+  let results = (outs XeGPU_MatrixDesc:$matrix_desc);
+  let assemblyFormat = "attr-dict `:` type($matrix_desc)";
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index b268cabb5d266..6ac126a84d39c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -201,4 +201,27 @@ def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> {
   }];
 }
 
+def XeGPU_MatrixDesc: XeGPUTypeDef<"MatrixDesc", "matrix_desc", [ShapedTypeInterface], "mlir::Type"> {
+  let summary = "MatrixDesc describing the data in SLM";
+  let description = [{
+    MatrixDesc describes the data stored in SLM. Unless specified via
+    the optional layout attribute, the data is stored in a contiguous
+    SLM region in row-major order by default.
+  }];
+  let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
+                        "mlir::Type": $elementType,
+                        OptionalParameter<"mlir::Attribute">: $layout);
+
+  let extraClassDeclaration = [{
+    // using mlir::ShapedType::Trait<MatrixDescType>::getElementTypeBitWidth;
+    // using mlir::ShapedType::Trait<MatrixDescType>::getElementTypeBitWidth;
+    // using mlir::ShapedType::Trait<MatrixDescType>::getRank;
+    // using mlir::ShapedType::Trait<MatrixDescType>::getNumElements;
+    // using mlir::ShapedType::Trait<MatrixDescType>::isDynamicDim;
+    // using mlir::ShapedType::Trait<MatrixDescType>::hasStaticShape;
+  }];
+
+  let hasCustomAssemblyFormat = true;
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 3c0ca114a62d4..50eb90dbc1df9 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -394,6 +394,62 @@ LogicalResult TensorDescType::verify(
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// XeGPU_MatrixDescType
+//===----------------------------------------------------------------------===//
+mlir::Type MatrixDescType::parse(::mlir::AsmParser &parser) {
+  llvm::SmallVector<int64_t> shape;
+  mlir::Type elementType;
+  mlir::FailureOr<mlir::Attribute> layout;
+
+  // Parse literal '<'
+  if (parser.parseLess())
+    return {};
+
+  auto shapeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseDimensionList(shape, false, true))) {
+    parser.emitError(shapeLoc, "failed to parse parameter 'shape'");
+    return {};
+  }
+
+  auto elemTypeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseType(elementType))) {
+    parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'");
+    return {};
+  }
+
+  // parse optional attributes
+  if (mlir::succeeded(parser.parseOptionalComma())) {
+    mlir::Attribute attr;
+    ParseResult res = parser.parseAttribute(attr);
+    if (mlir::failed(res))
+      return {};
+    layout = attr;
+  }
+
+  // Parse literal '>'
+  if (parser.parseGreater())
+    return {};
+
+  MLIRContext *ctxt = parser.getContext();
+  return MatrixDescType::getChecked(
+      [&]() { return parser.emitError(parser.getNameLoc()); }, ctxt, shape,
+      elementType, layout.value_or(mlir::Attribute()));
+}
+
+void MatrixDescType::print(::mlir::AsmPrinter &printer) const {
+  printer << "<";
+
+  printer.printDimensionList(getShape());
+  printer << 'x';
+  printer << getElementType();
+
+  if (auto layout = getLayout())
+    printer << ", " << layout;
+
+  printer << ">";
+}
+
 } // namespace xegpu
 } // namespace mlir
 

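For reference, the custom assembly format implemented by the `parse`/`print` methods above round-trips forms like the ones below (a minimal sketch; `strided<[1, 16]>` merely stands in for any attribute the optional layout parameter may carry):

```mlir
// Syntax: "<" shape "x" element-type ["," layout] ">", as emitted by MatrixDescType::print.
func.func @matrix_desc_type_examples(%plain : !xegpu.matrix_desc<16x64xf16>,
                                     %strided : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
  return
}
```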
From 76ccc39d6f3c599015d0d6d853cc20a4853fcb7f Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 11 Aug 2025 18:48:38 +0000
Subject: [PATCH 02/10] sync

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 87 ++++++++++++++++++-
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       | 16 ++--
 2 files changed, 92 insertions(+), 11 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 3b074a35e9cbd..59c1a432dce66 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1146,18 +1146,101 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
     let hasCanonicalizer = 1;
 }
 
-def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc"> {
+def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc", [Pure]>  {
   let summary = "Create a matrix descriptor.";
   let description = [{
     Matrices are treated as 2D units.
     In case the ROI rank is >2, the two fastest changing dimensions
     represent a 2D unit and other dimensions specify the multiple
     of these units that are stacked vertically.
+    Arguments:
+     - `source` : a base address of SLM allocation.
     Results:
      - `matrix_desc` : a descriptor for SLM allocation.
   }];
+  let arguments = (ins XeGPU_BaseAddrType:$source);
   let results = (outs XeGPU_MatrixDesc:$matrix_desc);
-  let assemblyFormat = "attr-dict `:` type($matrix_desc)";
+  let assemblyFormat = "$source prop-dict attr-dict `:` type($source) `->` type($matrix_desc)";
 }
 
+def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>]>  {
+  let arguments = (ins XeGPU_MatrixDesc:$matrix_desc,
+    Variadic<Index>: $offsets,
+    DenseI64ArrayAttr: $const_offsets,
+    OptionalAttr<XeGPU_LayoutAttr>:$layout
+  );
+  let results = (outs XeGPU_ValueType:$res);
+  let assemblyFormat = [{
+    $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
+    prop-dict attr-dict `:` functional-type(operands, results)
+  }];
+  let summary = "Load matrix from SLM.";
+  let description = [{
+    This operation loads a matrix from the SLM using the matrix descriptor.
+    There are additional parameters and attributes that support loading, but they must only
+    be specified for a work-item level operation.
+
+    General rules:
+    1. Non-WI-level code must not specify optional attributes.
+    2. If the load uses `vector` semantics, all of the vector attributes must be specified.
+    3. If the load uses `array` semantics, all of the array attributes must be specified.
+
+    Arguments:
+     - `matrix_desc` : a matrix descriptor (SLM allocation + matrix type).
+     - `offsets`     : Coordinates of the matrix to load.
+    Results:
+      - `res` : loaded matrix elements.
+  }];
+
+  let builders = [
+    // OpBuilder<(ins "Type":$res, "TypedValue<MatrixDescType>": $matrix_desc, "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutAttr": $layout)>,
+  ];
+  let extraClassDeclaration = [{
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      return getMixedValues(getConstOffsets(), getOffsets(), getContext());
+    }
+  }];
+  // let hasVerifier = 1;
+}
+
+def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix"> {
+  let arguments = (ins
+    XeGPU_MatrixDesc:$matrix_desc,
+    XeGPU_ValueType:$data,
+    Variadic<Index>: $offsets,
+    DenseI64ArrayAttr: $const_offsets,
+    OptionalAttr<XeGPU_LayoutAttr>:$layout
+  );
+  let assemblyFormat = [{
+    $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets) `,` $data
+    prop-dict attr-dict `:` type(operands)
+  }];
+  let summary = "Store matrix from SLM.";
+  let description = [{
+    This operation stores the work-item's `data` fragment of the matrix to the SLM (`matrix_desc`).
+    There are additional parameters and attributes that support storing, but they must only
+    be specified for a work-item level operation.
+
+    General rules:
+    1. Non-WI-level code must not specify optional attributes.
+    2. If the store uses `vector` semantics, all of the vector attributes must be specified.
+
+    Arguments:
+     - `matrix_desc` : a matrix descriptor.
+     - `data`        : data to be stored to the matrix.
+     - `offsets`     : Coordinates of the matrix where the data will be stored.
+  }];
+  let builders = [
+    // OpBuilder<(ins "TypedValue<MatrixDescType>": $matrix_desc,  "Value" : $data, "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutAttr": $layout)>,
+  ];
+  let extraClassDeclaration = [{
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      Builder b(getContext());
+      return getMixedValues(getConstOffsets(), getOffsets(), b);
+    }
+  }];
+  // let hasVerifier = 1;
+}
+
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 6ac126a84d39c..f578fc8bc0735 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -204,21 +204,19 @@ def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> {
 def XeGPU_MatrixDesc: XeGPUTypeDef<"MatrixDesc", "matrix_desc", [ShapedTypeInterface], "mlir::Type"> {
   let summary = "MatrixDesc describing the data in SLM";
   let description = [{
-    MatrixDesc describes the data stored in SLM. Unless specified via
-    the optional layout attribute, the data is stored in a contiguous
-    SLM region in row-major order by default.
+    MatrixDesc describes a SLM region. Unless specified via the optional layout attribute,
+    the data is stored contiguously in the region in row-major order by default.
   }];
   let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
                         "mlir::Type": $elementType,
                         OptionalParameter<"mlir::Attribute">: $layout);
 
   let extraClassDeclaration = [{
-    // using mlir::ShapedType::Trait<MatrixDescType>::getElementTypeBitWidth;
-    // using mlir::ShapedType::Trait<MatrixDescType>::getElementTypeBitWidth;
-    // using mlir::ShapedType::Trait<MatrixDescType>::getRank;
-    // using mlir::ShapedType::Trait<MatrixDescType>::getNumElements;
-    // using mlir::ShapedType::Trait<MatrixDescType>::isDynamicDim;
-    // using mlir::ShapedType::Trait<MatrixDescType>::hasStaticShape;
+    bool hasRank() const { return true; }
+
+    MatrixDescType cloneWith(std::optional<llvm::ArrayRef<int64_t>> shape, Type elementType) const {
+      return MatrixDescType::get(getContext(), shape.value_or(getShape()), elementType, getLayout());
+    }
   }];
 
   let hasCustomAssemblyFormat = true;

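Taken together, the ops added in this commit compose roughly as in the sketch below. It is written against the assembly format and operand order as exercised by the tests added later in this series (the final commit later moves `$data` before the descriptor in the printed form); the SLM buffer size, the 16x64xf16 descriptor, and the offsets are illustrative:

```mlir
func.func @slm_matrix_roundtrip(%frag: vector<16x16xf16>) {
  // Raw SLM byte buffer (address space 3) backing the descriptor.
  %slm = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
  %md = xegpu.create_matrix_desc %slm : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16>
  // Write a 16x16 fragment at column offset 16, then read it back.
  xegpu.store_matrix %md[0, 16], %frag : !xegpu.matrix_desc<16x64xf16>, vector<16x16xf16>
  %v = xegpu.load_matrix %md[0, 16] : !xegpu.matrix_desc<16x64xf16> -> vector<16x16xf16>
  return
}
```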
From cb0a195e340bac10e10b6d5cb9de0d925d39deeb Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 12 Aug 2025 18:10:33 +0000
Subject: [PATCH 03/10] add unit tests for create_matrix_desc

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 24 +++++++++++++------
 mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt      |  1 +
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 10 ++++++++
 mlir/test/Dialect/XeGPU/invalid.mlir          | 16 +++++++++++++
 mlir/test/Dialect/XeGPU/ops.mlir              | 18 ++++++++++++++
 5 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 37e4c2c811155..e4ea0b27323ec 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1101,21 +1101,31 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
     let hasCanonicalizer = 1;
 }
 
-def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc", [Pure]>  {
+def isSharedPred : CPred<"isSharedMemory(llvm::cast<mlir::MemRefType>($_self))">;
+class StaticShared1DMemRefOf<list<Type> allowedTypes> :
+  ConfinedType<MemRefRankOf<allowedTypes, [1]>, [HasStaticShapePred, isSharedPred],
+     "statically shaped " # MemRefOf<allowedTypes>.summary # " for shared memory",
+     "mlir::MemRefType">;
+
+class SizeInBits<string name> :
+  StrFunc<"llvm::cast<mlir::ShapedType>($" # name # ".getType()).getNumElements()"
+          "*llvm::cast<mlir::ShapedType>($" # name # ".getType()).getElementTypeBitWidth()">;
+class AllMemSizesMatch<list<string> names> :
+    AllMatchSameOperatorTrait<names, SizeInBits<"_self">.result,
+                              "size in bits">;
+
+def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc", [Pure,
+      AllMemSizesMatch<["source", "matrix_desc"]>]>  {
   let summary = "Create a matrix descriptor.";
   let description = [{
-    Matrices are treated as 2D units.
-    In case the ROI rank is >2, the two fastest changing dimensions
-    represent a 2D unit and other dimensions specify the multiple
-    of these units that are stacked vertically.
     Arguments:
      - `source` : a base address of SLM allocation.
     Results:
      - `matrix_desc` : a descriptor for SLM allocation.
   }];
-  let arguments = (ins XeGPU_BaseAddrType:$source);
+  let arguments = (ins StaticShared1DMemRefOf<[I8]>:$source);
   let results = (outs XeGPU_MatrixDesc:$matrix_desc);
-  let assemblyFormat = "$source prop-dict attr-dict `:` type($source) `->` type($matrix_desc)";
+  let assemblyFormat = "$source prop-dict attr-dict `` `:` type($source) `->` qualified(type($matrix_desc))";
 }
 
 def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>]>  {
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
index 7c6a4f37db9af..603fb5d237544 100644
--- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -17,6 +17,7 @@ add_mlir_dialect_library(MLIRXeGPUDialect
   MLIRAffineUtils
   MLIRArithUtils
   MLIRDialectUtils
+  MLIRGPUDialect
   MLIRIR
   MLIRViewLikeInterface
   MLIRVectorDialect
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 2cd086feb5deb..ad4d8bd6e22cd 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
@@ -21,6 +22,15 @@
 namespace mlir {
 namespace xegpu {
 
+bool isSharedMemory(const MemRefType &memrefTy) {
+  Attribute attr = memrefTy.getMemorySpace();
+  if (auto intAttr = llvm::dyn_cast<IntegerAttr>(attr))
+    return intAttr.getInt() == 3;
+  if (auto memrefSpace = llvm::dyn_cast<MemorySpaceAttr>(attr))
+    return memrefSpace.getValue() == MemorySpace::SLM;
+  return gpu::GPUDialect::isWorkgroupMemoryAddressSpace(attr);
+}
+
 template <typename T>
 static std::string makeString(T array, bool breakline = false) {
   std::string buf;
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 44e15dd7cbb38..1cd817918a772 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -762,3 +762,19 @@ func.func @slice_attr_repeat_dim() {
   return
 }
 
+// -----
+func.func @create_matrix_desc_non_slm() {
+  %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 1>
+  // expected-error@+1 {{operand #0 must be statically shaped memref of 8-bit signless integer values for shared memory}}
+  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 1> -> !xegpu.matrix_desc<16x64xf16>
+  return
+}
+
+// -----
+func.func @create_matrix_desc_mismatch_sizes() {
+  %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
+  // expected-error@+1 {{failed to verify that all of {source, matrix_desc} have same size in bits}}
+  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x32xf16>
+  return
+}
+
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 67c00f5a9cc2f..c224749031328 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -751,4 +751,22 @@ gpu.func @fence() {
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @create_matrix_desc({{.*}}) {
+gpu.func @create_matrix_desc() {
+  //CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<2048xi8, 3>
+  //CHECK: [[mdesc:%.+]] = xegpu.create_matrix_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16>
+  %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
+  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @create_matrix_desc_with_stride({{.*}}) {
+gpu.func @create_matrix_desc_with_stride() {
+  //CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<2048xi8, 3>
+  //CHECK: [[mdesc:%.+]] = xegpu.create_matrix_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
+  %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
+  %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
+  gpu.return
+}
+
 }

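The `AllMemSizesMatch<["source", "matrix_desc"]>` constraint compares total sizes in bits, so the byte count of the i8 source buffer must equal shape times element width of the descriptor. A small sketch of the arithmetic, mirroring the test above (address space `3` is one accepted form of SLM; per the `isSharedMemory` helper, the dialect's own `MemorySpaceAttr` set to SLM and the GPU workgroup address space are accepted as well):

```mlir
func.func @create_matrix_desc_size_check() {
  // 2048 x i8 = 16384 bits and 16 x 64 x f16 = 1024 x 16 = 16384 bits,
  // so source and matrix_desc have the same size in bits and the op verifies.
  %slm = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
  %md = xegpu.create_matrix_desc %slm : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16>
  return
}
```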
From 98871ccb013229593e8d169533ab3b03b136f687 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 12 Aug 2025 20:18:09 +0000
Subject: [PATCH 04/10] add unit test for load_matrix and store_matrix

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 31 ++++++-----
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 53 +++++++++++++++++++
 mlir/test/Dialect/XeGPU/invalid.mlir          | 28 ++++++++++
 mlir/test/Dialect/XeGPU/ops.mlir              | 29 ++++++++++
 4 files changed, 129 insertions(+), 12 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index e4ea0b27323ec..461df6efb8528 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1128,16 +1128,18 @@ def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc", [Pure,
   let assemblyFormat = "$source prop-dict attr-dict `` `:` type($source) `->` qualified(type($matrix_desc))";
 }
 
-def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>]>  {
+def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
+                              AllElementTypesMatch<["matrix_desc", "res"]>,
+                              AllRanksMatch<["matrix_desc", "res"]>]>  {
   let arguments = (ins XeGPU_MatrixDesc:$matrix_desc,
     Variadic<Index>: $offsets,
     DenseI64ArrayAttr: $const_offsets,
-    OptionalAttr<XeGPU_LayoutAttr>:$layout
+    OptionalAttr<LayoutTrait>:$layout
   );
   let results = (outs XeGPU_ValueType:$res);
   let assemblyFormat = [{
     $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
-    prop-dict attr-dict `:` functional-type(operands, results)
+    prop-dict attr-dict `` `:` type(operands) `->` type(results)
   }];
   let summary = "Load matrix from SLM.";
   let description = [{
@@ -1158,23 +1160,27 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>]>  {
   }];
 
   let builders = [
-    // OpBuilder<(ins "Type":$res, "TypedValue<MatrixDescType>": $matrix_desc, "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutAttr": $layout)>,
+    OpBuilder<(ins "Type":$res, "TypedValue<MatrixDescType>": $matrix_desc,
+                    "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutTrait": $layout)>,
   ];
   let extraClassDeclaration = [{
     SmallVector<OpFoldResult> getMixedOffsets() {
       return getMixedValues(getConstOffsets(), getOffsets(), getContext());
     }
   }];
-  // let hasVerifier = 1;
+
+  let hasVerifier = 1;
 }
 
-def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix"> {
+def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
+                              AllElementTypesMatch<["matrix_desc", "data"]>,
+                              AllRanksMatch<["matrix_desc", "data"]>]> {
   let arguments = (ins
     XeGPU_MatrixDesc:$matrix_desc,
-    XeGPU_ValueType:$data,
     Variadic<Index>: $offsets,
     DenseI64ArrayAttr: $const_offsets,
-    OptionalAttr<XeGPU_LayoutAttr>:$layout
+    XeGPU_ValueType:$data,
+    OptionalAttr<LayoutTrait>:$layout
   );
   let assemblyFormat = [{
     $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets) `,` $data
@@ -1196,15 +1202,16 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix"> {
      - `offsets`     : Coordinates of the matrix where the data will be stored.
   }];
   let builders = [
-    // OpBuilder<(ins "TypedValue<MatrixDescType>": $matrix_desc,  "Value" : $data, "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutAttr": $layout)>,
+    OpBuilder<(ins "TypedValue<MatrixDescType>": $matrix_desc, "llvm::ArrayRef<OpFoldResult>": $offsets,
+                   "Value" : $data, "LayoutTrait": $layout)>,
   ];
   let extraClassDeclaration = [{
     SmallVector<OpFoldResult> getMixedOffsets() {
-      Builder b(getContext());
-      return getMixedValues(getConstOffsets(), getOffsets(), b);
+      return getMixedValues(getConstOffsets(), getOffsets(), getContext());
     }
   }];
-  // let hasVerifier = 1;
+
+  let hasVerifier = 1;
 }
 
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index ad4d8bd6e22cd..2051d7030340e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -935,6 +935,59 @@ void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<FoldConvertLayoutOp>(context);
 }
 
+//===----------------------------------------------------------------------===//
+// XeGPU_LoadMatrixOp
+//===----------------------------------------------------------------------===//
+void LoadMatrixOp::build(OpBuilder &builder, OperationState &state, Type res,
+                         TypedValue<MatrixDescType> matrixDesc,
+                         llvm::ArrayRef<OpFoldResult> offsets,
+                         LayoutTrait layout) {
+  llvm::SmallVector<Value> dynamicOffsets;
+  llvm::SmallVector<int64_t> staticOffsets;
+
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+  auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
+
+  build(builder, state, res, matrixDesc, dynamicOffsets, staticOffsetsAttr,
+        layout);
+}
+
+LogicalResult LoadMatrixOp::verify() {
+  ArrayRef<int64_t> valueShape = getRes().getType().getShape();
+  ArrayRef<int64_t> mdescShape = getMatrixDesc().getType().getShape();
+  if (llvm::any_of(llvm::zip_equal(valueShape, mdescShape),
+                   [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
+    return emitOpError("result shape must not exceed matrix desc shape.");
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_StoreMatrixOp
+//===----------------------------------------------------------------------===//
+void StoreMatrixOp::build(OpBuilder &builder, OperationState &state,
+                          TypedValue<MatrixDescType> matrixDesc,
+                          llvm::ArrayRef<OpFoldResult> offsets, Value data,
+                          LayoutTrait layout) {
+  llvm::SmallVector<Value> dynamicOffsets;
+  llvm::SmallVector<int64_t> staticOffsets;
+
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+  auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
+
+  build(builder, state, matrixDesc, dynamicOffsets, staticOffsetsAttr, data,
+        layout);
+}
+
+LogicalResult StoreMatrixOp::verify() {
+  ArrayRef<int64_t> dataShape = getData().getType().getShape();
+  ArrayRef<int64_t> mdescShape = getMatrixDesc().getType().getShape();
+  if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape),
+                   [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
+    return emitOpError("data shape must not exceed matrix desc shape.");
+
+  return success();
+}
+
 } // namespace xegpu
 } // namespace mlir
 
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 1cd817918a772..2feb010d343a8 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -778,3 +778,31 @@ func.func @create_matrix_desc_mismatch_sizes() {
   return
 }
 
+// -----
+func.func @load_matrix_desc_mismatch_element_type(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  // expected-error@+1 {{failed to verify that all of {matrix_desc, res} have same element type}}
+  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf32>
+  return
+}
+
+// -----
+func.func @load_matrix_desc_invalid_result_size(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  // expected-error@+1 {{result shape must not exceed matrix desc shape}}
+  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> vector<32x16xf16>
+  return
+}
+
+// -----
+func.func @store_matrix_desc_mismatch_element_type(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<16x16xf32>) {
+  // expected-error@+1 {{failed to verify that all of {matrix_desc, data} have same element type}}
+  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<16x16xf32>
+  return
+}
+
+// -----
+func.func @store_matrix_desc_invalid_data_size(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<32x32xf16>) {
+  // expected-error@+1 {{data shape must not exceed matrix desc shape}}
+  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<32x32xf16>
+  return
+}
+
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index c224749031328..cda8f0ac1bb40 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -769,4 +769,33 @@ gpu.func @create_matrix_desc_with_stride() {
   gpu.return
 }
 
+// CHECK: gpu.func @load_matrix_desc([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>)
+gpu.func @load_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf16>
+  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf16>
+  gpu.return
+}
+
+// CHECK: gpu.func @load_matrix_desc_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>)
+gpu.func @load_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
+  // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> vector<8x16xf16>
+  %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> vector<8x16xf16>
+  gpu.return
+}
+
+
+// CHECK: gpu.func @store_matrix_desc([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>, [[ARG1:%.+]]: vector<16x16xf16>)
+gpu.func @store_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<16x16xf16>) {
+  // CHECK: xegpu.store_matrix [[ARG0]][8, 8], [[ARG1]] : !xegpu.matrix_desc<16x64xf16>, vector<16x16xf16>
+  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<16x16xf16>
+  gpu.return
+}
+
+// CHECK: gpu.func @store_matrix_desc_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
+gpu.func @store_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, %arg1: vector<16x16xf16>) {
+  // CHECK: xegpu.store_matrix [[ARG0]][8, 8], [[ARG1]] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, vector<16x16xf16>
+  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, vector<16x16xf16>
+  gpu.return
+}
+
 }

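The new verifiers only require the loaded or stored shape to fit inside the descriptor shape dimension by dimension, so one descriptor can be tiled into several smaller accesses. A sketch under that rule (tile sizes and offsets are illustrative and stay within the 16x64 region):

```mlir
func.func @tile_loads(%md: !xegpu.matrix_desc<16x64xf16>) {
  // Four 8x16 tiles out of the first 8 rows; each shape fits inside 16x64, so all verify.
  %t0 = xegpu.load_matrix %md[0, 0]  : !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf16>
  %t1 = xegpu.load_matrix %md[0, 16] : !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf16>
  %t2 = xegpu.load_matrix %md[0, 32] : !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf16>
  %t3 = xegpu.load_matrix %md[0, 48] : !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf16>
  return
}
```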
From 06eec6e51b755cbb13b62cfaa3ba2320e8bc3cb6 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 12 Aug 2025 20:33:56 +0000
Subject: [PATCH 05/10] refine description

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 42 ++++++++-----------
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |  5 ++-
 2 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 461df6efb8528..f536650e9d872 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1118,10 +1118,14 @@ def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc", [Pure,
       AllMemSizesMatch<["source", "matrix_desc"]>]>  {
   let summary = "Create a matrix descriptor.";
   let description = [{
+    Creates a matrix descriptor from a shared local memory (SLM) buffer.
+    The resulting matrix descriptor has to have the same size as the underlying
+    shared local memory.
+
     Arguments:
-     - `source` : a base address of SLM allocation.
+     - `source` : a 1D statically shaped memref with element type i8, representing the raw SLM buffer.
     Results:
-     - `matrix_desc` : a descriptor for SLM allocation.
+     - `matrix_desc` : the matrix descriptor.
   }];
   let arguments = (ins StaticShared1DMemRefOf<[I8]>:$source);
   let results = (outs XeGPU_MatrixDesc:$matrix_desc);
@@ -1141,22 +1145,16 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
     $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
     prop-dict attr-dict `` `:` type(operands) `->` type(results)
   }];
-  let summary = "Load matrix from SLM.";
-  let description = [{
-    This operation loads a matrix from the SLM using the matrix descriptor.
-    There are additional parameters and attributes that support loading, but they must only
-    be specified for a work-item level operation.
 
-    General rules:
-    1. Non-WI-level code must not specify optional attributes.
-    2. If the load uses `vector` semantics, all of the vector attributes must be specified.
-    3. If the load uses `array` semantics, all of the array attributes must be specified.
+  let description = [{
+    This operation reads a block of data from shared local memory (SLM)
+    using the provided matrix descriptor.
 
     Arguments:
-     - `matrix_desc` : a matrix descriptor (SLM allocation + matrix type).
-     - `offsets`     : Coordinates of the matrix to load.
+     - `matrix_desc`: the matrix descriptor identifying the SLM region.
+     - `offsets`: the coordinates within the matrix to read from.
     Results:
-      - `res` : loaded matrix elements.
+     - `res`: the matrix elements loaded from SLM.
   }];
 
   let builders = [
@@ -1186,20 +1184,14 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
     $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets) `,` $data
     prop-dict attr-dict `:` type(operands)
   }];
-  let summary = "Store matrix from SLM.";
   let description = [{
-    This operation stores the work-item's `data` fragment of the matrix to the SLM (`matrix_desc`).
-    There are additional parameters and attributes that support storing, but they must only
-    be specified for a work-item level operation.
-
-    General rules:
-    1. Non-WI-level code must not specify optional attributes.
-    2. If the store uses `vector` semantics, all of the vector attributes must be specified.
+    This operation writes the `data` fragment into the shared local memory region
+    identified by `matrix_desc`.
 
     Arguments:
-     - `matrix_desc` : a matrix descriptor.
-     - `data`        : data to be stored to the matrix.
-     - `offsets`     : Coordinates of the matrix where the data will be stored.
+     - `matrix_desc`: the matrix descriptor specifying the SLM region.
+     - `offsets`: the coordinates within the matrix where the data will be written.
+     - `data`: the values to be stored in the matrix.
   }];
   let builders = [
     OpBuilder<(ins "TypedValue<MatrixDescType>": $matrix_desc, "llvm::ArrayRef<OpFoldResult>": $offsets,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index f578fc8bc0735..02cabce82398b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -204,8 +204,9 @@ def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> {
 def XeGPU_MatrixDesc: XeGPUTypeDef<"MatrixDesc", "matrix_desc", [ShapedTypeInterface], "mlir::Type"> {
   let summary = "MatrixDesc describing the data in SLM";
   let description = [{
-    MatrixDesc describes a SLM region. Unless specified via the optional layout attribute,
-    the data is stored contiguously in the region in row-major order by default.
+    MatrixDesc represents a block of data stored in shared local memory.
+    By default, unless a layout attribute is provided, the data is stored
+    contiguously in row-major order within the region.
   }];
   let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
                         "mlir::Type": $elementType,

From 6df4291c7fcecccc233f0b9ffea67e5edaef5d9b Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 13 Aug 2025 00:02:35 +0000
Subject: [PATCH 06/10] add subview op

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 31 ++++++++++++++++++
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 32 ++++++++++++++++---
 mlir/test/Dialect/XeGPU/invalid.mlir          | 20 ++++++++++++
 mlir/test/Dialect/XeGPU/ops.mlir              | 14 ++++++++
 4 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index f536650e9d872..0c8980bb04b2e 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1206,5 +1206,36 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
   let hasVerifier = 1;
 }
 
+def XeGPU_MatrixDescSubviewOp: XeGPU_Op<"matrix_desc_subview", [Pure, ViewLikeOpInterface,
+                                                                AllElementTypesMatch<["src", "res"]>,
+                                                                AllRanksMatch<["src", "res"]>]> {
+  let description = [{
+    Create a subview of a matrix descriptor.
+    Arguments:
+     - `src` : a matrix descriptor.
+     - `offsets` : the coordinates within the matrix the subview will be created from.
+  }];
+  let arguments = (ins XeGPU_MatrixDesc:$src,
+                       Variadic<Index>:$offsets,
+                       DenseI64ArrayAttr:$const_offsets,
+                       OptionalAttr<LayoutTrait>: $layout);
+  let results = (outs XeGPU_MatrixDesc:$res);
+  let assemblyFormat = [{$src `` custom<DynamicIndexList>($offsets, $const_offsets) prop-dict
+                         attr-dict `` `:` qualified(type($src)) `->` qualified(type($res))}];
+  let builders = [
+    OpBuilder<(ins "Type": $res, "Value":$src, "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutTrait": $layout)>
+  ];
+
+  let extraClassDeclaration = [{
+    mlir::Value getViewSource() { return getSrc(); }
+
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      return getMixedValues(getConstOffsets(), getOffsets(), getContext());
+    }
+  }];
+
+  let hasVerifier = 1;
+}
+
 
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 2051d7030340e..a8ec058a12a93 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -944,10 +944,8 @@ void LoadMatrixOp::build(OpBuilder &builder, OperationState &state, Type res,
                          LayoutTrait layout) {
   llvm::SmallVector<Value> dynamicOffsets;
   llvm::SmallVector<int64_t> staticOffsets;
-
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
   auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
-
   build(builder, state, res, matrixDesc, dynamicOffsets, staticOffsetsAttr,
         layout);
 }
@@ -970,10 +968,8 @@ void StoreMatrixOp::build(OpBuilder &builder, OperationState &state,
                           LayoutTrait layout) {
   llvm::SmallVector<Value> dynamicOffsets;
   llvm::SmallVector<int64_t> staticOffsets;
-
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
   auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
-
   build(builder, state, matrixDesc, dynamicOffsets, staticOffsetsAttr, data,
         layout);
 }
@@ -988,6 +984,34 @@ LogicalResult StoreMatrixOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// XeGPU_MatrixDescSubviewOp
+//===----------------------------------------------------------------------===//
+
+void MatrixDescSubviewOp::build(OpBuilder &builder, OperationState &state,
+                                Type resTy, Value src,
+                                llvm::ArrayRef<OpFoldResult> offsets,
+                                LayoutTrait layout) {
+  llvm::SmallVector<Value> dynamicOffsets;
+  llvm::SmallVector<int64_t> staticOffsets;
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+  auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
+  build(builder, state, resTy, src, dynamicOffsets, staticOffsetsAttr, layout);
+}
+
+LogicalResult MatrixDescSubviewOp::verify() {
+  ArrayRef<int64_t> srcShape = getSrc().getType().getShape();
+  ArrayRef<int64_t> resShape = getRes().getType().getShape();
+  if (llvm::any_of(llvm::zip_equal(resShape, srcShape),
+                   [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
+    return emitOpError("result shape must not exceed source shape.");
+
+  if (getSrc().getType().getLayout() != getRes().getType().getLayout())
+    return emitOpError("result must inherit the source layout.");
+
+  return success();
+}
+
 } // namespace xegpu
 } // namespace mlir
 
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 2feb010d343a8..63945dab1ccc2 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -806,3 +806,23 @@ func.func @store_matrix_desc_invalid_data_size(%arg0: !xegpu.matrix_desc<16x64xf
   return
 }
 
+// -----
+func.func @matrix_desc_subview_size_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  // expected-error@+1 {{result shape must not exceed source shape}}
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<32x16xf16>
+  return
+}
+
+// -----
+func.func @matrix_desc_subview_layout_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
+  // expected-error@+1 {{result must inherit the source layout}}
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> !xegpu.matrix_desc<8x16xf16>
+  return
+}
+
+// -----
+func.func @matrix_desc_subview_element_type_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  // expected-error@+1 {{failed to verify that all of {src, res} have same element type}}
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf32>
+  return
+}
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index cda8f0ac1bb40..7bceda70dea9f 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -798,4 +798,18 @@ gpu.func @store_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, str
   gpu.return
 }
 
+// CHECK: gpu.func @matrix_desc_subview([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>)
+gpu.func @matrix_desc_subview(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf16>
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf16>
+  gpu.return
+}
+
+// CHECK: gpu.func @matrix_desc_subview_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>)
+gpu.func @matrix_desc_subview_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
+  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> !xegpu.matrix_desc<8x16xf16, strided<[1, 16]>>
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> !xegpu.matrix_desc<8x16xf16, strided<[1, 16]>>
+  gpu.return
+}
+
 }

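Because the subview result must keep the source layout and stay within the source shape, a natural pattern is to carve a tile-sized view out of a larger descriptor and feed it to load_matrix or store_matrix. A sketch combining the two (sizes illustrative, layout inherited from the source):

```mlir
func.func @subview_then_load(%md: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
  // 8x16 view starting at row 8, column 16; it inherits the strided layout.
  %sv = xegpu.matrix_desc_subview %md[8, 16]
      : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> !xegpu.matrix_desc<8x16xf16, strided<[1, 16]>>
  %v = xegpu.load_matrix %sv[0, 0]
      : !xegpu.matrix_desc<8x16xf16, strided<[1, 16]>> -> vector<8x16xf16>
  return
}
```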
From e11c88db66366d3c61b158959f5418230ce2abbb Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 13 Aug 2025 13:57:59 +0000
Subject: [PATCH 07/10] address comments

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 6 ++++++
 mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt       | 1 +
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp         | 3 +++
 3 files changed, 10 insertions(+)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 0c8980bb04b2e..6d06464e204a6 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1153,6 +1153,9 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
     Arguments:
      - `matrix_desc`: the matrix descriptor identifying the SLM region.
      - `offsets`: the coordinates within the matrix to read from.
+     - `layout`: [optional] An attribute for guiding distributions among
+                 subgroups and/or work-items. It currently can accept either
+                 LayoutAttr or SliceAttr.
     Results:
      - `res`: the matrix elements loaded from SLM.
   }];
@@ -1192,6 +1195,9 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
      - `matrix_desc`: the matrix descriptor specifying the SLM region.
      - `offsets`: the coordinates within the matrix where the data will be written.
      - `data`: the values to be stored in the matrix.
+     - `layout`: [optional] An attribute for guiding distributions among
+                 subgroups and/or work-items. It currently can accept either
+                 LayoutAttr or SliceAttr.
   }];
   let builders = [
     OpBuilder<(ins "TypedValue<MatrixDescType>": $matrix_desc, "llvm::ArrayRef<OpFoldResult>": $offsets,
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
index 603fb5d237544..7869a28dfed57 100644
--- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -18,6 +18,7 @@ add_mlir_dialect_library(MLIRXeGPUDialect
   MLIRArithUtils
   MLIRDialectUtils
   MLIRGPUDialect
+  MLIRXeVMDialect
   MLIRIR
   MLIRViewLikeInterface
   MLIRVectorDialect
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index a8ec058a12a93..1157f21230485 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -8,6 +8,7 @@
 
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
@@ -28,6 +29,8 @@ bool isSharedMemory(const MemRefType &memrefTy) {
     return intAttr.getInt() == 3;
   if (auto memrefSpace = llvm::dyn_cast<MemorySpaceAttr>(attr))
     return memrefSpace.getValue() == MemorySpace::SLM;
+  if (auto xevmSpace = llvm::dyn_cast<xevm::AddrSpaceAttr>(attr))
+    return xevmSpace.getValue() == xevm::AddrSpace::SHARED;
   return gpu::GPUDialect::isWorkgroupMemoryAddressSpace(attr);
 }
 

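The optional `layout` documented in this commit would show up in the op's property dictionary. A hypothetical sketch, assuming the `#xegpu.layout` attribute syntax used elsewhere in the dialect (the `sg_layout`/`sg_data` fields and their values are illustrative; a SliceAttr would be accepted in the same position):

```mlir
func.func @load_matrix_with_layout(%md: !xegpu.matrix_desc<32x64xf16>) {
  // The layout only guides later distribution among subgroups/work-items;
  // the op itself still produces the full 32x64 block here.
  %v = xegpu.load_matrix %md[0, 0] <{layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16]>}>
      : !xegpu.matrix_desc<32x64xf16> -> vector<32x64xf16>
  return
}
```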
From 23380a923cd2c2073a66fd31b70c3650869dcf3b Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 13 Aug 2025 14:30:21 +0000
Subject: [PATCH 08/10] update doc

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 6d06464e204a6..112a18f0705ab 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1220,6 +1220,9 @@ def XeGPU_MatrixDescSubviewOp: XeGPU_Op<"matrix_desc_subview", [Pure, ViewLikeOp
     Arguments:
      - `src` : a matrix descriptor.
      - `offsets` : the coordinates within the matrix the subview will be created from.
+     - `layout`: [optional] An attribute for guiding distributions among
+                 subgroups and/or work-items. It currently can accept either
+                 LayoutAttr or SliceAttr.
   }];
   let arguments = (ins XeGPU_MatrixDesc:$src,
                        Variadic<Index>:$offsets,

From 9e3aa8d6631fe177fd17bfdb9fd48da2ef1d5072 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 13 Aug 2025 21:25:18 +0000
Subject: [PATCH 09/10] remove the layout attribute from the subview op

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 8 ++------
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp         | 5 ++---
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 112a18f0705ab..9ae2eb0c2e178 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1220,19 +1220,15 @@ def XeGPU_MatrixDescSubviewOp: XeGPU_Op<"matrix_desc_subview", [Pure, ViewLikeOp
     Arguments:
      - `src` : a matrix descriptor.
      - `offsets` : the coordinates within the matrix the subview will be created from.
-     - `layout`: [optional] An attribute for guiding distributions among
-                 subgroups and/or work-items. It currently can accept either
-                 LayoutAttr or SliceAttr.
   }];
   let arguments = (ins XeGPU_MatrixDesc:$src,
                        Variadic<Index>:$offsets,
-                       DenseI64ArrayAttr:$const_offsets,
-                       OptionalAttr<LayoutTrait>: $layout);
+                       DenseI64ArrayAttr:$const_offsets);
   let results = (outs XeGPU_MatrixDesc:$res);
   let assemblyFormat = [{$src `` custom<DynamicIndexList>($offsets, $const_offsets) prop-dict
                          attr-dict `` `:` qualified(type($src)) `->` qualified(type($res))}];
   let builders = [
-    OpBuilder<(ins "Type": $res, "Value":$src, "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutTrait": $layout)>
+    OpBuilder<(ins "Type": $res, "Value":$src, "llvm::ArrayRef<OpFoldResult>": $offsets)>
   ];
 
   let extraClassDeclaration = [{
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 1157f21230485..27fd6797fed39 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -993,13 +993,12 @@ LogicalResult StoreMatrixOp::verify() {
 
 void MatrixDescSubviewOp::build(OpBuilder &builder, OperationState &state,
                                 Type resTy, Value src,
-                                llvm::ArrayRef<OpFoldResult> offsets,
-                                LayoutTrait layout) {
+                                llvm::ArrayRef<OpFoldResult> offsets) {
   llvm::SmallVector<Value> dynamicOffsets;
   llvm::SmallVector<int64_t> staticOffsets;
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
   auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
-  build(builder, state, resTy, src, dynamicOffsets, staticOffsetsAttr, layout);
+  build(builder, state, resTy, src, dynamicOffsets, staticOffsetsAttr);
 }
 
 LogicalResult MatrixDescSubviewOp::verify() {

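With the layout attribute gone, a matrix_desc_subview is written with just the source and offsets; the optional layout stays on xegpu.store_matrix (and its load counterpart). A minimal usage sketch with illustrative names, matching the existing tests:

  gpu.func @subview_sketch(%m: !xegpu.matrix_desc<16x64xf16>) {
    // No layout attribute on the subview itself.
    %s = xegpu.matrix_desc_subview %m[8, 8]
         : !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf16>
    gpu.return
  }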
>From af2c25f457f4a94a0e304196040c0484718d54ca Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Thu, 14 Aug 2025 00:06:01 +0000
Subject: [PATCH 10/10] refine subview op

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 27 ++++++++++---------
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 23 ++++++++++------
 mlir/test/Dialect/XeGPU/invalid.mlir          | 14 +++++++---
 mlir/test/Dialect/XeGPU/ops.mlir              | 15 ++++++++---
 4 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 9ae2eb0c2e178..65f805d1efa93 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1177,16 +1177,14 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
                               AllElementTypesMatch<["matrix_desc", "data"]>,
                               AllRanksMatch<["matrix_desc", "data"]>]> {
   let arguments = (ins
+    XeGPU_ValueType:$data,
     XeGPU_MatrixDesc:$matrix_desc,
     Variadic<Index>: $offsets,
     DenseI64ArrayAttr: $const_offsets,
-    XeGPU_ValueType:$data,
     OptionalAttr<LayoutTrait>:$layout
   );
-  let assemblyFormat = [{
-    $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets) `,` $data
-    prop-dict attr-dict `:` type(operands)
-  }];
+  let assemblyFormat = [{ $data `,` $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
+                          prop-dict attr-dict `` `:` type(operands)}];
   let description = [{
     This operation writes the `data` fragment into the shared local memory region
     identified by `matrix_desc`.
@@ -1200,8 +1198,8 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
                  LayoutAttr or SliceAttr.
   }];
   let builders = [
-    OpBuilder<(ins "TypedValue<MatrixDescType>": $matrix_desc, "llvm::ArrayRef<OpFoldResult>": $offsets,
-                   "Value" : $data, "LayoutTrait": $layout)>,
+    OpBuilder<(ins "Value" : $data, "TypedValue<MatrixDescType>": $matrix_desc,
+                   "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutTrait": $layout)>,
   ];
   let extraClassDeclaration = [{
     SmallVector<OpFoldResult> getMixedOffsets() {
@@ -1212,14 +1210,19 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
   let hasVerifier = 1;
 }
 
-def XeGPU_MatrixDescSubviewOp: XeGPU_Op<"matrix_desc_subview", [Pure, ViewLikeOpInterface,
-                                                                AllElementTypesMatch<["src", "res"]>,
-                                                                AllRanksMatch<["src", "res"]>]> {
+def XeGPU_MatrixDescSubviewOp: XeGPU_Op<"matrix_desc_subview",
+          [Pure, ViewLikeOpInterface, AllElementTypesMatch<["src", "res"]>]> {
   let description = [{
-    Create a subview of a matrix descriptor.
-    Results:
+    Creates a subview of a matrix descriptor. The result may have a lower rank
+    than the source; its dimensions then map to the source's trailing dimensions.
+
+    Arguments:
      - `src` : a matrix descriptor.
      - `offsets` : the coordinates within the matrix the subview will be created from.
+
+    Results:
+     - `res` : a matrix descriptor whose shape must not exceed the
+               corresponding trailing dimensions of the source.
   }];
   let arguments = (ins XeGPU_MatrixDesc:$src,
                        Variadic<Index>:$offsets,
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 27fd6797fed39..27a652663190d 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -965,15 +965,15 @@ LogicalResult LoadMatrixOp::verify() {
 //===----------------------------------------------------------------------===//
 // XeGPU_StoreMatrixOp
 //===----------------------------------------------------------------------===//
-void StoreMatrixOp::build(OpBuilder &builder, OperationState &state,
+void StoreMatrixOp::build(OpBuilder &builder, OperationState &state, Value data,
                           TypedValue<MatrixDescType> matrixDesc,
-                          llvm::ArrayRef<OpFoldResult> offsets, Value data,
+                          llvm::ArrayRef<OpFoldResult> offsets,
                           LayoutTrait layout) {
   llvm::SmallVector<Value> dynamicOffsets;
   llvm::SmallVector<int64_t> staticOffsets;
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
   auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
-  build(builder, state, matrixDesc, dynamicOffsets, staticOffsetsAttr, data,
+  build(builder, state, data, matrixDesc, dynamicOffsets, staticOffsetsAttr,
         layout);
 }
 
@@ -1002,13 +1002,20 @@ void MatrixDescSubviewOp::build(OpBuilder &builder, OperationState &state,
 }
 
 LogicalResult MatrixDescSubviewOp::verify() {
-  ArrayRef<int64_t> srcShape = getSrc().getType().getShape();
-  ArrayRef<int64_t> resShape = getRes().getType().getShape();
-  if (llvm::any_of(llvm::zip_equal(resShape, srcShape),
-                   [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
+  MatrixDescType srcTy = getSrc().getType();
+  MatrixDescType resTy = getRes().getType();
+  ArrayRef<int64_t> srcShape = srcTy.getShape();
+  ArrayRef<int64_t> resShape = resTy.getShape();
+
+  if (srcTy.getRank() < resTy.getRank())
+    return emitOpError("result rank must not exceed source rank.");
+
+  if (llvm::any_of(
+          llvm::zip_equal(resShape, srcShape.take_back(resShape.size())),
+          [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
     return emitOpError("result shape must not exceed source shape.");
 
-  if (getSrc().getType().getLayout() != getRes().getType().getLayout())
+  if (srcTy.getLayout() != resTy.getLayout())
     return emitOpError("result must inherit the source layout.");
 
   return success();
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 63945dab1ccc2..f2df1a3920e23 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -795,14 +795,14 @@ func.func @load_matrix_desc_invalid_result_size(%arg0: !xegpu.matrix_desc<16x64x
 // -----
 func.func @store_matrix_desc_mismatch_element_type(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<16x16xf32>) {
   // expected-error at +1 {{failed to verify that all of {matrix_desc, data} have same element type}}
-  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<16x16xf32>
+  xegpu.store_matrix %arg1, %arg0[8, 8] : vector<16x16xf32>, !xegpu.matrix_desc<16x64xf16>
   return
 }
 
 // -----
 func.func @store_matrix_desc_invalid_data_size(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<32x32xf16>) {
   // expected-error at +1 {{data shape must not exceed matrix desc shape}}
-  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<32x32xf16>
+  xegpu.store_matrix %arg1, %arg0[8, 8] : vector<32x32xf16>, !xegpu.matrix_desc<16x64xf16>
   return
 }
 
@@ -821,8 +821,16 @@ func.func @matrix_desc_subview_layout_mismatch(%arg0: !xegpu.matrix_desc<16x64xf
 }
 
 // -----
-func.func @matrix_desc_subview_rank_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+func.func @matrix_desc_subview_element_type_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
   // expected-error at +1 {{failed to verify that all of {src, res} have same element type}}
   %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf32>
   return
 }
+
+// -----
+func.func @matrix_desc_subview_rank_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  // expected-error at +1 {{result rank must not exceed source rank}}
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<4x8x16xf16>
+  return
+}
+
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 7bceda70dea9f..7a9657587070a 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -786,15 +786,15 @@ gpu.func @load_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, stri
 
 // CHECK: gpu.func @store_matrix_desc([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>, [[ARG1:%.+]]: vector<16x16xf16>)
 gpu.func @store_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<16x16xf16>) {
-  // CHECK: xegpu.store_matrix [[ARG0]][8, 8], [[ARG1]] : !xegpu.matrix_desc<16x64xf16>, vector<16x16xf16>
-  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<16x16xf16>
+  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16>
+  xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16>
   gpu.return
 }
 
 // CHECK: gpu.func @store_matrix_desc_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
 gpu.func @store_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, %arg1: vector<16x16xf16>) {
-  // CHECK: xegpu.store_matrix [[ARG0]][8, 8], [[ARG1]] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, vector<16x16xf16>
-  xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, vector<16x16xf16>
+  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
+  xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
   gpu.return
 }
 
@@ -805,6 +805,13 @@ gpu.func @matrix_desc_subview(%arg0: !xegpu.matrix_desc<16x64xf16>) {
   gpu.return
 }
 
+// CHECK: gpu.func @matrix_desc_subview_lower_rank([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>)
+gpu.func @matrix_desc_subview_lower_rank(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+  //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<16xf16>
+  %data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<16xf16>
+  gpu.return
+}
+
 // CHECK: gpu.func @matrix_desc_subview_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>)
 gpu.func @matrix_desc_subview_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
   //CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> !xegpu.matrix_desc<8x16xf16, strided<[1, 16]>>


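To summarize the final form, here is a sketch assembled from the updated tests (illustrative names): data now comes first in store_matrix, and a subview may drop leading dimensions, in which case the verifier checks the result shape against the trailing source dimensions (here 16 <= 64):

  gpu.func @final_form(%m: !xegpu.matrix_desc<16x64xf16>, %v: vector<16x16xf16>) {
    // New operand order: data first, then the matrix descriptor.
    xegpu.store_matrix %v, %m[8, 8] : vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16>
    // Rank-reducing subview: the 1-D result is checked against the trailing source dim.
    %row = xegpu.matrix_desc_subview %m[8, 8]
           : !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<16xf16>
    gpu.return
  }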
