[Mlir-commits] [mlir] [MLIR][XeGPU] Adding XeGPU 2d block operators (PR #84692)

Chao Chen llvmlistbot at llvm.org
Fri Mar 15 12:23:32 PDT 2024


https://github.com/chencha3 updated https://github.com/llvm/llvm-project/pull/84692

From c93bdcfef59e9d2cb9d2e4fca119b77cf11824dc Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 8 Mar 2024 22:58:14 +0000
Subject: [PATCH 01/19] add TensorDescType, TensorDescAttr, and createNdDescOp.

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h    |   4 +
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  78 ++++
 .../mlir/Dialect/XeGPU/IR/XeGPUDialect.td     |   4 +-
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 135 +++++++
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       | 103 ++++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  72 +++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 380 +++++++++++++++++-
 7 files changed, 769 insertions(+), 7 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 7aaa4ecc7ee77a..8dc3ff78d25ede 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -9,7 +9,11 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_H
 #define MLIR_DIALECT_XEGPU_IR_XEGPU_H
 
+#include <mlir/Bytecode/BytecodeOpInterface.h>
+#include <mlir/IR/BuiltinTypes.h>
 #include <mlir/IR/Dialect.h>
+#include <mlir/Interfaces/ShapedOpInterfaces.h>
+#include <mlir/Interfaces/SideEffectInterfaces.h>
 
 namespace mlir {
 namespace xegpu {
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index bb325c272e3324..6e4c1bce6d0d59 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -10,6 +10,7 @@
 #define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
 
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/EnumAttr.td"
 
 class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
                 string baseCppClass = "::mlir::Attribute">
@@ -17,4 +18,81 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
   let mnemonic = attrMnemonic;
 }
 
+def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
+  let parameters = (ins
+    OptionalParameter<"MemoryScopeKindAttr">: $memory_scope,
+    OptionalParameter<"IntegerAttr", "1">: $array_length,
+    OptionalParameter<"BoolAttr", "true">: $boundary_check
+  );
+
+  let builders = [
+    AttrBuilder<(ins
+      CArg<"xegpu::MemoryScopeKind", "xegpu::MemoryScopeKind::Global">:$memory_scope,
+      CArg<"int", "1">:$array_length,
+      CArg<"bool", "true">: $boundary_check
+    )>
+  ];
+
+  let assemblyFormat = "`<` struct(params) `>`";
+}
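+
+// A sketch of the resulting syntax (the values shown are illustrative):
+//   #xegpu.tdesc_attr<memory_scope = slm, array_length = 2>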
+
+//===----------------------------------------------------------------------===//
+// XeGPU Memory Scope Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
+def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
+def XeGPU_MemoryScopeKind: I32EnumAttr<"MemoryScopeKind", 
+      "The address space of the memory the tensor descritor is created for", 
+      [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_MemoryScopeAttr: 
+  EnumAttr<XeGPU_Dialect, XeGPU_MemoryScopeKind, "memory_scope"> {
+    let assemblyFormat = "`<` $value `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Operator Mode Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_OpModeSIMT : I32EnumAttrCase<"SIMT", 0, "simt">;
+def XeGPU_OpModeVectorCompute : I32EnumAttrCase<"VectorCompute", 1, "vc">;
+def XeGPU_ModeKind : I32EnumAttr<"ModeKind", 
+             "The Mode an operator runs on", 
+  [XeGPU_OpModeSIMT, XeGPU_OpModeVectorCompute]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_ModeAttr: 
+  EnumAttr<XeGPU_Dialect, XeGPU_ModeKind, "mode"> {
+    let assemblyFormat = "`<` $value `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Cache Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_CacheKindCached:        I32EnumAttrCase<"CACHED", 0, "cached">;                    // valid for read and write
+def XeGPU_CacheKindUncached:      I32EnumAttrCase<"UNCACHED", 1, "uncached">;                // valid for read and write
+def XeGPU_CacheKindStreaming:     I32EnumAttrCase<"STREAMING", 2, "streaming">;              // valid for read only
+def XeGPU_CacheKindInvalid:       I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">;  // valid for read only
+def XeGPU_CacheKindWriteBack:     I32EnumAttrCase<"WRITE_BACK", 4, "write_back">;            // valid for write only
+def XeGPU_CacheKindWriteThrough:  I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">;      // valid for write only
+
+def XeGPU_CacheKind : I32EnumAttr<"CacheKind", "Cache kind", 
+  [XeGPU_CacheKindCached, XeGPU_CacheKindUncached, 
+   XeGPU_CacheKindStreaming, XeGPU_CacheKindInvalid,
+   XeGPU_CacheKindWriteBack, XeGPU_CacheKindWriteThrough]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_CacheAttr 
+  : EnumAttr<XeGPU_Dialect, XeGPU_CacheKind, "cache_kind"> {
+    let assemblyFormat = "`<` $value `>`";
+}
+
+
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index 3851275ad30a0a..c2f09319c790e0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -23,8 +23,8 @@ def XeGPU_Dialect : Dialect {
       the lower-level GPU compiler.
     }];
 
-    // let useDefaultTypePrinterParser = true;
-    // let useDefaultAttributePrinterParser = true;
+    let useDefaultTypePrinterParser = true;
+    let useDefaultAttributePrinterParser = true;
 }
 
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUDIALECT_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 5825ef9195b03f..a321d36f2ae271 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -12,6 +12,22 @@
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/IR/AttrTypeBase.td"
+
+
+include "mlir/IR/OpBase.td"
+include "mlir/IR/OpAsmInterface.td"
+include "mlir/IR/AttrTypeBase.td"
+include "mlir/IR/BuiltinTypes.td"
+include "mlir/IR/BuiltinTypeInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/ViewLikeInterface.td"
+include "mlir/Interfaces/CastInterfaces.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/Interfaces/CopyOpInterface.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+include "mlir/Interfaces/ShapedOpInterfaces.td"
 
 
 // Base class for dialect operations. This operation inherits from the base
@@ -23,4 +39,123 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
           Op<XeGPU_Dialect, mnemonic, traits>;
 
 
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> {
+
+  let summary = "create nd tensor descriptor operation";
+  let description = [{
+    The "create_nd_tdesc" operation creates a TensorDescType which represents
+    a sub-view of a 2D memory region (It can be extended to support N-D memory
+    region if needed in future). Elements in the subview continuous in each 
+    dimention. It encodes the following important information for supporting 
+    Intel hardware features:
+
+    * source: an object representing (the starting address/pointer of) a 2D memory region. 
+        It can be either a 2D memref object, or simply a pointer represented by a uint64_t type.
+        For the latter case, the shape and layout information of the 2D memory region should 
+        be explicitly passed via the `dynamic_shape` and `dynamic_strides` parameters.
+    * offsets: two index values representing the offsets from the "source", in each dimension, 
+        at which the subview of the target memory will be created. They are encoded via two
+        variables, "dynamic_offsets" and "static_offsets", such that the op can
+        accept various forms, such as operands (e.g., [%c0, %c1]) and attributes (e.g., [2, 4]).
+    * shape: the shape information of the memory region pointed to by the "source". It is 
+        typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. 
+        But if "source" is simply a pointer represented as a uint64_t type, or a memref 
+        type without shape information, e.g., memref<?x?xf16>, the shape information has 
+        to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape" 
+        only accepts operands (e.g., [%c4096, %c4096]), not attributes (e.g., [4096, 4096]).
+    * strides: the strides of the memory region pointed to by the "source". Similar to shape, 
+        they are typically encoded via the MemRefType of the source too. But if "source" is 
+        simply a pointer represented as a uint64_t type, or a memref type without shape 
+        information, e.g., memref<?x?xf16>, the strides information has to be explicitly 
+        passed via the "dynamic_strides" argument, which currently only accepts operands too.
+
+    Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = memref.alloc() : memref<1024x1024xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0]: memref<1024x1024xf32> -> TensorDesc<8x16xf32>
+
+    Example 2 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = memref.alloc(%h, %w) : memref<?x?xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: memref<?x?xf32> -> TensorDesc<8x16xf32>
+
+    Example 3 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = ... : ui64
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
+  }];
+
+  let arguments = (ins XeGPU_BaseAddrType: $source, 
+                 Variadic<Index>: $dynamic_offsets, 
+                 Variadic<Index>: $dynamic_shape, 
+                 Variadic<Index>: $dynamic_strides,
+                 DenseI64ArrayAttr: $static_offsets);
+  let results = (outs XeGPU_TensorDesc:$TensorDesc);
+
+  let hasCustomAssemblyFormat = 1;
+  let skipDefaultBuilders = 1;
+  let hasVerifier = 1;
+
+  let builders = [
+    OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, 
+                   "ValueRange": $shape, "ValueRange": $strides, 
+                   "llvm::ArrayRef<int64_t>": $static_offsets)>,
+
+    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
+                   "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+
+    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
+                   "llvm::ArrayRef<OpFoldResult>": $offsets,
+                   "ValueRange": $shape, "ValueRange": $stride)>
+  ];
+
+  let extraClassDeclaration = [{
+    /// Returns the type of the source memref operand.
+    Type getSourceType() {
+      return getSource().getType();
+    }
+
+    /// Returns the type of the result TensorDesc.
+    xegpu::TensorDescType getType() {
+      return getTensorDesc().getType();
+    }
+
+    /// Returns the offsets info of the source. It consolidates
+    /// information from both the dynamic_offsets and static_offsets
+    /// parameters. The static_offsets parameter always has the expected
+    /// rank, where some dims may hold the ShapedType::kDynamic value,
+    /// indicating that the corresponding value comes from dynamic_offsets.
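+    /// For example (an illustrative sketch): with static_offsets =
+    /// [kDynamic, 16] and dynamic_offsets = [%c0], getOffsets()
+    /// returns [%c0, 16] as OpFoldResults.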
+    llvm::SmallVector<OpFoldResult> getOffsets();
+
+    /// Returns the shape info of the source. It comes either from the
+    /// memref type, if the source is a memref with static shape
+    /// information, or from the dynamic_shape parameter. If both
+    /// exist, the dynamic_shape parameter is used and the
+    /// shape information from the memref type is ignored.
+    llvm::SmallVector<OpFoldResult> getShape();
+
+    /// Returns the strides info of the source. It comes either from the
+    /// memref type, if the source is a memref with static shape
+    /// information, or from the dynamic_strides parameter. If both
+    /// exist, the dynamic_strides parameter is used and the
+    /// strides information from the memref type is ignored.
+    llvm::SmallVector<OpFoldResult> getStrides();
+
+    /// Return the element type of the TensorDesc
+    Type getElementType() {
+      return getType().getElementType();
+    }
+
+    /// Return the shape of the TensorDesc
+    llvm::ArrayRef<int64_t> getTensorDescShape() {
+      return getType().getShape();
+    }
+  }];
+
+}
+
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 1d75bb4e2906fe..319e16b3ae326b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -9,9 +9,9 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
 #define MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
 
-include "mlir/IR/BuiltinTypes.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/BuiltinTypes.td"
 
 def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>;
 def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
@@ -30,4 +30,105 @@ class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
   let mnemonic = typeMnemonic;
 }
 
+def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
+        [ShapedTypeInterface], "::mlir::TensorType"> {
+  let summary = "TensorDesc describing regions of interested data.";
+  let description = [{
+    TensorDesc is a type designed to describe regions of interest in the data, as well as some 
+    features unique to Intel hardware. Unlike the builtin tensor type in MLIR, 
+    it essentially only contains the metadata and doesn't hold the data itself. It is designed 
+    mainly to support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPUs. 
+    It encodes the following information:
+
+    * shape:  the size/shape of the data block of interest, e.g., 8x16 means 8 rows,
+              each containing 16 contiguous data elements. The rows may or may not be
+              contiguous, depending on whether the encoding attribute
+              is set.
+    * element_type: the data type of the data elements, e.g., f16, f32.
+
+    Similar to the builtin tensor, it also provides an optional attribute to encode 
+    the following information via the TensorDescAttr object:
+    * memory_scope (xegpu::MemoryScope): [optional] where the data is located, 
+                global memory or shared memory. It defaults to Global.
+    * array_length (int): [optional] the number of contiguous blocks with size as `shape`
+               that will be loaded by a block load at a time. It defaults to 1.
+    * boundary_check (bool): [optional] indicates whether the operation detects the boundary 
+                and pads with zero for out-of-boundary accesses. It defaults to true (boundary check enabled).
+    
+
+    Syntax:
+
+    ```
+    TensorDesc-type ::= `tensor_desc` `<` dim-list element-type (attr-list)? `>`
+    element-type ::= float-type | integer-type | index-type
+    dim-list ::= (static-dim-list `x`)?
+    static-dim-list ::= decimal-literal `x` decimal-literal
+    attr-list ::= (, memory_scope = value)? (, array_length = value)? (, boundary_check = value)?
+    ```
+
+    Examples:
+
+    ```mlir
+    // A block TensorDesc with 8x16 i32 elements
+    xegpu.tensor_desc<8x16xi32>
+
+    // A block TensorDesc with 8x16 f32 elements
+    xegpu.tensor_desc<8x16xf32>
+
+    // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
+    xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+    ```
+  }];
+
+  let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
+                        "mlir::Type": $elementType,
+                        OptionalParameter<"mlir::Attribute">: $encoding);
+
+  let extraClassDeclaration = [{
+    using TensorType::clone;
+    using mlir::ShapedType::Trait<TensorDescType>::getElementTypeBitWidth;
+    using mlir::ShapedType::Trait<TensorDescType>::getRank;
+    using mlir::ShapedType::Trait<TensorDescType>::getNumElements;
+    using mlir::ShapedType::Trait<TensorDescType>::isDynamicDim;
+    using mlir::ShapedType::Trait<TensorDescType>::hasStaticShape;
+    using mlir::ShapedType::Trait<TensorDescType>::getNumDynamicDims;
+    using mlir::ShapedType::Trait<TensorDescType>::getDimSize;
+    using mlir::ShapedType::Trait<TensorDescType>::getDynamicDimIndex;
+
+    TensorDescType clone(::mlir::Type elementType) {
+      return llvm::cast<TensorDescType>(cloneWith(getShape(), elementType));
+    }
+
+    TensorDescAttr getEncodingAsTensorDescAttr() const {
+      return llvm::dyn_cast_if_present<TensorDescAttr>(getEncoding());
+    }
+
+    xegpu::MemoryScopeKind getMemoryScope() const {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getMemoryScope())
+        return attr.getMemoryScope().getValue();
+      // return default value
+      return MemoryScopeKind::Global;
+    }
+
+    int getArrayLength() {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getArrayLength())
+        return attr.getArrayLength().getInt();
+      // return default value
+      return 1; 
+    }
+
+    bool getBoundaryCheck() {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getBoundaryCheck())
+        return attr.getBoundaryCheck().getValue();
+      // return default value
+      return true;
+    }
+  }];
+
+  let hasCustomAssemblyFormat = true;
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 4f839ee773476b..bd72d5c17b6ea1 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -6,7 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <llvm/ADT/TypeSwitch.h>
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h>
+#include <mlir/IR/Builders.h>
+#include <mlir/IR/DialectImplementation.h>
 
 namespace mlir {
 namespace xegpu {
@@ -26,8 +29,73 @@ void XeGPUDialect::initialize() {
       >();
 }
 
-// this file is for position occupation,
-// we will add functions in following PRs.
+
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescAttr
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescType
+//===----------------------------------------------------------------------===//
+mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
+  llvm::SmallVector<int64_t> shape;
+  mlir::Type elementType;
+  mlir::FailureOr<mlir::Attribute> encoding;
+
+  // Parse literal '<'
+  if (parser.parseLess())
+    return {};
+
+  auto shapeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseDimensionList(shape))) {
+    parser.emitError(shapeLoc, "failed to parse parameter 'shape'");
+    return {};
+  }
+
+  auto elemTypeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseType(elementType))) {
+    parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'");
+    return {};
+  }
+
+  // parse optional attributes
+  if (mlir::succeeded(parser.parseOptionalComma())) {
+    encoding = mlir::FieldParser<mlir::Attribute>::parse(parser);
+    if (mlir::failed(encoding)) {
+      parser.emitError(parser.getCurrentLocation(),
+          "Failed to parse the attribute field for TensorDescType.\n");
+      return {};
+    }
+  }
+
+  // Parse literal '>'
+  if (parser.parseGreater())
+    return {};
+
+  return TensorDescType::get(parser.getContext(), shape, elementType,
+                             encoding.value_or(mlir::Attribute()));
+}
+
+void TensorDescType::print(::mlir::AsmPrinter &printer) const {
+  printer << "<";
+
+  auto shape = getShape();
+  for (int64_t dim : shape) {
+    if (mlir::ShapedType::isDynamic(dim))
+      printer << '?';
+    else
+      printer << dim;
+    printer << 'x';
+  }
+
+  printer << getElementType();
+
+  if (auto encoding = getEncoding())
+    printer << ", " << encoding;
+
+  printer << ">";
+}
+
 
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 0e89ac4df6ef28..74557eaca0869c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -6,14 +6,390 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <mlir/Dialect/Utils/StaticValueUtils.h>
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h>
+#include <mlir/Interfaces/ViewLikeInterface.h>
+#include <mlir/IR/Builders.h>
 
 #define DEBUG_TYPE "xegpu"
 
 namespace mlir {
 namespace xegpu {
-// this file is for position occupation,
-// we will add functions in following PRs.
+
+bool printDefaultValues() { return false; }
+
+static size_t getRankOf(Value value) {
+  if (value.getType().isIntOrIndexOrFloat())
+    return 0;
+  if (auto ty = llvm::dyn_cast_if_present<MemRefType>(value.getType()))
+    return ty.getRank();
+  if (auto ty = llvm::dyn_cast_if_present<VectorType>(value.getType()))
+    return ty.getRank();
+  llvm_unreachable("Unsupported value for getRankOf");
+}
+
+static ParseResult
+parseOptionalAttrDictWithCustomAttrs(OpAsmParser &parser,
+                                     OperationState &result) {
+  // no optional attributes, return success
+  if (failed(parser.parseOptionalLBrace()))
+    return success();
+
+  llvm::SmallDenseSet<StringRef, 8> seenKeys;
+  auto parseElt = [&]() -> ParseResult {
+    // The name of an attribute can either be a keyword, or a string.
+    // Compared to mlir::parseOptionalAttrList, keys given as
+    // Token::bare_identifier or Token::inttype may not be handled.
+    std::string nameId;
+    auto loc = parser.getCurrentLocation();
+    if (parser.parseOptionalKeywordOrString(&nameId))
+      return parser.emitError(loc, "invalid attribute name: ")
+             << nameId << ".\n";
+
+    if (nameId.empty())
+      return parser.emitError(loc, "expected valid attribute name");
+
+    if (!seenKeys.insert(nameId).second)
+      return parser.emitError(loc, "duplicate key '")
+             << nameId << "' in dictionary attribute.";
+
+    // Lazy load a dialect in the context if there is a possible namespace.
+    auto splitName = StringRef(nameId).split('.');
+    if (!splitName.second.empty())
+      parser.getContext()->getOrLoadDialect(splitName.first);
+
+    // Try to parse the '=' for the attribute value.
+    if (parser.parseOptionalEqual()) {
+      // If there is no '=', it is treated as a unit attribute.
+      result.addAttribute(nameId, parser.getBuilder().getUnitAttr());
+      return success();
+    }
+
+    // for xegpu specific attributes
+    if (nameId == "mode") {
+      ModeKindAttr attr;
+      return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId,
+                                                     result.attributes);
+    } else if (nameId == "l1_hint" || nameId == "l2_hint" ||
+               nameId == "l3_hint") {
+      CacheKindAttr attr;
+      return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId,
+                                                     result.attributes);
+    } else if (nameId == "transpose") {
+      // in the form of [4, 5]; actually it is a copy of DenseI64ArrayAttr::parse()
+      if (succeeded(parser.parseOptionalLSquare())) {
+        Attribute attr;
+        // handle empty list case
+        if (succeeded(parser.parseOptionalRSquare())) {
+          attr = DenseI64ArrayAttr::get(parser.getContext(), {});
+        } else {
+          attr = DenseI64ArrayAttr::parseWithoutBraces(parser, Type{});
+          if (failed(parser.parseRSquare()))
+            return failure();
+        }
+        if (!attr)
+          return failure();
+        result.addAttribute(nameId, attr);
+        return success();
+      } else {
+        // in form of array<i64: 4, 5>
+        DenseI64ArrayAttr attr;
+        return parser.parseAttribute(attr, nameId, result.attributes);
+      }
+    } else {
+      Attribute attr;
+      return parser.parseAttribute(attr, nameId, result.attributes);
+    }
+  };
+
+  if (parser.parseCommaSeparatedList(parseElt))
+    return failure();
+
+  return parser.parseRBrace();
+}
+
+
+//===----------------------------------------------------------------------===//
+// XeGPU_CreateNdDescOp
+//===----------------------------------------------------------------------===//
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type TensorDesc, Value source, ValueRange offsets,
+                           ValueRange shape, ValueRange strides,
+                           llvm::ArrayRef<int64_t> static_offsets) {
+  auto offsetRank = static_offsets.size();
+  auto shapeRank = shape.size() ? shape.size() : getRankOf(source);
+
+  size_t dynOffsetRank =
+      std::count_if(static_offsets.begin(), static_offsets.end(),
+                    [](int64_t d) { return ShapedType::isDynamic(d); });
+
+  // shape and strides must both be present or both be absent,
+  // and the final rank of offsets (dynamic + static)
+  // must match the rank of shape
+  assert(shape.size() == strides.size() && shapeRank == offsetRank &&
+         offsets.size() == dynOffsetRank);
+
+  state.addOperands(source);
+  state.addOperands(offsets);
+  state.addOperands(shape);
+  state.addOperands(strides);
+  state.addAttribute(
+      getOperandSegmentSizesAttrName(state.name),
+      builder.getDenseI32ArrayAttr({1, static_cast<int32_t>(offsets.size()),
+                                    static_cast<int32_t>(shape.size()),
+                                    static_cast<int32_t>(strides.size())}));
+  state.addAttribute(getStaticOffsetsAttrName(state.name),
+                     builder.getDenseI64ArrayAttr(static_offsets));
+  state.addTypes(TensorDesc);
+}
+
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type tdesc, Value source,
+                           llvm::ArrayRef<OpFoldResult> offsets) {
+  auto ty = llvm::dyn_cast_if_present<MemRefType>(source.getType());
+  assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source));
+
+  llvm::SmallVector<int64_t> staticOffsets;
+  llvm::SmallVector<Value> dynamicOffsets;
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+  build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */,
+        ValueRange({}) /* empty dynamic shape */,
+        ValueRange({}) /* empty dynamic strides */,
+        staticOffsets /* static offsets */);
+}
+
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type tdesc, Value source,
+                           llvm::ArrayRef<OpFoldResult> offsets,
+                           ValueRange shape, ValueRange stride) {
+  assert(shape.size() && offsets.size() && stride.size() &&
+         shape.size() == stride.size() && shape.size() == offsets.size());
+
+  llvm::SmallVector<int64_t> staticOffsets;
+  llvm::SmallVector<Value> dynamicOffsets;
+
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+  build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets,
+        /* dynamic shape = */ shape , /* dynamic strides = */ stride,
+        /* static offsets = */ staticOffsets);
+}
+
+ParseResult CreateNdDescOp::parse(OpAsmParser &parser, OperationState &result) {
+  // parse the source operand
+  llvm::SmallVector<OpAsmParser::UnresolvedOperand> sourceOperands(1);
+  llvm::SMLoc sourceOperandsLoc = parser.getCurrentLocation();
+  if (parser.parseOperand(sourceOperands[0]))
+    return failure();
+
+  // parse the offset operand, in format of [x, y]
+  llvm::SmallVector<OpAsmParser::UnresolvedOperand, 4> offsetsOperands;
+  DenseI64ArrayAttr static_offsetsAttr;
+  llvm::SMLoc offsetsOperandsLoc = parser.getCurrentLocation();
+  if (parseDynamicIndexList(parser, offsetsOperands, static_offsetsAttr))
+    return failure();
+  result.addAttribute("static_offsets", static_offsetsAttr);
+
+  llvm::SmallVector<OpAsmParser::UnresolvedOperand, 4> shapeOperands;
+  llvm::SMLoc shapeOperandsLoc;
+
+  llvm::SmallVector<OpAsmParser::UnresolvedOperand, 4> stridesOperands;
+  llvm::SMLoc stridesOperandsLoc;
+  // parse optional shape and strides, shape and strides should always come
+  // together
+  if (succeeded(parser.parseOptionalComma())) {
+    // parse shape part, in form of [x, y]
+    if (parser.parseLSquare())
+      return failure();
+    shapeOperandsLoc = parser.getCurrentLocation();
+    if (parser.parseOperandList(shapeOperands))
+      return failure();
+    if (parser.parseRSquare())
+      return failure();
+
+    if (parser.parseComma())
+      return failure();
+
+    // parse stride part, in form of [x, y]
+    if (parser.parseLSquare())
+      return failure();
+    stridesOperandsLoc = parser.getCurrentLocation();
+    if (parser.parseOperandList(stridesOperands))
+      return failure();
+    if (parser.parseRSquare())
+      return failure();
+  }
+
+  auto loc = parser.getCurrentLocation();
+  if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+    return failure();
+
+  if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+        return parser.emitError(loc)
+               << "'" << result.name.getStringRef() << "' op ";
+      })))
+    return failure();
+
+  if (parser.parseColon())
+    return failure();
+
+  llvm::SmallVector<Type> sourceTypes(1);
+  if (parser.parseType(sourceTypes[0]))
+    return failure();
+
+  if (parser.parseArrow())
+    return failure();
+
+  llvm::SmallVector<Type> TensorDescTypes(1);
+  if (parser.parseType(TensorDescTypes[0]))
+    return failure();
+  result.addAttribute("operandSegmentSizes",
+                      parser.getBuilder().getDenseI32ArrayAttr(
+                          {1, static_cast<int32_t>(offsetsOperands.size()),
+                           static_cast<int32_t>(shapeOperands.size()),
+                           static_cast<int32_t>(stridesOperands.size())}));
+
+  result.addTypes(TensorDescTypes);
+  if (parser.resolveOperands(sourceOperands, sourceTypes, sourceOperandsLoc,
+                             result.operands))
+    return failure();
+
+  Type indexType = parser.getBuilder().getIndexType();
+  if (parser.resolveOperands(offsetsOperands, indexType, offsetsOperandsLoc,
+                             result.operands))
+    return failure();
+  if (parser.resolveOperands(shapeOperands, indexType, shapeOperandsLoc,
+                             result.operands))
+    return failure();
+  if (parser.resolveOperands(stridesOperands, indexType, stridesOperandsLoc,
+                             result.operands))
+    return failure();
+  return success();
+}
+
+void CreateNdDescOp::print(OpAsmPrinter &printer) {
+  printer << ' ';
+  printer << getSource();
+  printDynamicIndexList(printer, *this, getDynamicOffsets(),
+                        getStaticOffsetsAttr());
+  if (!getDynamicShape().empty()) {
+    printer << ",";
+    printer << ' ' << "[";
+    printer << getDynamicShape();
+    printer << "]";
+  }
+
+  if (!getDynamicStrides().empty()) {
+    printer << ",";
+    printer << ' ' << "[";
+    printer << getDynamicStrides();
+    printer << "]";
+  }
+
+  llvm::SmallVector<llvm::StringRef> elidedAttrs;
+  elidedAttrs.push_back("static_offsets");
+  elidedAttrs.push_back("operandSegmentSizes");
+  printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+  printer << ' ' << ":";
+  printer << ' ';
+  printer << getSourceType();
+  printer << ' ' << "->";
+  printer << ' ';
+  printer << getType();
+}
+
+LogicalResult CreateNdDescOp::verify() {
+  auto offsetRank = getOffsets().size();
+  auto shapeRank = getShape().size();
+  auto stridesRank = getStrides().size();
+  auto baseRank = getRankOf(getSource()) ? getRankOf(getSource()) : 2;
+
+  if (offsetRank != shapeRank || shapeRank != stridesRank ||
+      shapeRank != baseRank)
+    return emitOpError(
+        "Expecting the ranks of shape, strides, offsets and the memref type "
+        "to match each other (currently they should all be 2D).");
+  return success();
+}
+
+// compute consolidated offsets from dynamic_offsets and static_offsets parameters
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
+  llvm::SmallVector<OpFoldResult> offsets;
+  auto dynamicOffsets = getDynamicOffsets(); // dynamic_offsets variable
+  auto staticOffsets = getStaticOffsets();   // static_offsets attribute
+
+  // in case static_offsets is missing, dynamic_offsets will be used
+  if (staticOffsets.size() == 0) {
+    offsets.assign(dynamicOffsets.begin(), dynamicOffsets.end());
+    return offsets;
+  }
+
+  // use the static offset for each dim if it has a valid value, 
+  // otherwise use the value from dynamic_offsets
+  for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) {
+    if (ShapedType::isDynamic(staticOffsets[i])) {
+      assert(j < dynamicOffsets.size());
+      offsets.push_back(dynamicOffsets[j++]);
+    } else {
+      auto ty = IndexType::get(getContext());
+      auto attr = IntegerAttr::get(ty, staticOffsets[i]);
+      offsets.push_back(attr);
+    }
+  }
+  return offsets;
+}
+
+// get the consolidated shape of the 2D memory region. 
+// It prefers dynamic_shape over the static shape of the 
+// memref type.
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getShape() {
+  llvm::SmallVector<OpFoldResult> shape;
+  auto dynShape = getDynamicShape();
+  if (dynShape.size()) {
+    shape.append(dynShape.begin(), dynShape.end());
+    return shape;
+  }
+
+  auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
+  if (ty && ty.hasStaticShape()) {
+    for (auto dim : ty.getShape()) {
+      auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
+      shape.push_back(attr);
+    }
+    return shape;
+  }
+  
+  this->emitError("The shape information of the memory is missing.\n");
+  return {};
+}
+
+// get the consolidated strides of the 2D memory region. 
+// It prefers dynamic_strides over the static strides of the 
+// memref type.
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getStrides() {
+  llvm::SmallVector<OpFoldResult> strides;
+
+  auto dynStrides = getDynamicStrides();
+  if (dynStrides.size()) {
+    strides.append(dynStrides.begin(), dynStrides.end());
+    return strides;
+  }
+
+  auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
+  if (ty && ty.hasStaticShape()) {
+    auto [staticStrides, offset] = getStridesAndOffset(ty);
+    for (auto dim : staticStrides) {
+      auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
+      strides.push_back(attr);
+    }
+    return strides;
+  }
+
+  this->emitError("The strides information of the memory is missing.\n");
+  return {};
+}
 
 } // namespace xegpu
 } // namespace mlir

From facb3b40613319915d52a7c6e5f539b686085535 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Sun, 10 Mar 2024 12:57:06 -0500
Subject: [PATCH 02/19] add prefetch_nd, load_nd, and store_nd

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  53 +--
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |  92 +++++-
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |   5 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 310 +++++++-----------
 4 files changed, 215 insertions(+), 245 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 6e4c1bce6d0d59..cd38549f1ccf43 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -20,14 +20,14 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
 
 def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
   let parameters = (ins
-    OptionalParameter<"MemoryScopeKindAttr">: $memory_scope,
+    OptionalParameter<"MemoryScopeAttr">: $memory_scope,
     OptionalParameter<"IntegerAttr", "1">: $array_length,
     OptionalParameter<"BoolAttr", "true">: $boundary_check
   );
 
   let builders = [
     AttrBuilder<(ins
-      CArg<"xegpu::MemoryScopeKind", "xegpu::MemoryScopeKind::Global">:$memory_scope,
+      CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
       CArg<"int", "1">:$array_length,
       CArg<"bool", "true">: $boundary_check
     )>
@@ -41,7 +41,7 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
 //===----------------------------------------------------------------------===//
 def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
 def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
-def XeGPU_MemoryScopeKind: I32EnumAttr<"MemoryScopeKind", 
+def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope", 
       "The address space of the memory the tensor descritor is created for", 
       [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
   let genSpecializedAttr = 0;
@@ -49,47 +49,30 @@ def XeGPU_MemoryScopeKind: I32EnumAttr<"MemoryScopeKind",
 }
 
 def XeGPU_MemoryScopeAttr: 
-  EnumAttr<XeGPU_Dialect, XeGPU_MemoryScopeKind, "memory_scope"> {
-    let assemblyFormat = "`<` $value `>`";
-}
-
-//===----------------------------------------------------------------------===//
-// XeGPU Operator Mode Enums.
-//===----------------------------------------------------------------------===//
-def XeGPU_OpModeSIMT : I32EnumAttrCase<"SIMT", 0, "simt">;
-def XeGPU_OpModeVectorCompute : I32EnumAttrCase<"VectorCompute", 1, "vc">;
-def XeGPU_ModeKind : I32EnumAttr<"ModeKind", 
-             "The Mode an operator runs on", 
-  [XeGPU_OpModeSIMT, XeGPU_OpModeVectorCompute]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::xegpu";
-}
-
-def XeGPU_ModeAttr: 
-  EnumAttr<XeGPU_Dialect, XeGPU_ModeKind, "mode"> {
-    let assemblyFormat = "`<` $value `>`";
+  EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
+    let assemblyFormat = "$value";
 }
 
 //===----------------------------------------------------------------------===//
 // XeGPU Cache Enums.
 //===----------------------------------------------------------------------===//
-def XeGPU_CacheKindCached:        I32EnumAttrCase<"CACHED", 0, "cached">;                    // valid for read and write
-def XeGPU_CacheKindUncached:      I32EnumAttrCase<"UNCACHED", 1, "uncached">;                // valid for read and write
-def XeGPU_CacheKindStreaming:     I32EnumAttrCase<"STREAMING", 2, "streaming">;              // valid for read only
-def XeGPU_CacheKindInvalid:       I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">;  // valid for read only
-def XeGPU_CacheKindWriteBack:     I32EnumAttrCase<"WRITE_BACK", 4, "write_back">;            // valid for write only
-def XeGPU_CacheKindWriteThrough:  I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">;      // valid for write only
-
-def XeGPU_CacheKind : I32EnumAttr<"CacheKind", "Cache kind", 
-  [XeGPU_CacheKindCached, XeGPU_CacheKindUncached, 
-   XeGPU_CacheKindStreaming, XeGPU_CacheKindInvalid,
-   XeGPU_CacheKindWriteBack, XeGPU_CacheKindWriteThrough]> {
+def XeGPU_CachePolicyCached:        I32EnumAttrCase<"CACHED", 0, "cached">;                    // valid for read and write
+def XeGPU_CachePolicyUncached:      I32EnumAttrCase<"UNCACHED", 1, "uncached">;                // valid for read and write
+def XeGPU_CachePolicyStreaming:     I32EnumAttrCase<"STREAMING", 2, "streaming">;              // valid for read only
+def XeGPU_CachePolicyInvalid:       I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">;  // valid for read only
+def XeGPU_CachePolicyWriteBack:     I32EnumAttrCase<"WRITE_BACK", 4, "write_back">;            // valid for write only
+def XeGPU_CachePolicyWriteThrough:  I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">;      // valid for write only
+
+def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy", 
+  [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached, 
+   XeGPU_CachePolicyStreaming, XeGPU_CachePolicyInvalid,
+   XeGPU_CachePolicyWriteBack, XeGPU_CachePolicyWriteThrough]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::xegpu";
 }
 
-def XeGPU_CacheAttr 
-  : EnumAttr<XeGPU_Dialect, XeGPU_CacheKind, "cache_kind"> {
+def XeGPU_CacheHintAttr 
+  : EnumAttr<XeGPU_Dialect, XeGPU_CachePolicyEnums, "cache_hint"> {
     let assemblyFormat = "`<` $value `>`";
 }
 
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index a321d36f2ae271..dd3719f101e8c8 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -88,14 +88,21 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg
     %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
   }];
 
-  let arguments = (ins XeGPU_BaseAddrType: $source, 
-                 Variadic<Index>: $dynamic_offsets, 
-                 Variadic<Index>: $dynamic_shape, 
-                 Variadic<Index>: $dynamic_strides,
-                 DenseI64ArrayAttr: $static_offsets);
-  let results = (outs XeGPU_TensorDesc:$TensorDesc);
-
-  let hasCustomAssemblyFormat = 1;
+  let arguments = (ins 
+    XeGPU_BaseAddrType: $source, 
+    Variadic<Index>: $dynamic_offsets, 
+    Variadic<Index>: $dynamic_shape, 
+    Variadic<Index>: $dynamic_strides,
+    DenseI64ArrayAttr: $static_offsets
+  );
+  let results = (outs XeGPU_TensorDesc: $TensorDesc);
+
+  let assemblyFormat = [{
+    $source ``
+    custom<DynamicIndexList>($dynamic_offsets, $static_offsets)
+    (`,` `[` $dynamic_shape^ `]` `,` `[` $dynamic_strides `]`)?
+    attr-dict `:` type($source) `->` type($TensorDesc)
+  }];
   let skipDefaultBuilders = 1;
   let hasVerifier = 1;
 
@@ -154,8 +161,77 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg
       return getType().getShape();
     }
   }];
+}
+
+def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
+  let summary = "prefetches a nD block to cache";
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  // Format: xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>, 
+  //                                   l2_hint = #xegpu.cache_hint<cached>, 
+  //                                   l3_hint = #xegpu.cache_hint<cached>}
+  //         : !xegpu.tensor_desc<8x16xf16>
+  let assemblyFormat = "$TensorDesc attr-dict `:` type($TensorDesc)";
+}
+
+
+def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
+  let summary = "loads a n-D block from memory (represented by TensorDesc)" 
+                "to registers (represented by vector)";
+  let description = [{
+    LoadNDOp essentially mimics the hardware block-read instruction, reading 
+    a block of data from memory into registers. It takes a set of cache hints, 
+    one for each cache level (L1, L2, and L3). If the hardware does not have a 
+    corresponding cache, the corresponding cache hint attribute will be masked.
+    If both transpose and vnni_axis are present, the transpose is assumed to be 
+    performed first, followed by the VNNI transform.
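+
+    As a sketch (the shapes here are illustrative, not mandated by the op),
+    loading with both attributes set could look like:
+
+    ```mlir
+    %2 = xegpu.load_nd %1 {vnni_axis = 0, transpose = [1, 0]}
+         : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
+    ```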
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<I64Attr>: $vnni_axis,
+                       OptionalAttr<DenseI64ArrayAttr>: $transpose,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let results = (outs XeGPU_ValueType: $value);
+
+  let extraClassDeclaration = [{
+    VectorType getType() {
+      return llvm::dyn_cast<VectorType>(getValue().getType());
+    }
+
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
 
+  // Format: xegpu.load_nd %1 {transpose = [1, 0], 
+  //                l1_hint = #xegpu.cache_hint<cached>, 
+  //                l2_hint = #xegpu.cache_hint<uncached>, 
+  //                l3_hint = #xegpu.cache_hint<streaming>}
+  //         : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+  let assemblyFormat = "$TensorDesc attr-dict `:` type($TensorDesc) `->` type($value)";
+  let hasVerifier = 1;
 }
 
+def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> {
+  let summary = "stores a n-D block register region back to memory, currently only supports 2D";
+  let arguments = (ins XeGPU_ValueType: $value,
+                       XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  // Format: xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+  //                                l2_hint = #xegpu.cache_hint<write_back>, 
+  //                                l3_hint = #xegpu.cache_hint<write_through>}
+  //                                : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+  let assemblyFormat = "$value `,` $TensorDesc attr-dict `:` type($value) `,` type($TensorDesc)";
+  let hasVerifier = 1;
+}
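+
+// A hypothetical end-to-end sketch tying the three ops together (types
+// and shapes are illustrative):
+//   %tdesc = xegpu.create_nd_tdesc %src[%c0, %c0]
+//            : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
+//   %val = xegpu.load_nd %tdesc : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+//   xegpu.store_nd %val, %tdesc : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>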
 
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 319e16b3ae326b..36b04ea12bcad0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -103,12 +103,12 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
       return llvm::dyn_cast_if_present<TensorDescAttr>(getEncoding());
     }
 
-    xegpu::MemoryScopeKind getMemoryScope() const {
+    xegpu::MemoryScope getMemoryScope() const {
       auto attr = getEncodingAsTensorDescAttr();
       if (attr && attr.getMemoryScope())
         return attr.getMemoryScope().getValue();
       // return default value
-      return MemoryScopeKind::Global;
+      return MemoryScope::Global;
     }
 
     int getArrayLength() {
@@ -129,6 +129,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
   }];
 
   let hasCustomAssemblyFormat = true;
+  
 }
 
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 74557eaca0869c..727c241a027f77 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -28,86 +28,28 @@ static size_t getRankOf(Value value) {
   llvm_unreachable("Unsupported value for getRankOf");
 }
 
-static ParseResult
-parseOptionalAttrDictWithCustomAttrs(OpAsmParser &parser,
-                                     OperationState &result) {
-  // no optional attributes, return success
-  if (failed(parser.parseOptionalLBrace()))
-    return success();
-
-  llvm::SmallDenseSet<StringRef, 8> seenKeys;
-  auto parseElt = [&]() -> ParseResult {
-    // The name of an attribute can either be a keyword, or a string.
-    // Compared to mlir::parseOptionalAttrList, keys given as
-    // Token::bare_identifier or Token::inttype may not be handled.
-    std::string nameId;
-    auto loc = parser.getCurrentLocation();
-    if (parser.parseOptionalKeywordOrString(&nameId))
-      return parser.emitError(loc, "invalid attribute name: ")
-             << nameId << ".\n";
-
-    if (nameId.empty())
-      return parser.emitError(loc, "expected valid attribute name");
-
-    if (!seenKeys.insert(nameId).second)
-      return parser.emitError(loc, "duplicate key '")
-             << nameId << "' in dictionary attribute.";
-
-    // Lazy load a dialect in the context if there is a possible namespace.
-    auto splitName = StringRef(nameId).split('.');
-    if (!splitName.second.empty())
-      parser.getContext()->getOrLoadDialect(splitName.first);
-
-    // Try to parse the '=' for the attribute value.
-    if (parser.parseOptionalEqual()) {
-      // If there is no '=', it is treated as a unit attribute.
-      result.addAttribute(nameId, parser.getBuilder().getUnitAttr());
-      return success();
-    }
-
-    // for xegpu specific attributes
-    if (nameId == "mode") {
-      ModeKindAttr attr;
-      return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId,
-                                                     result.attributes);
-    } else if (nameId == "l1_hint" || nameId == "l2_hint" ||
-               nameId == "l3_hint") {
-      CacheKindAttr attr;
-      return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId,
-                                                     result.attributes);
-    } else if (nameId == "transpose") {
-      // in the form of [4, 5]; actually it is a copy of DenseI64ArrayAttr::parse()
-      if (succeeded(parser.parseOptionalLSquare())) {
-        Attribute attr;
-        // handle empty list case
-        if (succeeded(parser.parseOptionalRSquare())) {
-          attr = DenseI64ArrayAttr::get(parser.getContext(), {});
-        } else {
-          attr = DenseI64ArrayAttr::parseWithoutBraces(parser, Type{});
-          if (failed(parser.parseRSquare()))
-            return failure();
-        }
-        if (!attr)
-          return failure();
-        result.addAttribute(nameId, attr);
-        return success();
-      } else {
-        // in form of array<i64: 4, 5>
-        DenseI64ArrayAttr attr;
-        return parser.parseAttribute(attr, nameId, result.attributes);
-      }
-    } else {
-      Attribute attr;
-      return parser.parseAttribute(attr, nameId, result.attributes);
-    }
-  };
-
-  if (parser.parseCommaSeparatedList(parseElt))
-    return failure();
-
-  return parser.parseRBrace();
+static void transpose(llvm::ArrayRef<int64_t> trans,
+                      std::vector<int64_t> &shape) {
+  std::vector<int64_t> old = shape;
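+  // e.g. (illustrative): trans = {1, 0} turns shape {8, 16} into {16, 8}.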
+  for (size_t i = 0; i < trans.size(); i++)
+    shape[i] = old[trans[i]];
 }
 
+template <typename T>
+static std::string makeString(T array, bool breakline = false) {
+  std::string buf;
+  llvm::raw_string_ostream os(buf);
+  os << "[";
+  for (size_t i = 1; i < array.size(); i++) {
+    os << array[i - 1] << ", ";
+    if (breakline)
+      os << "\n\t\t";
+  }
+  if (!array.empty())
+    os << array.back();
+  os << "]";
+  os.flush();
+  return buf;
+}
 
 //===----------------------------------------------------------------------===//
 // XeGPU_CreateNdDescOp
@@ -176,128 +118,6 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
         /* static offsets = */ staticOffsets);
 }
 
-ParseResult CreateNdDescOp::parse(OpAsmParser &parser, OperationState &result) {
-  // parse the source operand
-  llvm::SmallVector<OpAsmParser::UnresolvedOperand> sourceOperands(1);
-  llvm::SMLoc sourceOperandsLoc = parser.getCurrentLocation();
-  if (parser.parseOperand(sourceOperands[0]))
-    return failure();
-
-  // parse the offset operand, in format of [x, y]
-  llvm::SmallVector<OpAsmParser::UnresolvedOperand, 4> offsetsOperands;
-  DenseI64ArrayAttr static_offsetsAttr;
-  llvm::SMLoc offsetsOperandsLoc = parser.getCurrentLocation();
-  if (parseDynamicIndexList(parser, offsetsOperands, static_offsetsAttr))
-    return failure();
-  result.addAttribute("static_offsets", static_offsetsAttr);
-
-  llvm::SmallVector<OpAsmParser::UnresolvedOperand, 4> shapeOperands;
-  llvm::SMLoc shapeOperandsLoc;
-
-  llvm::SmallVector<OpAsmParser::UnresolvedOperand, 4> stridesOperands;
-  llvm::SMLoc stridesOperandsLoc;
-  // parse optional shape and strides, shape and strides should always come
-  // together
-  if (succeeded(parser.parseOptionalComma())) {
-    // parse shape part, in form of [x, y]
-    if (parser.parseLSquare())
-      return failure();
-    shapeOperandsLoc = parser.getCurrentLocation();
-    if (parser.parseOperandList(shapeOperands))
-      return failure();
-    if (parser.parseRSquare())
-      return failure();
-
-    if (parser.parseComma())
-      return failure();
-
-    // parse stride part, in form of [x, y]
-    if (parser.parseLSquare())
-      return failure();
-    stridesOperandsLoc = parser.getCurrentLocation();
-    if (parser.parseOperandList(stridesOperands))
-      return failure();
-    if (parser.parseRSquare())
-      return failure();
-  }
-
-  auto loc = parser.getCurrentLocation();
-  if (parseOptionalAttrDictWithCustomAttrs(parser, result))
-    return failure();
-
-  if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
-        return parser.emitError(loc)
-               << "'" << result.name.getStringRef() << "' op ";
-      })))
-    return failure();
-
-  if (parser.parseColon())
-    return failure();
-
-  llvm::SmallVector<Type> sourceTypes(1);
-  if (parser.parseType(sourceTypes[0]))
-    return failure();
-
-  if (parser.parseArrow())
-    return failure();
-
-  llvm::SmallVector<Type> TensorDescTypes(1);
-  if (parser.parseType(TensorDescTypes[0]))
-    return failure();
-  result.addAttribute("operandSegmentSizes",
-                      parser.getBuilder().getDenseI32ArrayAttr(
-                          {1, static_cast<int32_t>(offsetsOperands.size()),
-                           static_cast<int32_t>(shapeOperands.size()),
-                           static_cast<int32_t>(stridesOperands.size())}));
-
-  result.addTypes(TensorDescTypes);
-  if (parser.resolveOperands(sourceOperands, sourceTypes, sourceOperandsLoc,
-                             result.operands))
-    return failure();
-
-  Type indexType = parser.getBuilder().getIndexType();
-  if (parser.resolveOperands(offsetsOperands, indexType, offsetsOperandsLoc,
-                             result.operands))
-    return failure();
-  if (parser.resolveOperands(shapeOperands, indexType, shapeOperandsLoc,
-                             result.operands))
-    return failure();
-  if (parser.resolveOperands(stridesOperands, indexType, stridesOperandsLoc,
-                             result.operands))
-    return failure();
-  return success();
-}
-
-void CreateNdDescOp::print(OpAsmPrinter &printer) {
-  printer << ' ';
-  printer << getSource();
-  printDynamicIndexList(printer, *this, getDynamicOffsets(),
-                        getStaticOffsetsAttr());
-  if (!getDynamicShape().empty()) {
-    printer << ",";
-    printer << ' ' << "[";
-    printer << getDynamicShape();
-    printer << "]";
-  }
-
-  if (!getDynamicStrides().empty()) {
-    printer << ",";
-    printer << ' ' << "[";
-    printer << getDynamicStrides();
-    printer << "]";
-  }
-
-  llvm::SmallVector<llvm::StringRef> elidedAttrs;
-  elidedAttrs.push_back("static_offsets");
-  elidedAttrs.push_back("operandSegmentSizes");
-  printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
-  printer << ' ' << ":";
-  printer << ' ';
-  printer << getSourceType();
-  printer << ' ' << "->";
-  printer << ' ';
-  printer << getType();
-}
 
 LogicalResult CreateNdDescOp::verify() {
   auto offsetRank = getOffsets().size();
@@ -391,6 +211,96 @@ llvm::SmallVector<OpFoldResult> CreateNdDescOp::getStrides() {
   return {};
 }
 
+//===----------------------------------------------------------------------===//
+// XeGPU_LoadNDOp
+//===----------------------------------------------------------------------===//
+LogicalResult LoadNDOp::verify() {
+  auto tdescTy = getTensorDescType();
+  auto valueTy = getType();
+
+  if (tdescTy.getRank() != 2)
+    return emitOpError(
+        "The TensorDesc for LoadNDOp should be a 2D TensorDesc.");
+
+  if (!valueTy)
+    return emitOpError("Invalid result, it should be a VectorType.\n");
+
+  auto tdescElemTy = tdescTy.getElementType();
+  auto valueElemTy = valueTy.getElementType();
+
+  if (tdescElemTy != valueElemTy)
+    return emitOpError(
+        "Value should have the same element type as TensorDesc.");
+
+  auto array_len = tdescTy.getArrayLength();
+  auto tdescShape = tdescTy.getShape().vec();
+  auto valueShape = valueTy.getShape().vec();
+
+  if (getTranspose()) {
+    auto trans = getTranspose().value();
+    if (tdescShape.size() >= trans.size())
+      transpose(trans, tdescShape);
+    else
+      emitWarning("Invalid transpose attr. It is ignored.");
+  }
+
+  if (getVnniAxis()) {
+    auto axis = getVnniAxis().value();
+    auto vnni_factor = valueShape.back();
+    tdescShape[axis] /= vnni_factor;
+    tdescShape.push_back(vnni_factor);
+  }
+
+  if (array_len > 1) {
+    auto it = tdescShape.begin();
+    tdescShape.insert(it, array_len);
+  }
+
+  if (tdescShape != valueShape)
+    return emitOpError("Result shape doesn't match TensorDesc shape.")
+           << "\nThe expected shape is " << makeString(tdescShape) << "."
+           << "\nBut the given shape is " << makeString(valueShape) << "."
+           << "\nIn VC mode, when VNNI is not enabled, the result should have "
+           << "the same shape (or transposed shape if transpose is enabled) "
+           << "as TensorDesc; \nwhen VNNI is enabled, the result should have "
+           << "one more dimention than the TensorDesc, with last dimention "
+           << "having vnni factor, \nbut having same number of total data "
+           << "elements. The vnni factor are typically calculated as "
+           << "simd_lane_width / elementTypeBitWidth. \nFor element type "
+           << "having more than 32 bits, vnni shouldn't be used. \nIn SIMT "
+           << "mode, the shape is derived from the mapping attributes.\n";
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_StoreNDOp
+//===----------------------------------------------------------------------===//
+LogicalResult StoreNDOp::verify() {
+  auto dstTy = getTensorDesc().getType();                        // Tile
+  // auto valTy = llvm::dyn_cast<VectorType>(getValue().getType()); // Vector
+  auto valTy = getValue().getType().cast<VectorType>(); // Vector
+
+  if (dstTy.getRank() != 2)
+    return emitOpError(
+        "The TensorDesc for StoreNdOp should be a 2D TensorDesc.");
+
+  if (!valTy)
+    return emitOpError("Invalid value operand, it should be a VectorType.\n");
+
+  auto dstElemTy = dstTy.getElementType();
+  auto valElemTy = valTy.getElementType();
+
+  if (dstElemTy != valElemTy) {
+    return emitOpError("The elem type of the value doesn't "
+                       "match the elem type of the TensorDesc.\n");
+  }
+
+  if (dstTy.getShape() != valTy.getShape())
+    return emitOpError("The value shape doesn't match "
+                       "the TensorDesc shape.\n");
+  return success();
+}
+
 } // namespace xegpu
 } // namespace mlir
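
A worked example of the LoadNDOp verifier above (an illustrative sketch, not
part of the patch itself): for a tensor_desc<8x16xf16> loaded with
vnni_axis = 0, the verifier takes the vnni factor from the last dimension of
the result vector (2 here), divides descriptor dimension 0 by it, and appends
it, so the expected result type is vector<4x16x2xf16>:

  %2 = xegpu.load_nd %1 {vnni_axis = 0} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>

If the descriptor also carried array_length = 2, the verifier would insert a
leading dimension of 2 and expect vector<2x4x16x2xf16>.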
 

>From 9ea71f80deff3af28fd473a481fd12c5a5ad9781 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Sun, 10 Mar 2024 13:15:50 -0500
Subject: [PATCH 03/19] add test cases

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp |  2 -
 mlir/test/Dialect/XeGPU/XeGPUOps.mlir  | 57 ++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/Dialect/XeGPU/XeGPUOps.mlir

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 727c241a027f77..cabcf0bf071046 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -16,8 +16,6 @@
 namespace mlir {
 namespace xegpu {
 
-bool printDefaultValues() {return false;}
-
 static size_t getRankOf(Value value) {
   if (value.getType().isIntOrIndexOrFloat())
     return 0;
diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
new file mode 100644
index 00000000000000..cfb22ce2b8942f
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
@@ -0,0 +1,57 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: gpu.module @test {
+gpu.module @test {
+// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> <8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
+gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+  //CHECK: %[[C:.*]] = arith.constant 1 : index
+  %c1 = arith.constant 1 : index
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> <8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+
+gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> <24x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>, %x : index, %y : index) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> <24x32xf16>
+  %1 = xegpu.create_nd_tdesc %src[%x, %y] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // CHECK: xegpu.prefetch_nd %[[REG]] {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>} : <8x16xf16>
+  xegpu.prefetch_nd %1 {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}: !xegpu.tensor_desc<8x16xf16>
+  gpu.return
+}
+
+// CHECK-LABEL: func @test_load_nd_vc({{.*}}) {
+gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %2 = xegpu.load_nd %1 {vnni_axis = 0, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>} 
+       : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+  gpu.return
+}
+
+// CHECK-LABEL: func @test_store_nd_vc({{.*}}) {
+gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
+  %1 = arith.constant dense<1.0>: vector<24x32xf16>
+  %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  xegpu.store_nd %1, %2 {l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}
+        : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+  gpu.return
+}
+
+}
\ No newline at end of file

>From fdd2253802801bcda61747e5e828ee40d5960508 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Sun, 10 Mar 2024 20:12:36 +0000
Subject: [PATCH 04/19] fix printformat issue and update testcases

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |  8 ++---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 30 ++++++------------
 mlir/test/Dialect/XeGPU/XeGPUOps.mlir         | 31 +++++++++++--------
 3 files changed, 31 insertions(+), 38 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index dd3719f101e8c8..9d37d77e03a0c5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -101,7 +101,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg
     $source ``
     custom<DynamicIndexList>($dynamic_offsets, $static_offsets)
     (`,` `[` $dynamic_shape^ `]` `,` `[` $dynamic_strides `]`)?
-    attr-dict `:` type($source) `->` type($TensorDesc)
+    attr-dict `:` type($source) `->` qualified(type($TensorDesc))
   }];
   let skipDefaultBuilders = 1;
   let hasVerifier = 1;
@@ -174,7 +174,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
   //                                   l2_hint = #xegpu.cache_hint<cached>, 
   //                                   l3_hint = #xegpu.cache_hint<cached>}
   //         : !xegpu.tensor_desc<8x16xf16>
-  let assemblyFormat = "$TensorDesc attr-dict `:` type($TensorDesc)";
+  let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc))";
 }
 
 
@@ -214,7 +214,7 @@ def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
   //                l2_hint = #xegpu.cache_hint<uncached>, 
   //                l3_hint = #xegpu.cache_hint<streaming>}
   //         : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
-  let assemblyFormat = "$TensorDesc attr-dict `:` type($TensorDesc) `->` type($value)";
+  let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
   let hasVerifier = 1;
 }
 
@@ -230,7 +230,7 @@ def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> {
   //                                l2_hint = #xegpu.cache_hint<write_back>, 
   //                                l3_hint = #xegpu.cache_hint<write_through>}
   //                                : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
-  let assemblyFormat = "$value `,` $TensorDesc attr-dict `:` type($value) `,` type($TensorDesc)";
+  let assemblyFormat = "$value `,` $TensorDesc attr-dict `:` type($value) `,` qualified(type($TensorDesc))";
   let hasVerifier = 1;
 }
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index cabcf0bf071046..a388db4f5c2dc6 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -255,18 +255,9 @@ LogicalResult LoadNDOp::verify() {
   }
 
   if (tdescShape != valueShape)
-    return emitOpError("Result shape doesn't match TensorDesc shape.")
-           << "\nThe expected shape is " << makeString(tdescShape) << "."
-           << "\nBut the given shape is " << makeString(valueShape) << "."
-           << "\nIn VC mode, when VNNI is not enabled, the result should have "
-           << "the same shape (or transposed shape if transpose is enabled) "
-           << "as TensorDesc; \nwhen VNNI is enabled, the result should have "
-           << "one more dimention than the TensorDesc, with last dimention "
-           << "having vnni factor, \nbut having same number of total data "
-           << "elements. The vnni factor are typically calculated as "
-           << "simd_lane_width / elementTypeBitWidth. \nFor element type "
-           << "having more than 32 bits, vnni shouldn't be used. \nIn SIMT "
-           << "mode, the shape is derived from the mapping attributes.\n";
+    return emitOpError() <<"Result shape doesn't match TensorDesc shape."
+           << "The expected shape is " << makeString(tdescShape) << ". "
+           << "But the given shape is " << makeString(valueShape) << ".\n";
   return success();
 }
 
@@ -274,28 +265,25 @@ LogicalResult LoadNDOp::verify() {
 // XeGPU_StoreNDOp
 //===----------------------------------------------------------------------===//
 LogicalResult StoreNDOp::verify() {
-  auto dstTy = getTensorDesc().getType();                        // Tile
-  // auto valTy = llvm::dyn_cast<VectorType>(getValue().getType()); // Vector
+  auto dstTy = getTensorDesc().getType();               // Tile
   auto valTy = getValue().getType().cast<VectorType>(); // Vector
 
   if (dstTy.getRank() != 2)
-    return emitOpError(
-        "The TensorDesc for StoreNdOp should be a 2D TensorDesc.");
+    return emitOpError("Expecting a 2D TensorDesc shape.\n");
 
   if (!valTy)
-    return emitOpError("Invalid value operand, it should be a VectorType.\n");
+    return emitOpError("Exepcting a VectorType result.\n");
 
   auto dstElemTy = dstTy.getElementType();
   auto valElemTy = valTy.getElementType();
 
   if (dstElemTy != valElemTy) {
-    return emitOpError("The elem type of the value doesn't "
-                       "match the elem type of the TensorDesc.\n");
+    return emitOpError() << "The element type of the value should "
+                       "match the elementtype of the TensorDesc.\n";
   }
 
   if (dstTy.getShape() != valTy.getShape())
-    return emitOpError("The value shape doesn't match "
-                       "the TensorDesc shape.\n");
+    return emitOpError() << "The result shape should match the TensorDesc shape.\n";
   return success();
 }
 
diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
index cfb22ce2b8942f..f9b3510beb4335 100644
--- a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
+++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
@@ -8,7 +8,7 @@
 gpu.module @test {
 // CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> <8x16xf32>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   gpu.return
 }
@@ -17,40 +17,45 @@ gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
 gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
   //CHECK: %[[C:.*]] = arith.constant 1 : index
   %c1 = arith.constant 1 : index
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> <8x16xf32>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
   gpu.return
 }
 
+// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> <24x16xf32>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr<array_length = 2 : i64>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr<array_length = 2>>
   gpu.return
 }
 
-// CHECK-LABEL: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>, %x : index, %y : index) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> <24x32xf16>
-  %1 = xegpu.create_nd_tdesc %src[%x, %y] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
-  // CHECK: xegpu.prefetch_nd %[[REG]] {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>} : <8x16xf16>
+// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // CHECK: xegpu.prefetch_nd %[[R0]] {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>} : !xegpu.tensor_desc<8x16xf16>
   xegpu.prefetch_nd %1 {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}: !xegpu.tensor_desc<8x16xf16>
   gpu.return
 }
 
-// CHECK-LABEL: func @test_load_nd_vc({{.*}}) {
+// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) {
 gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, vnni_axis = 0 : i64} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
   %2 = xegpu.load_nd %1 {vnni_axis = 0, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>} 
        : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
   gpu.return
 }
 
-// CHECK-LABEL: func @test_store_nd_vc({{.*}}) {
+// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
 gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
+  // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16>
   %1 = arith.constant dense<1.0>: vector<24x32xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
   %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
-  xegpu.store_nd %1, %2 {l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}
-        : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+  // CHECK: xegpu.store_nd %[[C]], %[[R0]] {l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>} : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+  xegpu.store_nd %1, %2 {l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>} : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
   gpu.return
 }
 

>From ad27a81fd0fddbffb7e5b3529017f3c532b0db7d Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Sun, 10 Mar 2024 20:35:47 +0000
Subject: [PATCH 05/19] add XeGPU 2D block operators

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h    |   4 +
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  61 ++++
 .../mlir/Dialect/XeGPU/IR/XeGPUDialect.td     |   4 +-
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 211 +++++++++++++
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       | 104 ++++++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  72 ++++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 276 +++++++++++++++++-
 mlir/test/Dialect/XeGPU/XeGPUOps.mlir         |  62 ++++
 8 files changed, 787 insertions(+), 7 deletions(-)
 create mode 100644 mlir/test/Dialect/XeGPU/XeGPUOps.mlir

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 7aaa4ecc7ee77a..8dc3ff78d25ede 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -9,7 +9,11 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_H
 #define MLIR_DIALECT_XEGPU_IR_XEGPU_H
 
+#include <mlir/Bytecode/BytecodeOpInterface.h>
+#include <mlir/IR/BuiltinTypes.h>
 #include <mlir/IR/Dialect.h>
+#include <mlir/Interfaces/ShapedOpInterfaces.h>
+#include <mlir/Interfaces/SideEffectInterfaces.h>
 
 namespace mlir {
 namespace xegpu {
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index bb325c272e3324..cd38549f1ccf43 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -10,6 +10,7 @@
 #define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
 
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/EnumAttr.td"
 
 class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
                 string baseCppClass = "::mlir::Attribute">
@@ -17,4 +18,64 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
   let mnemonic = attrMnemonic;
 }
 
+def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
+  let parameters = (ins
+    OptionalParameter<"MemoryScopeAttr">: $memory_scope,
+    OptionalParameter<"IntegerAttr", "1">: $array_length,
+    OptionalParameter<"BoolAttr", "true">: $boundary_check
+  );
+
+  let builders = [
+    AttrBuilder<(ins
+      CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
+      CArg<"int", "1">:$array_length,
+      CArg<"bool", "true">: $boundary_check
+    )>
+  ];
+
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Memory Scope Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
+def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
+def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope", 
+      "The address space of the memory the tensor descritor is created for", 
+      [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_MemoryScopeAttr: 
+  EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
+    let assemblyFormat = "$value";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Cache Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_CachePolicyCached:        I32EnumAttrCase<"CACHED", 0, "cached">;                    // valid for read and write
+def XeGPU_CachePolicyUncached:      I32EnumAttrCase<"UNCACHED", 1, "uncached">;                // valid for read and write
+def XeGPU_CachePolicyStreaming:     I32EnumAttrCase<"STREAMING", 2, "streaming">;              // valid for read only
+def XeGPU_CachePolicyInvalid:       I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">;  // valid for read only
+def XeGPU_CachePolicyWriteBack:     I32EnumAttrCase<"WRITE_BACK", 4, "write_back">;            // valid for write only
+def XeGPU_CachePolicyWriteThrough:  I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">;      // valid for write only
+
+def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy", 
+  [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached, 
+   XeGPU_CachePolicyStreaming, XeGPU_CachePolicyInvalid,
+   XeGPU_CachePolicyWriteBack, XeGPU_CachePolicyWriteThrough]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_CacheHintAttr 
+  : EnumAttr<XeGPU_Dialect, XeGPU_CachePolicyEnums, "cache_hint"> {
+    let assemblyFormat = "`<` $value `>`";
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index 3851275ad30a0a..c2f09319c790e0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -23,8 +23,8 @@ def XeGPU_Dialect : Dialect {
       the lower-level GPU compiler.
     }];
 
-    // let useDefaultTypePrinterParser = true;
-    // let useDefaultAttributePrinterParser = true;
+    let useDefaultTypePrinterParser = true;
+    let useDefaultAttributePrinterParser = true;
 }
 
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUDIALECT_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 5825ef9195b03f..9d37d77e03a0c5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -12,6 +12,22 @@
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/IR/AttrTypeBase.td"
+
+
+include "mlir/IR/OpBase.td"
+include "mlir/IR/OpAsmInterface.td"
+include "mlir/IR/AttrTypeBase.td"
+include "mlir/IR/BuiltinTypes.td"
+include "mlir/IR/BuiltinTypeInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/ViewLikeInterface.td"
+include "mlir/Interfaces/CastInterfaces.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/Interfaces/CopyOpInterface.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+include "mlir/Interfaces/ShapedOpInterfaces.td"
 
 
 // Base class for dialect operations. This operation inherits from the base
@@ -23,4 +39,199 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
           Op<XeGPU_Dialect, mnemonic, traits>;
 
 
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> {
+
+  let summary = "create nd tensor descriptor operation";
+  let description = [{
+    The "create_nd_tdesc" operation creates a TensorDescType which represents
+    a sub-view of a 2D memory region (It can be extended to support N-D memory
+    region if needed in future). Elements in the subview continuous in each 
+    dimention. It encodes the following important information for supporting 
+    Intel hardware features:
+
+    * source: an object representing (starting address/pointer of) a 2D memory region. 
+        It can be either a 2D memref object, or simply a pointer represented by uint64_t type.
+        for the later case, the shape and layout information of the 2D memory region should 
+        be explicitly passed via `dynamic_shape` and `dynamic_strides` parameters.
+    * offsets: two index values represents offsets from the "source" at the each dimension 
+        at which the subview of the target memory will be created. It is encoded via two
+        variables, including "dynamic_offsets" and "static_offsets", such that it can
+        accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4])).
+    * shape: the shape information of the memory region pointed by the "source".  It is 
+        typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. 
+        But if "source" is simply a pointer represented as uint64_t type, or a memref 
+        type without shape information e.g., memref<?x?xf16>, the shape information has 
+        to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape" 
+        only accepts operands(e.g., [%c4096, %c4096]), not attributes(e.g., [4096, 4096]).
+    * strides: the strides of the memory region pointed by the "source". Similar to shape, 
+        it is typically encoded via the MemRefType of the source too. But if "source" is 
+        simply a pointer represented as uint64_t type, or a memref type without shape 
+        information e.g., memref<?x?xf16>, the strides information has to be explicitly 
+        passed via the "dynamic_strides" argument. And it currently only accepts operands two.
+
+    Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = memref.alloc() : memref<1024x1024xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0]: memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+    Example 2 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = memref.alloc(%h, %w) : memref<?x?xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+    Example 3 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = ... : ui64
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> !xegpu.tensor_desc<8x16xf32>
+  }];
+
+  let arguments = (ins 
+    XeGPU_BaseAddrType: $source, 
+    Variadic<Index>: $dynamic_offsets, 
+    Variadic<Index>: $dynamic_shape, 
+    Variadic<Index>: $dynamic_strides,
+    DenseI64ArrayAttr: $static_offsets
+  );
+  let results = (outs XeGPU_TensorDesc: $TensorDesc);
+
+  let assemblyFormat = [{
+    $source ``
+    custom<DynamicIndexList>($dynamic_offsets, $static_offsets)
+    (`,` `[` $dynamic_shape^ `]` `,` `[` $dynamic_strides `]`)?
+    attr-dict `:` type($source) `->` qualified(type($TensorDesc))
+  }];
+  let skipDefaultBuilders = 1;
+  let hasVerifier = 1;
+
+  let builders = [
+    OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, 
+                   "ValueRange": $shape, "ValueRange": $strides, 
+                   "llvm::ArrayRef<int64_t>": $static_offsets)>,
+
+    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
+                   "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+
+    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
+                   "llvm::ArrayRef<OpFoldResult>": $offsets,
+                   "ValueRange": $shape, "ValueRange": $stride)>
+  ];
+
+  let extraClassDeclaration = [{
+    /// Returns the type of the source memref operand.
+    Type getSourceType() {
+      return getSource().getType();
+    }
+
+    /// Returns the type of the result TensorDesc.
+    xegpu::TensorDescType getType() {
+      return getTensorDesc().getType();
+    }
+
+    /// Returns the offsets for the source. It consolidates information
+    /// from both the dynamic_offsets and static_offsets parameters. The
+    /// static_offsets parameter always has the expected rank, where some
+    /// dims may hold the ShapedType::kDynamic value, indicating that the
+    /// corresponding value comes from dynamic_offsets.
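+    /// For example, given "%1 = xegpu.create_nd_tdesc %src[%x, 0] ..."
+    /// (an illustrative sketch), static_offsets is [kDynamic, 0],
+    /// dynamic_offsets is [%x], and getOffsets() returns {%x, 0}.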
+    llvm::SmallVector<OpFoldResult> getOffsets();
+
+    /// Returns the shape of the source. It comes either from the
+    /// memref type, if the source is a memref with static shape
+    /// information, or from the dynamic_shape parameter. If both
+    /// exist, the dynamic_shape parameter is used and the shape
+    /// information from the memref type is ignored.
+    llvm::SmallVector<OpFoldResult> getShape();
+
+    /// Returns the strides of the source. They come either from the
+    /// memref type, if the source is a memref with static shape
+    /// information, or from the dynamic_strides parameter. If both
+    /// exist, the dynamic_strides parameter is used and the strides
+    /// information from the memref type is ignored.
+    llvm::SmallVector<OpFoldResult> getStrides();
+
+    /// Return the element type of the TensorDesc
+    Type getElementType() {
+      return getType().getElementType();
+    }
+
+    /// Return the shape of the TensorDesc
+    llvm::ArrayRef<int64_t> getTensorDescShape() {
+      return getType().getShape();
+    }
+  }];
+}
+
+def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
+  let summary = "prefetches a nD block to cache";
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  // Format: xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>, 
+  //                                   l2_hint = #xegpu.cache_hint<cached>, 
+  //                                   l3_hint = #xegpu.cache_hint<cached>}
+  //         : !xegpu.tensor_desc<8x16xf16>
+  let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc))";
+}
+
+
+def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
+  let summary = "loads a n-D block from memory (represented by TensorDesc)" 
+                "to registers (represented by vector)";
+  let description = [{
+    LoadNDOp essentially mimics the hardware block read instruction to read
+    a block of data from memory to registers. It takes a set of cache hints,
+    one for each level of cache (L1, L2 and L3). If the hardware does not
+    have a corresponding cache, the corresponding cache hint attribute will
+    be masked. If both transpose and vnni_axis are present, the transpose is
+    assumed to be applied first, followed by the vnni transform.
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<I64Attr>: $vnni_axis,
+                       OptionalAttr<DenseI64ArrayAttr>: $transpose,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let results = (outs XeGPU_ValueType: $value);
+
+  let extraClassDeclaration = [{
+    VectorType getType() {
+      return llvm::dyn_cast<VectorType>(getValue().getType());
+    }
+
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
+
+  // Format: xegpu.load_nd %1 {transpose = [1, 0], 
+  //                l1_hint = #xegpu.cache_hint<cached>, 
+  //                l2_hint = #xegpu.cache_hint<uncached>, 
+  //                l3_hint = #xegpu.cache_hint<streaming>}
+  //         : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+  let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
+  let hasVerifier = 1;
+}
+
+def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> {
+  let summary = "stores a n-D block register region back to memory, currently only supports 2D";
+  let arguments = (ins XeGPU_ValueType: $value,
+                       XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  // Format: xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+  //                                l2_hint = #xegpu.cache_hint<write_back>, 
+  //                                l3_hint = #xegpu.cache_hint<write_through>}
+  //                                : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+  let assemblyFormat = "$value `,` $TensorDesc attr-dict `:` type($value) `,` qualified(type($TensorDesc))";
+  let hasVerifier = 1;
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 1d75bb4e2906fe..36b04ea12bcad0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -9,9 +9,9 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
 #define MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
 
-include "mlir/IR/BuiltinTypes.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/BuiltinTypes.td"
 
 def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>;
 def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
@@ -30,4 +30,106 @@ class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
   let mnemonic = typeMnemonic;
 }
 
+def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
+        [ShapedTypeInterface], "::mlir::TensorType"> {
+  let summary = "TensorDesc describing regions of interested data.";
+  let description = [{
+    TensorDesc is a type designed to describe regions of interest in the data, as well
+    as some features that are unique to Intel hardware. Unlike the builtin tensor type
+    in MLIR, it essentially only contains the metadata and doesn't hold the data by
+    itself. It is designed mainly to support 2D block load/store and DPAS (the matrix
+    multiplication instruction) on Intel GPUs. It encodes the following information:
+
+    * shape:  the sizes/shape of the data block of interest, e.g., 8x16 means 8 rows
+              and each row contains 16 contiguous data elements. The rows could be
+              either contiguous or not, depending on whether the encoding attribute
+              is set.
+    * element_type: the data type of the data elements, e.g., f16, f32.
+
+    Similar to the builtin tensor, it also provides an optional attribute to encode
+    the following information via the TensorDescAttr object:
+    * memory_scope (xegpu::MemoryScope): [optional] where the data is located,
+                global memory or shared memory. It defaults to Global.
+    * array_length (int): [optional] the number of contiguous blocks with size `shape`
+               that will be loaded by a block load at a time. It defaults to 1.
+    * boundary_check (bool): [optional] indicates whether the operation detects the
+                boundary and pads with zero for out-of-boundary access. It defaults to true.
+    
+
+    Syntax:
+
+    ```
+    TensorDesc-type ::= `tensor_desc` `<` dim-list element-type (attr-list)? `>`
+    element-type ::= float-type | integer-type | index-type
+    dim-list := (static-dim-list `x`)?
+    static-dim-list ::= decimal-literal `x` decimal-literal
+    attr-list = (, memory_scope = value)? (, array_length = value)? (, boundary_check = value)?
+    ```
+
+    Examples:
+
+    ```mlir
+    // A block TensorDesc with 8x16 i32 elements
+    xegpu.tensor_desc<8x16xi32>
+
+    // A block TensorDesc with 8x16 f32 elements
+    xegpu.tensor_desc<8x16xf32>
+
+    // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
+    xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+    ```
+  }];
+
+  let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
+                        "mlir::Type": $elementType,
+                        OptionalParameter<"mlir::Attribute">: $encoding);
+
+  let extraClassDeclaration = [{
+    using TensorType::clone;
+    using mlir::ShapedType::Trait<TensorDescType>::getElementTypeBitWidth;
+    using mlir::ShapedType::Trait<TensorDescType>::getRank;
+    using mlir::ShapedType::Trait<TensorDescType>::getNumElements;
+    using mlir::ShapedType::Trait<TensorDescType>::isDynamicDim;
+    using mlir::ShapedType::Trait<TensorDescType>::hasStaticShape;
+    using mlir::ShapedType::Trait<TensorDescType>::getNumDynamicDims;
+    using mlir::ShapedType::Trait<TensorDescType>::getDimSize;
+    using mlir::ShapedType::Trait<TensorDescType>::getDynamicDimIndex;
+
+    TensorDescType clone(::mlir::Type elementType) {
+      return llvm::cast<TensorDescType>(cloneWith(getShape(), elementType));
+    }
+
+    TensorDescAttr getEncodingAsTensorDescAttr() const {
+      return llvm::dyn_cast_if_present<TensorDescAttr>(getEncoding());
+    }
+
+    xegpu::MemoryScope getMemoryScope() const {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getMemoryScope())
+        return attr.getMemoryScope().getValue();
+      // return default value
+      return MemoryScope::Global;
+    }
+
+    int getArrayLength() {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getArrayLength())
+        return attr.getArrayLength().getInt();
+      // return default value
+      return 1; 
+    }
+
+    bool getBoundaryCheck() {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getBoundaryCheck())
+        return attr.getBoundaryCheck().getValue();
+      // return default value
+      return true;
+    }
+  }];
+
+  let hasCustomAssemblyFormat = true;
+  
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 4f839ee773476b..bd72d5c17b6ea1 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -6,7 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <llvm/ADT/TypeSwitch.h>
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h>
+#include <mlir/IR/Builders.h>
+#include <mlir/IR/DialectImplementation.h>
 
 namespace mlir {
 namespace xegpu {
@@ -26,8 +29,73 @@ void XeGPUDialect::initialize() {
       >();
 }
 
-// this file is for position occupation,
-// we will add functions in following PRs.
+
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescAttr
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescType
+//===----------------------------------------------------------------------===//
+mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
+  llvm::SmallVector<int64_t> shape;
+  mlir::Type elementType;
+  mlir::FailureOr<mlir::Attribute> encoding;
+
+  // Parse literal '<'
+  if (parser.parseLess())
+    return {};
+
+  auto shapeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseDimensionList(shape))) {
+    parser.emitError(shapeLoc, "failed to parse parameter 'shape'");
+    return {};
+  }
+
+  auto elemTypeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseType(elementType))) {
+    parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'");
+    return {};
+  }
+
+  // parse optional attributes
+  if (mlir::succeeded(parser.parseOptionalComma())) {
+    encoding = mlir::FieldParser<mlir::Attribute>::parse(parser);
+    if (mlir::failed(encoding)) {
+      parser.emitError(parser.getCurrentLocation(),
+          "Failed to parse the attribute field for TensorDescType.\n");
+      return {};
+    }
+  }
+
+  // Parse literal '>'
+  if (parser.parseGreater())
+    return {};
+
+  return TensorDescType::get(parser.getContext(), shape, elementType,
+                             encoding.value_or(mlir::Attribute()));
+}
+
+void TensorDescType::print(::mlir::AsmPrinter &printer) const {
+  printer << "<";
+
+  auto shape = getShape();
+  for (int64_t dim : shape) {
+    if (mlir::ShapedType::isDynamic(dim))
+      printer << '?';
+    else
+      printer << dim;
+    printer << 'x';
+  }
+
+  printer << getElementType();
+
+  if (auto encoding = getEncoding())
+    printer << ", " << encoding;
+
+  printer << ">";
+}
+
 
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 0e89ac4df6ef28..a388db4f5c2dc6 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -6,14 +6,286 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <mlir/Dialect/Utils/StaticValueUtils.h>
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h>
+#include <mlir/Interfaces/ViewLikeInterface.h>
+#include <mlir/IR/Builders.h>
 
 #define DEBUG_TYPE "xegpu"
 
 namespace mlir {
 namespace xegpu {
-// this file is for position occupation,
-// we will add functions in following PRs.
+
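+// Returns the rank of the given value: 0 for scalar int/index/float values,
+// and the type rank for memref or vector values.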
+static size_t getRankOf(Value value) {
+  if (value.getType().isIntOrIndexOrFloat())
+    return 0;
+  if (auto ty = llvm::dyn_cast_if_present<MemRefType>(value.getType()))
+    return ty.getRank();
+  if (auto ty = llvm::dyn_cast_if_present<VectorType>(value.getType()))
+    return ty.getRank();
+  llvm_unreachable("Unsupported value for getRankOf");
+}
+
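+// Permutes `shape` in place according to the permutation `trans`, e.g.,
+// trans = [1, 0] swaps the two dimensions.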
+static void transpose(llvm::ArrayRef<int64_t> trans,
+                      std::vector<int64_t> &shape) {
+  std::vector<int64_t> old = shape;
+  for (size_t i = 0; i < trans.size(); i++)
+    shape[i] = old[trans[i]];
+}
+
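+// Renders an array (e.g., a shape) as a string like "[8, 16]" for use in
+// diagnostics; assumes the array is non-empty.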
+template <typename T>
+static std::string makeString(T array, bool breakline = false) {
+  std::string buf;
+  buf.clear();
+  llvm::raw_string_ostream os(buf);
+  os << "[";
+  for (size_t i = 1; i < array.size(); i++) {
+    os << array[i - 1] << ", ";
+    if (breakline)
+      os << "\n\t\t";
+  }
+  os << array.back() << "]";
+  os.flush();
+  return buf;
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_CreateNdDescOp
+//===----------------------------------------------------------------------===//
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type TensorDesc, Value source, ValueRange offsets,
+                           ValueRange shape, ValueRange strides,
+                           llvm::ArrayRef<int64_t> static_offsets) {
+  auto offsetRank = static_offsets.size();
+  auto shapeRank = shape.size() ? shape.size() : getRankOf(source);
+
+  size_t dynOffsetRank =
+      std::count_if(static_offsets.begin(), static_offsets.end(),
+                    [](int64_t d) { return ShapedType::isDynamic(d); });
+
+  // shape and strides should exist at the same time
+  // and the final rank for shape and offset (dynamic + static)
+  // should be the same
+  assert(shape.size() == strides.size() && shapeRank == offsetRank &&
+         offsets.size() == dynOffsetRank);
+
+  state.addOperands(source);
+  state.addOperands(offsets);
+  state.addOperands(shape);
+  state.addOperands(strides);
+  state.addAttribute(
+      getOperandSegmentSizesAttrName(state.name),
+      builder.getDenseI32ArrayAttr({1, static_cast<int32_t>(offsets.size()),
+                                    static_cast<int32_t>(shape.size()),
+                                    static_cast<int32_t>(strides.size())}));
+  state.addAttribute(getStaticOffsetsAttrName(state.name),
+                     builder.getDenseI64ArrayAttr(static_offsets));
+  state.addTypes(TensorDesc);
+}
+
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type tdesc, Value source,
+                           llvm::ArrayRef<OpFoldResult> offsets) {
+  auto ty = llvm::dyn_cast_if_present<MemRefType>(source.getType());
+  assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source));
+
+  llvm::SmallVector<int64_t> staticOffsets;
+  llvm::SmallVector<Value> dynamicOffsets;
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+  build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */,
+        ValueRange({}) /* empty dynamic shape */,
+        ValueRange({}) /* empty dynamic strides */,
+        staticOffsets /* static offsets */);
+}
+
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type tdesc, Value source,
+                           llvm::ArrayRef<OpFoldResult> offsets,
+                           ValueRange shape, ValueRange stride) {
+  assert(shape.size() && offsets.size() && stride.size() &&
+         shape.size() == stride.size() && shape.size() == offsets.size());
+
+  llvm::SmallVector<int64_t> staticOffsets;
+  llvm::SmallVector<Value> dynamicOffsets;
+
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+  build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets,
+        /* dynamic shape = */ shape , /* dynamic strides = */ stride,
+        /* static offsets = */ staticOffsets);
+}
+
+
+LogicalResult CreateNdDescOp::verify() {
+  auto offsetRank = getOffsets().size();
+  auto shapeRank = getShape().size();
+  auto stridesRank = getStrides().size();
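+  // Integer sources (e.g., a ui64 pointer) have rank 0; treat them as 2D.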
+  auto baseRank = getRankOf(getSource()) ? getRankOf(getSource()) : 2;
+
+  if (offsetRank != shapeRank || shapeRank != stridesRank ||
+      shapeRank != baseRank)
+    return emitOpError(
+        "Expecting the ranks of shape, strides, offsets and the memref type "
+        "to match each other (currently they should all be 2D).");
+  return success();
+}
+
+// compute consolidated offsets from dynamic_offsets and static_offsets parameters
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
+  llvm::SmallVector<OpFoldResult> offsets;
+  auto dynamicOffsets = getDynamicOffsets(); // dynamic_offsets variable
+  auto staticOffsets = getStaticOffsets();   // static_offsets attribute
+
+  // in case static_offsets is missing, dynamic_offsets will be used
+  if (staticOffsets.size() == 0) {
+    offsets.assign(dynamicOffsets.begin(), dynamicOffsets.end());
+    return offsets;
+  }
+
+  // use static offsets for each dim if it has valid value, 
+  // otherwise use the value from dynamic_offsets
+  for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) {
+    if (ShapedType::isDynamic(staticOffsets[i])) {
+      assert(j < dynamicOffsets.size());
+      offsets.push_back(dynamicOffsets[j++]);
+    } else {
+      auto ty = IndexType::get(getContext());
+      auto attr = IntegerAttr::get(ty, staticOffsets[i]);
+      offsets.push_back(attr);
+    }
+  }
+  return offsets;
+}
+
+// get the consolidated shape of the 2D memory region. 
+// It prefer dynamic_shape than the static shape of 
+// memref type.
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getShape() {
+  llvm::SmallVector<OpFoldResult> shape;
+  auto dynShape = getDynamicShape();
+  if (dynShape.size()) {
+    shape.append(dynShape.begin(), dynShape.end());
+    return shape;
+  }
+
+  auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
+  if (ty && ty.hasStaticShape()) {
+    for (auto dim : ty.getShape()) {
+      auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
+      shape.push_back(attr);
+    }
+    return shape;
+  }
+  
+  this->emitError("The shape information of the memory is missing.\n");
+  return {};
+}
+
+// get the consolidated strides of the 2D memory region. 
+// It prefer dynamic_stride than the static strides of 
+// memref type.
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getStrides() {
+  llvm::SmallVector<OpFoldResult> strides;
+
+  auto dynStrides = getDynamicStrides();
+  if (dynStrides.size()) {
+    strides.append(dynStrides.begin(), dynStrides.end());
+    return strides;
+  }
+
+  auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
+  if (ty && ty.hasStaticShape()) {
+    auto [staticStrides, offset] = getStridesAndOffset(ty);
+    for (auto dim : staticStrides) {
+      auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
+      strides.push_back(attr);
+    }
+    return strides;
+  }
+
+  this->emitError("The strides information of the memory is missing.\n");
+  return {};
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_LoadNDOp
+//===----------------------------------------------------------------------===//
+LogicalResult LoadNDOp::verify() {
+  auto tdescTy = getTensorDescType();
+  auto valueTy = getType();
+
+  if (tdescTy.getRank() != 2)
+    return emitOpError(
+        "The TensorDesc for LoadNDOp should be a 2D TensorDesc.");
+
+  if (!valueTy)
+    return emitOpError("Invalid result, it should be a VectorType.\n");
+
+  auto tdescElemTy = tdescTy.getElementType();
+  auto valueElemTy = valueTy.getElementType();
+
+  if (tdescElemTy != valueElemTy)
+    return emitOpError(
+        "Value should have the same element type as TensorDesc.");
+
+  auto array_len = tdescTy.getArrayLength();
+  auto tdescShape = tdescTy.getShape().vec();
+  auto valueShape = valueTy.getShape().vec();
+
+  if (getTranspose()) {
+    auto trans = getTranspose().value();
+    if (tdescShape.size() >= trans.size())
+      transpose(trans, tdescShape);
+    else
+      emitWarning("Invalid transpose attr. It is ignored.");
+  }
+
+  if (getVnniAxis()) {
+    auto axis = getVnniAxis().value();
+    auto vnni_factor = valueShape.back();
+    tdescShape[axis] /= vnni_factor;
+    tdescShape.push_back(vnni_factor);
+  }
+
+  if (array_len > 1) {
+    auto it = tdescShape.begin();
+    tdescShape.insert(it, array_len);
+  }
+
+  if (tdescShape != valueShape)
+    return emitOpError() <<"Result shape doesn't match TensorDesc shape."
+           << "The expected shape is " << makeString(tdescShape) << ". "
+           << "But the given shape is " << makeString(valueShape) << ".\n";
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_StoreNDOp
+//===----------------------------------------------------------------------===//
+LogicalResult StoreNDOp::verify() {
+  auto dstTy = getTensorDesc().getType();               // Tile
+  auto valTy = getValue().getType().cast<VectorType>(); // Vector
+
+  if (dstTy.getRank() != 2)
+    return emitOpError("Expecting a 2D TensorDesc shape.\n");
+
+  if (!valTy)
+    return emitOpError("Exepcting a VectorType result.\n");
+
+  auto dstElemTy = dstTy.getElementType();
+  auto valElemTy = valTy.getElementType();
+
+  if (dstElemTy != valElemTy) {
+    return emitOpError() << "The element type of the value should "
+                       "match the elementtype of the TensorDesc.\n";
+  }
+
+  if (dstTy.getShape() != valTy.getShape())
+    return emitOpError() << "The result shape should match the TensorDesc shape.\n";
+  return success();
+}
 
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
new file mode 100644
index 00000000000000..f9b3510beb4335
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
@@ -0,0 +1,62 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: gpu.module @test {
+gpu.module @test {
+// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
+gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+  //CHECK: %[[C:.*]] = arith.constant 1 : index
+  %c1 = arith.constant 1 : index
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr<array_length = 2 : i64>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr<array_length = 2>>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // CHECK: xegpu.prefetch_nd %[[R0]] {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>} : !xegpu.tensor_desc<8x16xf16>
+  xegpu.prefetch_nd %1 {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}: !xegpu.tensor_desc<8x16xf16>
+  gpu.return
+}
+
+// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, vnni_axis = 0 : i64} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+  %2 = xegpu.load_nd %1 {vnni_axis = 0, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>} 
+       : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+  gpu.return
+}
+
+// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
+  // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16>
+  %1 = arith.constant dense<1.0>: vector<24x32xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  // CHECK: xegpu.store_nd %[[C]], %[[R0]] {l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>} : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+  xegpu.store_nd %1, %2 {l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>} : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+  gpu.return
+}
+
+}
\ No newline at end of file
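
A round-trip sketch of the TensorDescType custom parser/printer added above
(illustrative; the attribute names follow the TensorDescAttr parameters): a
fully attributed descriptor spells out as

  !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, array_length = 2 : i64, boundary_check = true>>

where each field is optional; when omitted, the getters fall back to global
memory, an array_length of 1, and boundary_check = true.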

>From 74bd038f61985874694c01023c16f04e070e1419 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Sun, 10 Mar 2024 20:38:46 +0000
Subject: [PATCH 06/19] run clang-format

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp |  5 ++--
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp     | 33 ++++++++++++----------
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index bd72d5c17b6ea1..43337a6ab43dcd 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -29,7 +29,6 @@ void XeGPUDialect::initialize() {
       >();
 }
 
-
 //===----------------------------------------------------------------------===//
 // XeGPU_TensorDescAttr
 //===----------------------------------------------------------------------===//
@@ -62,7 +61,8 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
   if (mlir::succeeded(parser.parseOptionalComma())) {
     encoding = mlir::FieldParser<mlir::Attribute>::parse(parser);
     if (mlir::failed(encoding)) {
-      parser.emitError(parser.getCurrentLocation(),
+      parser.emitError(
+          parser.getCurrentLocation(),
           "Failed to parse the attribute field for TensorDescType.\n");
       return {};
     }
@@ -96,7 +96,6 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const {
   printer << ">";
 }
 
-
 } // namespace xegpu
 } // namespace mlir
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index a388db4f5c2dc6..be631c4678eacb 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -8,8 +8,8 @@
 
 #include <mlir/Dialect/Utils/StaticValueUtils.h>
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h>
-#include <mlir/Interfaces/ViewLikeInterface.h>
 #include <mlir/IR/Builders.h>
+#include <mlir/Interfaces/ViewLikeInterface.h>
 
 #define DEBUG_TYPE "xegpu"
 
@@ -112,11 +112,10 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
 
   build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets,
-        /* dynamic shape = */ shape , /* dynamic strides = */ stride,
+        /* dynamic shape = */ shape, /* dynamic strides = */ stride,
         /* static offsets = */ staticOffsets);
 }
 
-
 LogicalResult CreateNdDescOp::verify() {
   auto offsetRank = getOffsets().size();
   auto shapeRank = getShape().size();
@@ -132,7 +131,8 @@ LogicalResult CreateNdDescOp::verify() {
   return success();
 }
 
-// compute consolidated offsets from dynamic_offsets and static_offsets parameters
+// compute consolidated offsets from dynamic_offsets and static_offsets
+// parameters
 llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
   llvm::SmallVector<OpFoldResult> offsets;
   auto dynamicOffsets = getDynamicOffsets(); // dynamic_offsets variable
@@ -144,7 +144,7 @@ llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
     return offsets;
   }
 
-  // use static offsets for each dim if it has valid value, 
+  // use static offsets for each dim if it has a valid value,
   // otherwise use the value from dynamic_offsets
   for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) {
     if (ShapedType::isDynamic(staticOffsets[i])) {
@@ -159,8 +159,8 @@ llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
   return offsets;
 }
 
-// get the consolidated shape of the 2D memory region. 
-// It prefer dynamic_shape than the static shape of 
+// get the consolidated shape of the 2D memory region.
+// It prefers dynamic_shape over the static shape of
 // memref type.
 llvm::SmallVector<OpFoldResult> CreateNdDescOp::getShape() {
   llvm::SmallVector<OpFoldResult> shape;
@@ -178,13 +178,13 @@ llvm::SmallVector<OpFoldResult> CreateNdDescOp::getShape() {
     }
     return shape;
   }
-  
+
   this->emitError("The shape information of the memory is missing.\n");
   return {};
 }
 
-// get the consolidated strides of the 2D memory region. 
-// It prefer dynamic_stride than the static strides of 
+// get the consolidated strides of the 2D memory region.
+// It prefers dynamic_stride over the static strides of
 // memref type.
 llvm::SmallVector<OpFoldResult> CreateNdDescOp::getStrides() {
   llvm::SmallVector<OpFoldResult> strides;
@@ -255,9 +255,11 @@ LogicalResult LoadNDOp::verify() {
   }
 
   if (tdescShape != valueShape)
-    return emitOpError() <<"Result shape doesn't match TensorDesc shape."
-           << "The expected shape is " << makeString(tdescShape) << ". "
-           << "But the given shape is " << makeString(valueShape) << ".\n";
+    return emitOpError() << "Result shape doesn't match TensorDesc shape."
+                         << "The expected shape is " << makeString(tdescShape)
+                         << ". "
+                         << "But the given shape is " << makeString(valueShape)
+                         << ".\n";
   return success();
 }
 
@@ -279,11 +281,12 @@ LogicalResult StoreNDOp::verify() {
 
   if (dstElemTy != valElemTy) {
     return emitOpError() << "The element type of the value should "
-                       "match the elementtype of the TensorDesc.\n";
+                            "match the elementtype of the TensorDesc.\n";
   }
 
   if (dstTy.getShape() != valTy.getShape())
-    return emitOpError() << "The result shape should match the TensorDesc shape.\n";
+    return emitOpError()
+           << "The result shape should match the TensorDesc shape.\n";
   return success();
 }
 

>From 778d4d2c09eed97231db300614387e6bd3fb1608 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 11 Mar 2024 23:18:35 +0000
Subject: [PATCH 07/19] sync for OffsetSizeAndStrideOpInterface

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h    | 12 ++--
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 59 +++++++++++--------
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 45 ++++++++------
 3 files changed, 67 insertions(+), 49 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 8dc3ff78d25ede..662fd7ef197414 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -9,11 +9,13 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_H
 #define MLIR_DIALECT_XEGPU_IR_XEGPU_H
 
-#include <mlir/Bytecode/BytecodeOpInterface.h>
-#include <mlir/IR/BuiltinTypes.h>
-#include <mlir/IR/Dialect.h>
-#include <mlir/Interfaces/ShapedOpInterfaces.h>
-#include <mlir/Interfaces/SideEffectInterfaces.h>
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/Interfaces/ShapedOpInterfaces.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
+
 
 namespace mlir {
 namespace xegpu {
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 9d37d77e03a0c5..d8eba0588c7c86 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -9,26 +9,13 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
 #define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
 
+include "mlir/IR/AttrTypeBase.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
-include "mlir/Interfaces/SideEffectInterfaces.td"
-include "mlir/IR/AttrTypeBase.td"
-
-
-include "mlir/IR/OpBase.td"
-include "mlir/IR/OpAsmInterface.td"
-include "mlir/IR/AttrTypeBase.td"
-include "mlir/IR/BuiltinTypes.td"
-include "mlir/IR/BuiltinTypeInterfaces.td"
+include "mlir/Interfaces/ShapedOpInterfaces.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/ViewLikeInterface.td"
-include "mlir/Interfaces/CastInterfaces.td"
-include "mlir/Interfaces/ControlFlowInterfaces.td"
-include "mlir/Interfaces/CopyOpInterface.td"
-include "mlir/Interfaces/InferTypeOpInterface.td"
-include "mlir/Interfaces/ShapedOpInterfaces.td"
-
 
 // Base class for dialect operations. This operation inherits from the base
 // `Op` class in OpBase.td, and provides:
@@ -39,7 +26,8 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
           Op<XeGPU_Dialect, mnemonic, traits>;
 
 
-def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> {
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, 
+  AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
 
   let summary = "create nd tensor descriptor operation";
   let description = [{
@@ -90,17 +78,20 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg
 
   let arguments = (ins 
     XeGPU_BaseAddrType: $source, 
-    Variadic<Index>: $dynamic_offsets, 
-    Variadic<Index>: $dynamic_shape, 
-    Variadic<Index>: $dynamic_strides,
-    DenseI64ArrayAttr: $static_offsets
+    Variadic<Index>: $offsets, 
+    Variadic<Index>: $shape, 
+    Variadic<Index>: $strides,
+    DenseI64ArrayAttr: $static_offsets,
+    DefaultValuedAttr<DenseI64ArrayAttr, "{0, 0}">: $static_shape,
+    DefaultValuedAttr<DenseI64ArrayAttr, "{0, 0}">: $static_strides
   );
   let results = (outs XeGPU_TensorDesc: $TensorDesc);
 
   let assemblyFormat = [{
     $source ``
-    custom<DynamicIndexList>($dynamic_offsets, $static_offsets)
-    (`,` `[` $dynamic_shape^ `]` `,` `[` $dynamic_strides `]`)?
+    custom<DynamicIndexList>($offsets, $static_offsets)
+    (`,` custom<DynamicIndexList>($shape, $static_shape)^
+     `,` custom<DynamicIndexList>($strides, $static_strides))?
     attr-dict `:` type($source) `->` qualified(type($TensorDesc))
   }];
   let skipDefaultBuilders = 1;
@@ -135,21 +126,21 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg
     /// parameters. static_offsets parameter always has the expected
     /// rank, where some dims may have the ShapedType::kDynamic value
     /// indicating the corresponding value should be from dynamic_offsets.
-    llvm::SmallVector<OpFoldResult> getOffsets();
+    llvm::SmallVector<OpFoldResult> getEffectiveOffsets();
 
     /// returns the shape info of the source. It is either from the
     /// memref type, if source is a memref with static shape
     /// information or from the dynamic_shape parameter. If both
     /// exist, the dynamic_shape parameter will be used and the
     /// shape information from the memref type will be ignored.
-    llvm::SmallVector<OpFoldResult> getShape();
+    llvm::SmallVector<OpFoldResult> getEffectiveShape();
 
     /// returns the strides info of the source. It is either from the
     /// memref type, if source is a memref with static shape
     /// information or from the dynamic_stride parameter. If both
     /// exist, the dynamic_strides parameter will be used and the
     /// strides information from the memref type will be ignored.
-    llvm::SmallVector<OpFoldResult> getStrides();
+    llvm::SmallVector<OpFoldResult> getEffectiveStrides();
 
     /// Return the element type of the TensorDesc
     Type getElementType() {
@@ -160,6 +151,24 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg
     llvm::ArrayRef<int64_t> getTensorDescShape() {
       return getType().getShape();
     }
+
+    /// Return the expected rank of each of the `static_offsets`, `static_sizes`
+    /// and `static_strides` attributes.
+    std::array<unsigned, 3> getArrayAttrMaxRanks() {
+      return {2, 2, 2};
+    }
+
+    mlir::OperandRange getSizes() {
+      return getShape();
+    }
+    
+    llvm::ArrayRef<int64_t> getStaticSizes() {
+      return getStaticShape();
+    }
+
+    /// Return the number of leading operands before the `offsets`, `sizes` and
+    /// `strides` operands.
+    static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
   }];
 }
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index a388db4f5c2dc6..34d1a90a50a488 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -8,9 +8,10 @@
 
 #include <mlir/Dialect/Utils/StaticValueUtils.h>
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h>
-#include <mlir/Interfaces/ViewLikeInterface.h>
 #include <mlir/IR/Builders.h>
 
+#include <llvm/Support/Debug.h>
+
 #define DEBUG_TYPE "xegpu"
 
 namespace mlir {
@@ -118,24 +119,30 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
 
 
 LogicalResult CreateNdDescOp::verify() {
-  auto offsetRank = getOffsets().size();
-  auto shapeRank = getShape().size();
-  auto stridesRank = getStrides().size();
-  auto baseRank = getRankOf(getSource()) ? getRankOf(getSource()) : 2;
-
-  if (offsetRank != shapeRank || shapeRank != stridesRank ||
-      shapeRank != baseRank)
-
-    return emitOpError(
-        "Expecting the rank of shape, strides, offsets and memref type "
-        "should match with each other (they currently should be 2D).");
+  // auto offsetRank = getEffectiveOffsets().size();
+  // auto shapeRank = getEffectiveShape().size();
+  // auto stridesRank = getEffectiveStrides().size();
+  // auto baseRank = getRankOf(getSource()) ? getRankOf(getSource()) : 2;
+
+  llvm::dbgs() << "\nNum of mixed Offsets: " << getMixedOffsets().size()
+               << "\nNum of mixed Sizes: " << getMixedSizes().size()
+               << "\nNum of mixed Strides: " << getMixedStrides().size()
+               << "\n";
+
+  // if (offsetRank != shapeRank || shapeRank != stridesRank ||
+  //     shapeRank != baseRank)
+
+  //   return emitOpError(
+  //       "Expecting the rank of shape, strides, offsets and memref type "
+  //       "should match with each other (they currently should be 2D).");
   return success();
 }
 
-// compute consolidated offsets from dynamic_offsets and static_offsets parameters
-llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
+// compute consolidated offsets from dynamic_offsets and static_offsets
+// parameters
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getEffectiveOffsets() {
   llvm::SmallVector<OpFoldResult> offsets;
-  auto dynamicOffsets = getDynamicOffsets(); // dynamic_offsets variable
+  auto dynamicOffsets = getOffsets(); // offsets variable
   auto staticOffsets = getStaticOffsets();   // static_offsets attribute
 
   // in case static_offsets is missing, dynamic_offsets will be used
@@ -162,9 +169,9 @@ llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
 // get the consolidated shape of the 2D memory region. 
 // It prefers dynamic_shape over the static shape of
 // memref type.
-llvm::SmallVector<OpFoldResult> CreateNdDescOp::getShape() {
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getEffectiveShape() {
   llvm::SmallVector<OpFoldResult> shape;
-  auto dynShape = getDynamicShape();
+  auto dynShape = getShape();
   if (dynShape.size()) {
     shape.append(dynShape.begin(), dynShape.end());
     return shape;
@@ -186,10 +193,10 @@ llvm::SmallVector<OpFoldResult> CreateNdDescOp::getShape() {
 // get the consolidated strides of the 2D memory region. 
 // It prefers dynamic_stride over the static strides of
 // memref type.
-llvm::SmallVector<OpFoldResult> CreateNdDescOp::getStrides() {
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getEffectiveStrides() {
   llvm::SmallVector<OpFoldResult> strides;
 
-  auto dynStrides = getDynamicStrides();
+  auto dynStrides = getStrides();
   if (dynStrides.size()) {
     strides.append(dynStrides.begin(), dynStrides.end());
     return strides;

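A note for reviewers on the OffsetSizeAndStrideOpInterface wiring above: the
interface's getMixedOffsets()/getMixedSizes()/getMixedStrides() return the
same consolidated view that getEffectiveOffsets() computes by hand, i.e.
static array entries equal to ShapedType::kDynamic are placeholders replaced
by the next variadic SSA operand. A minimal sketch of that merge, assuming a
free-standing helper (mergeMixed is an illustrative name, not part of the
patch):

  #include "mlir/IR/Builders.h"
  #include "mlir/IR/BuiltinTypes.h"

  static llvm::SmallVector<mlir::OpFoldResult>
  mergeMixed(llvm::ArrayRef<int64_t> staticVals,
             mlir::ValueRange dynamicVals, mlir::Builder &b) {
    llvm::SmallVector<mlir::OpFoldResult> result;
    size_t j = 0;
    for (int64_t s : staticVals) {
      if (mlir::ShapedType::isDynamic(s))
        result.push_back(dynamicVals[j++]); // placeholder: take next SSA value
      else
        result.push_back(b.getIndexAttr(s)); // constant: keep as an attribute
    }
    return result;
  }
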
>From 3c37828ce6bb54c1e4af99a3726eb898fd55b61b Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 12 Mar 2024 00:13:55 +0000
Subject: [PATCH 08/19] sync

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |  83 +++++++-----
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 126 +++++++++---------
 2 files changed, 109 insertions(+), 100 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index d8eba0588c7c86..447ea2e0f3982b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -81,34 +81,31 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure,
     Variadic<Index>: $offsets, 
     Variadic<Index>: $shape, 
     Variadic<Index>: $strides,
-    DenseI64ArrayAttr: $static_offsets,
-    DefaultValuedAttr<DenseI64ArrayAttr, "{0, 0}">: $static_shape,
-    DefaultValuedAttr<DenseI64ArrayAttr, "{0, 0}">: $static_strides
+    DenseI64ArrayAttr: $static_offsets
   );
   let results = (outs XeGPU_TensorDesc: $TensorDesc);
 
   let assemblyFormat = [{
     $source ``
     custom<DynamicIndexList>($offsets, $static_offsets)
-    (`,` custom<DynamicIndexList>($shape, $static_shape)^
-     `,` custom<DynamicIndexList>($strides, $static_strides))?
+    (`,` `[` $shape^ `]` `,` `[` $strides `]`)?
     attr-dict `:` type($source) `->` qualified(type($TensorDesc))
   }];
-  let skipDefaultBuilders = 1;
-  let hasVerifier = 1;
-
-  let builders = [
-    OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, 
-                   "ValueRange": $shape, "ValueRange": $strides, 
-                   "llvm::ArrayRef<int64_t>": $static_offsets)>,
 
-    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
-                   "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+  let hasVerifier = 1;
 
-    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
-                   "llvm::ArrayRef<OpFoldResult>": $offsets,
-                   "ValueRange": $shape, "ValueRange": $stride)>
-  ];
+//  let builders = [
+//    OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, 
+//                   "ValueRange": $shape, "ValueRange": $strides, 
+//                   "llvm::ArrayRef<int64_t>": $static_offsets)>,
+//
+//    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
+//                   "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+//
+//    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
+//                   "llvm::ArrayRef<OpFoldResult>": $offsets,
+//                   "ValueRange": $shape, "ValueRange": $stride)>
+//  ];
 
   let extraClassDeclaration = [{
     /// Returns the type of the source memref operand.
@@ -121,6 +118,35 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure,
       return getTensorDesc().getType();
     }
 
+    /// Return the element type of the TensorDesc
+    Type getElementType() {
+      return getType().getElementType();
+    }
+
+
+
+    /// Return the shape of the TensorDesc
+    llvm::ArrayRef<int64_t> getTensorDescShape() {
+      return getType().getShape();
+    }
+
+    OperandRange getSizes() {
+      return getShape();
+    }
+
+    SmallVector<int64_t> getStaticSizes() {
+      if (auto ty = getSourceType().dyn_cast<MemRefType>()) {
+        return SmallVector<int64_t>(ty.getShape());
+      }
+    }
+
+    SmallVector<int64_t> getStaticStrides() {
+      if (auto ty = getSourceType().dyn_cast<MemRefType>()) {
+        auto [strides, offset] = getStridesAndOffset(ty);
+        return strides;
+      }
+    }
+
     /// Returns the offsets info to the source. It consolidates
     /// information from both dynamic_offsets and static_offsets
     /// parameters. static_offsets parameter always has the expected
@@ -142,30 +168,13 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure,
     /// strides information from  memref type will be ignored.
     llvm::SmallVector<OpFoldResult> getEffectiveStrides();
 
-    /// Return the element type of the TensorDesc
-    Type getElementType() {
-      return getType().getElementType();
-    }
 
-    /// Return the shape of the TensorDesc
-    llvm::ArrayRef<int64_t> getTensorDescShape() {
-      return getType().getShape();
-    }
-
-    /// Return the expected rank of each of the `static_offsets`, `static_sizes`
-    /// and `static_strides` attributes.
+    /// Return the expected rank of each of the `static_offsets`,
+    /// `static_sizes` and `static_strides` attributes.
     std::array<unsigned, 3> getArrayAttrMaxRanks() {
       return {2, 2, 2};
     }
-
-    mlir::OperandRange getSizes() {
-      return getShape();
-    }
     
-    llvm::ArrayRef<int64_t> getStaticSizes() {
-      return getStaticShape();
-    }
-
     /// Return the number of leading operands before the `offsets`, `sizes` and
     /// `strides` operands.
     static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 34d1a90a50a488..5a2bc2d72bfd8c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -53,69 +53,69 @@ static std::string makeString(T array, bool breakline = false) {
 //===----------------------------------------------------------------------===//
 // XeGPU_CreateNdDescOp
 //===----------------------------------------------------------------------===//
-void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
-                           Type TensorDesc, Value source, ValueRange offsets,
-                           ValueRange shape, ValueRange strides,
-                           llvm::ArrayRef<int64_t> static_offsets) {
-  auto offsetRank = static_offsets.size();
-  auto shapeRank = shape.size() ? shape.size() : getRankOf(source);
-
-  size_t dynOffsetRank =
-      std::count_if(static_offsets.begin(), static_offsets.end(),
-                    [](int64_t d) { return ShapedType::isDynamic(d); });
-
-  // shape and strides should exists at the same time
-  // and the final rank for shape and offset (dynamic + static)
-  // should be the same
-  assert(shape.size() == strides.size() && shapeRank == offsetRank &&
-         offsets.size() == dynOffsetRank);
-
-  state.addOperands(source);
-  state.addOperands(offsets);
-  state.addOperands(shape);
-  state.addOperands(strides);
-  state.addAttribute(
-      getOperandSegmentSizesAttrName(state.name),
-      builder.getDenseI32ArrayAttr({1, static_cast<int32_t>(offsets.size()),
-                                    static_cast<int32_t>(shape.size()),
-                                    static_cast<int32_t>(strides.size())}));
-  state.addAttribute(getStaticOffsetsAttrName(state.name),
-                     builder.getDenseI64ArrayAttr(static_offsets));
-  state.addTypes(TensorDesc);
-}
-
-void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
-                           Type tdesc, Value source,
-                           llvm::ArrayRef<OpFoldResult> offsets) {
-  auto ty = llvm::dyn_cast_if_present<MemRefType>(source.getType());
-  assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source));
-
-  llvm::SmallVector<int64_t> staticOffsets;
-  llvm::SmallVector<Value> dynamicOffsets;
-  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
-
-  build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */,
-        ValueRange({}) /* empty dynamic shape */,
-        ValueRange({}) /* empty dynamic strides */,
-        staticOffsets /* static offsets */);
-}
-
-void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
-                           Type tdesc, Value source,
-                           llvm::ArrayRef<OpFoldResult> offsets,
-                           ValueRange shape, ValueRange stride) {
-  assert(shape.size() && offsets.size() && stride.size() &&
-         shape.size() == stride.size() && shape.size() == offsets.size());
-
-  llvm::SmallVector<int64_t> staticOffsets;
-  llvm::SmallVector<Value> dynamicOffsets;
-
-  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
-
-  build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets,
-        /* dynamic shape = */ shape , /* dynamic strides = */ stride,
-        /* static offsets = */ staticOffsets);
-}
+// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+//                            Type TensorDesc, Value source, ValueRange offsets,
+//                            ValueRange shape, ValueRange strides,
+//                            llvm::ArrayRef<int64_t> static_offsets) {
+//   auto offsetRank = static_offsets.size();
+//   auto shapeRank = shape.size() ? shape.size() : getRankOf(source);
+
+//   size_t dynOffsetRank =
+//       std::count_if(static_offsets.begin(), static_offsets.end(),
+//                     [](int64_t d) { return ShapedType::isDynamic(d); });
+
+//   // shape and strides should exists at the same time
+//   // and the final rank for shape and offset (dynamic + static)
+//   // should be the same
+//   assert(shape.size() == strides.size() && shapeRank == offsetRank &&
+//          offsets.size() == dynOffsetRank);
+
+//   state.addOperands(source);
+//   state.addOperands(offsets);
+//   state.addOperands(shape);
+//   state.addOperands(strides);
+//   state.addAttribute(
+//       getOperandSegmentSizesAttrName(state.name),
+//       builder.getDenseI32ArrayAttr({1, static_cast<int32_t>(offsets.size()),
+//                                     static_cast<int32_t>(shape.size()),
+//                                     static_cast<int32_t>(strides.size())}));
+//   state.addAttribute(getStaticOffsetsAttrName(state.name),
+//                      builder.getDenseI64ArrayAttr(static_offsets));
+//   state.addTypes(TensorDesc);
+// }
+
+// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+//                            Type tdesc, Value source,
+//                            llvm::ArrayRef<OpFoldResult> offsets) {
+//   auto ty = llvm::dyn_cast_if_present<MemRefType>(source.getType());
+//   assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source));
+
+//   llvm::SmallVector<int64_t> staticOffsets;
+//   llvm::SmallVector<Value> dynamicOffsets;
+//   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+//   build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */,
+//         ValueRange({}) /* empty dynamic shape */,
+//         ValueRange({}) /* empty dynamic strides */,
+//         staticOffsets /* static offsets */);
+// }
+
+// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+//                            Type tdesc, Value source,
+//                            llvm::ArrayRef<OpFoldResult> offsets,
+//                            ValueRange shape, ValueRange stride) {
+//   assert(shape.size() && offsets.size() && stride.size() &&
+//          shape.size() == stride.size() && shape.size() == offsets.size());
+
+//   llvm::SmallVector<int64_t> staticOffsets;
+//   llvm::SmallVector<Value> dynamicOffsets;
+
+//   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+//   build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets,
+//         /* dynamic shape = */ shape , /* dynamic strides = */ stride,
+//         /* static offsets = */ staticOffsets);
+// }
 
 
 LogicalResult CreateNdDescOp::verify() {

>From b40a514960f81d88bbecf0090c9616ed9d098789 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 11 Mar 2024 20:01:00 -0500
Subject: [PATCH 09/19] clean up code

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 46 ++++------
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 83 -------------------
 2 files changed, 16 insertions(+), 113 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 447ea2e0f3982b..c961bce4e51094 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -123,56 +123,42 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure,
       return getType().getElementType();
     }
 
-
-
     /// Return the shape of the TensorDesc
     llvm::ArrayRef<int64_t> getTensorDescShape() {
       return getType().getShape();
     }
 
+    /// wrapper for matching with OffsetSizeAndStrideOpInterface
     OperandRange getSizes() {
       return getShape();
     }
 
+    /// wrapper for matching with OffsetSizeAndStrideOpInterface
     SmallVector<int64_t> getStaticSizes() {
-      if (auto ty = getSourceType().dyn_cast<MemRefType>()) {
-        return SmallVector<int64_t>(ty.getShape());
+      if (getSourceType().dyn_cast<IntegerType>()) {
+        auto dims = getMixedOffsets().size();
+        return SmallVector<int64_t>(dims, ShapedType::kDynamic);
       }
+      auto memrefType = getSourceType().dyn_cast<MemRefType>();
+      return SmallVector<int64_t>(memrefType.getShape());
     }
 
+    /// wrapper for matching with OffsetSizeAndStrideOpInterface
     SmallVector<int64_t> getStaticStrides() {
-      if (auto ty = getSourceType().dyn_cast<MemRefType>()) {
-        auto [strides, offset] = getStridesAndOffset(ty);
-        return strides;
+      if (getSourceType().dyn_cast<IntegerType>()) {
+        auto dims = getMixedOffsets().size();
+        return SmallVector<int64_t>(dims, ShapedType::kDynamic);
       }
+      auto memrefType = getSourceType().dyn_cast<MemRefType>();
+      auto [strides, offset] = getStridesAndOffset(memrefType);
+      return strides;
     }
 
-    /// Returns the offsets info to the source. It consolidates
-    /// information from both dynamic_offsets and static_offsets
-    /// parameters. static_offsets parameter always has the expected
-    /// rank, where some dims may have the ShapedType::kDynamic value
-    /// indicating the corresponding value should be from dynamic_offsets.
-    llvm::SmallVector<OpFoldResult> getEffectiveOffsets();
-
-    /// returns the shape info of the source. It is either from the
-    /// memref type, if source is a memref with static shape
-    /// information or from the dynamic_shape parameter. If both
-    /// exist, the dynamic_shape parameter will be used and the
-    /// shape information from the memref type will be ignored.
-    llvm::SmallVector<OpFoldResult> getEffectiveShape();
-
-    /// returns the strides info of the source. It is either from the
-    /// memref type, if source is a memref with static shape
-    /// information or from the dynamic_stride parameter. If both
-    /// exist, the dynamic_strides parameter will be used and the
-    /// strides information from the memref type will be ignored.
-    llvm::SmallVector<OpFoldResult> getEffectiveStrides();
-
-
     /// Return the expected rank of each of the `static_offsets`,
     /// `static_sizes` and `static_strides` attributes.
     std::array<unsigned, 3> getArrayAttrMaxRanks() {
-      return {2, 2, 2};
+      auto rank = getMixedOffsets().size();
+      return {rank, rank, rank};
     }
     
     /// Return the number of leading operands before the `offsets`, `sizes` and
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 5a2bc2d72bfd8c..0c9ab064d62b29 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -124,11 +124,6 @@ LogicalResult CreateNdDescOp::verify() {
   // auto stridesRank = getEffectiveStrides().size();
   // auto baseRank = getRankOf(getSource()) ? getRankOf(getSource()) : 2;
 
-  llvm::dbgs() << "\nNum of mixed Offsets: " << getMixedOffsets().size()
-               << "\nNum of mixed Sizes: " << getMixedSizes().size()
-               << "\nNum of mixed Strides: " << getMixedStrides().size()
-               << "\n";
-
   // if (offsetRank != shapeRank || shapeRank != stridesRank ||
   //     shapeRank != baseRank)
 
@@ -138,84 +133,6 @@ LogicalResult CreateNdDescOp::verify() {
   return success();
 }
 
-// compute consolidated offsets from dynamic_offsets and static_offsets
-// parameters
-llvm::SmallVector<OpFoldResult> CreateNdDescOp::getEffectiveOffsets() {
-  llvm::SmallVector<OpFoldResult> offsets;
-  auto dynamicOffsets = getOffsets(); // offsets variable
-  auto staticOffsets = getStaticOffsets();   // static_offsets attribute
-
-  // in case static_offsets is missing, dynamic_offsets will be used
-  if (staticOffsets.size() == 0) {
-    offsets.assign(dynamicOffsets.begin(), dynamicOffsets.end());
-    return offsets;
-  }
-
-  // use static offsets for each dim if it has a valid value,
-  // otherwise use the value from dynamic_offsets
-  for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) {
-    if (ShapedType::isDynamic(staticOffsets[i])) {
-      assert(j < dynamicOffsets.size());
-      offsets.push_back(dynamicOffsets[j++]);
-    } else {
-      auto ty = IndexType::get(getContext());
-      auto attr = IntegerAttr::get(ty, staticOffsets[i]);
-      offsets.push_back(attr);
-    }
-  }
-  return offsets;
-}
-
-// get the consolidated shape of the 2D memory region. 
-// It prefers dynamic_shape over the static shape of
-// memref type.
-llvm::SmallVector<OpFoldResult> CreateNdDescOp::getEffectiveShape() {
-  llvm::SmallVector<OpFoldResult> shape;
-  auto dynShape = getShape();
-  if (dynShape.size()) {
-    shape.append(dynShape.begin(), dynShape.end());
-    return shape;
-  }
-
-  auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
-  if (ty && ty.hasStaticShape()) {
-    for (auto dim : ty.getShape()) {
-      auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
-      shape.push_back(attr);
-    }
-    return shape;
-  }
-  
-  this->emitError("The shape information of the memory is missing.\n");
-  return {};
-}
-
-// get the consolidated strides of the 2D memory region. 
-// It prefers dynamic_stride over the static strides of
-// memref type.
-llvm::SmallVector<OpFoldResult> CreateNdDescOp::getEffectiveStrides() {
-  llvm::SmallVector<OpFoldResult> strides;
-
-  auto dynStrides = getStrides();
-  if (dynStrides.size()) {
-    strides.append(dynStrides.begin(), dynStrides.end());
-    return strides;
-  }
-
-  auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
-  if (ty && ty.hasStaticShape()) {
-    auto [staticStrides, offset] = getStridesAndOffset(ty);
-    for (auto dim : staticStrides) {
-      auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
-      strides.push_back(attr);
-    }
-    return strides;
-  }
-
-  this->emitError("The strides information of the memory is missing.\n");
-  return {};
-}
-
 //===----------------------------------------------------------------------===//
 // XeGPU_LoadNDOp
 //===----------------------------------------------------------------------===//

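With the custom getEffective* helpers gone, the size/stride resolution rule
now lives entirely in the getStaticSizes()/getStaticStrides() wrappers shown
in the .td above: an IntegerType (raw address) source carries no static
layout, so every dimension is reported as ShapedType::kDynamic, while a
memref source contributes the static shape and strides of its type. The same
rule as a self-contained sketch (staticSizesFor is an illustrative name; rank
would be 2 for the current descriptors):

  #include "mlir/IR/BuiltinTypes.h"

  static llvm::SmallVector<int64_t> staticSizesFor(mlir::Type sourceTy,
                                                   unsigned rank) {
    // Raw ui64-style sources: sizes must come from the `shape` operands.
    if (mlir::isa<mlir::IntegerType>(sourceTy))
      return llvm::SmallVector<int64_t>(rank, mlir::ShapedType::kDynamic);
    // memref sources: the type itself carries the static shape.
    auto memrefTy = mlir::cast<mlir::MemRefType>(sourceTy);
    return llvm::SmallVector<int64_t>(memrefTy.getShape());
  }
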
>From b050207d16e0b90c87ed9fd668a04a2454a2e7af Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 12 Mar 2024 09:23:00 -0500
Subject: [PATCH 10/19] fix typos and improve CreateNdDescOp::verifier

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |  47 +++---
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |   6 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 136 ++++++------------
 3 files changed, 76 insertions(+), 113 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index c961bce4e51094..5d0d6f359292d9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -32,7 +32,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure,
   let summary = "create nd tensor descriptor operation";
   let description = [{
     The "create_nd_tdesc" operation creates a TensorDescType which represents
-    a sub-view of a 2D memory region (It can be extended to support N-D memory
+    a sub-view of a 2D memory region (it can be extended to support n-D memory
     regions if needed in the future). Elements in the subview are contiguous in
     each dimension. It encodes the following important information for supporting
     Intel hardware features:
@@ -94,18 +94,14 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure,
 
   let hasVerifier = 1;
 
-//  let builders = [
-//    OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, 
-//                   "ValueRange": $shape, "ValueRange": $strides, 
-//                   "llvm::ArrayRef<int64_t>": $static_offsets)>,
-//
-//    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
-//                   "llvm::ArrayRef<OpFoldResult>": $offsets)>,
-//
-//    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
-//                   "llvm::ArrayRef<OpFoldResult>": $offsets,
-//                   "ValueRange": $shape, "ValueRange": $stride)>
-//  ];
+  let builders = [
+    OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source, 
+                   "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+
+    OpBuilder<(ins "Type": $tdesc, "TypedValue<IntegerType> ": $source, 
+                   "llvm::ArrayRef<OpFoldResult>": $offsets,
+                   "ValueRange": $shape, "ValueRange": $stride)>
+  ];
 
   let extraClassDeclaration = [{
     /// Returns the type of the source memref operand.
@@ -134,6 +130,9 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure,
     }
 
     /// wrapper for matching with OffsetSizeAndStrideOpInterface
+    /// If the source is an integer, it simply returns an array of
+    /// ShapedType::kDynamic, meaning the dynamic shape encoded
+    /// in the `shape` argument will be used.
     SmallVector<int64_t> getStaticSizes() {
       if (getSourceType().dyn_cast<IntegerType>()) {
         auto dims = getMixedOffsets().size();
@@ -144,6 +143,9 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure,
     }
 
     /// wrapper for matching with OffsetSizeAndStrideOpInterface
+    /// If the source is an integer, it simply returns an array of
+    /// ShapedType::kDynamic, meaning the dynamic strides encoded
+    /// in the `strides` argument will be used.
     SmallVector<int64_t> getStaticStrides() {
       if (getSourceType().dyn_cast<IntegerType>()) {
         auto dims = getMixedOffsets().size();
@@ -155,14 +157,19 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure,
     }
 
     /// Return the expected rank of each of the `static_offsets`,
-    /// `static_sizes` and `static_strides` attributes.
+    /// `static_shape` and `static_strides` attributes.
     std::array<unsigned, 3> getArrayAttrMaxRanks() {
-      auto rank = getMixedOffsets().size();
+      unsigned rank;
+      if (auto ty = getSourceType().dyn_cast<MemRefType>()) {
+        rank = ty.getRank();
+      } else {
+        rank = (unsigned)getMixedOffsets().size();
+      }
       return {rank, rank, rank};
     }
     
-    /// Return the number of leading operands before the `offsets`, `sizes` and
-    /// `strides` operands.
+    /// Return the number of leading operands before the `offsets`, 
+    /// `shape` and `strides` operands.
     static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
   }];
 }
@@ -182,11 +189,11 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
 }
 
 
-def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
+def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
   let summary = "loads a n-D block from memory (represented by TensorDesc)" 
                 "to registers (represented by vector)";
   let description = [{
-    LoadNDOp essentially mimics the hardware block read instruction to read 
+    LoadNdOp essentially mimics the hardware block read instruction to read 
     a block of data from memory to register. It takes a set of cache hints 
     for each level of cache, L1, L2 and L3. If the hardware does not have a
     corresponding cache, the corresponding cache hint attribute will be masked.
@@ -222,7 +229,7 @@ def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
   let hasVerifier = 1;
 }
 
-def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> {
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
   let summary = "stores a n-D block register region back to memory, currently only supports 2D";
   let arguments = (ins XeGPU_ValueType: $value,
                        XeGPU_TensorDesc: $TensorDesc,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 36b04ea12bcad0..8734c1c364e572 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -41,8 +41,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     It encodes the following information:
 
     * shape:  the sizes/shape of the interested data block, e.g., 8x16 means 8 rows
-              and each row contains 16 continious data element. The rows could be
-              either continuous or not, depends on whether the encoding attribute
+              and each row contains 16 contiguous data elements. The rows could be
+              either contiguous or not, depending on whether the encoding attribute
               is set or not.
     * element_type: the data type of the data element, e.g., f16, f32.
 
@@ -50,7 +50,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     the following information via the TensorDescAttr object:
     * memory_scope (xegpu::MemoryScope): [optional] where the data is located, 
                 global memory or shared memory. It defaults to Global.
-    * array_length (int): [optional] The number of continuous blocks with size as `shape`,
+    * array_length (int): [optional] The number of contiguous blocks of size `shape`,
                that will be loaded by block load at a time. It defaults to 1.
     * boundary_check (bool): [optional] indicates whether the operation detects the boundary
                 and pads with zero for out-of-boundary access. It defaults to true.
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 0c9ab064d62b29..7b8f853827e41c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -10,23 +10,11 @@
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h>
 #include <mlir/IR/Builders.h>
 
-#include <llvm/Support/Debug.h>
-
 #define DEBUG_TYPE "xegpu"
 
 namespace mlir {
 namespace xegpu {
 
-static size_t getRankOf(Value value) {
-  if (value.getType().isIntOrIndexOrFloat())
-    return 0;
-  if (auto ty = llvm::dyn_cast_if_present<MemRefType>(value.getType()))
-    return ty.getRank();
-  if (auto ty = llvm::dyn_cast_if_present<VectorType>(value.getType()))
-    return ty.getRank();
-  llvm_unreachable("Unsupported value for getRankOf");
-}
-
 static void transpose(llvm::ArrayRef<int64_t> trans,
                       std::vector<int64_t> &shape) {
   std::vector<int64_t> old = shape;
@@ -53,96 +41,64 @@ static std::string makeString(T array, bool breakline = false) {
 //===----------------------------------------------------------------------===//
 // XeGPU_CreateNdDescOp
 //===----------------------------------------------------------------------===//
-// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
-//                            Type TensorDesc, Value source, ValueRange offsets,
-//                            ValueRange shape, ValueRange strides,
-//                            llvm::ArrayRef<int64_t> static_offsets) {
-//   auto offsetRank = static_offsets.size();
-//   auto shapeRank = shape.size() ? shape.size() : getRankOf(source);
-
-//   size_t dynOffsetRank =
-//       std::count_if(static_offsets.begin(), static_offsets.end(),
-//                     [](int64_t d) { return ShapedType::isDynamic(d); });
-
-//   // shape and strides should exists at the same time
-//   // and the final rank for shape and offset (dynamic + static)
-//   // should be the same
-//   assert(shape.size() == strides.size() && shapeRank == offsetRank &&
-//          offsets.size() == dynOffsetRank);
-
-//   state.addOperands(source);
-//   state.addOperands(offsets);
-//   state.addOperands(shape);
-//   state.addOperands(strides);
-//   state.addAttribute(
-//       getOperandSegmentSizesAttrName(state.name),
-//       builder.getDenseI32ArrayAttr({1, static_cast<int32_t>(offsets.size()),
-//                                     static_cast<int32_t>(shape.size()),
-//                                     static_cast<int32_t>(strides.size())}));
-//   state.addAttribute(getStaticOffsetsAttrName(state.name),
-//                      builder.getDenseI64ArrayAttr(static_offsets));
-//   state.addTypes(TensorDesc);
-// }
-
-// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
-//                            Type tdesc, Value source,
-//                            llvm::ArrayRef<OpFoldResult> offsets) {
-//   auto ty = llvm::dyn_cast_if_present<MemRefType>(source.getType());
-//   assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source));
-
-//   llvm::SmallVector<int64_t> staticOffsets;
-//   llvm::SmallVector<Value> dynamicOffsets;
-//   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
-
-//   build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */,
-//         ValueRange({}) /* empty dynamic shape */,
-//         ValueRange({}) /* empty dynamic strides */,
-//         staticOffsets /* static offsets */);
-// }
-
-// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
-//                            Type tdesc, Value source,
-//                            llvm::ArrayRef<OpFoldResult> offsets,
-//                            ValueRange shape, ValueRange stride) {
-//   assert(shape.size() && offsets.size() && stride.size() &&
-//          shape.size() == stride.size() && shape.size() == offsets.size());
-
-//   llvm::SmallVector<int64_t> staticOffsets;
-//   llvm::SmallVector<Value> dynamicOffsets;
-
-//   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
-
-//   build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets,
-//         /* dynamic shape = */ shape , /* dynamic strides = */ stride,
-//         /* static offsets = */ staticOffsets);
-// }
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type tdesc, TypedValue<MemRefType> source,
+                           llvm::ArrayRef<OpFoldResult> offsets) {
+  auto ty = llvm::dyn_cast_if_present<MemRefType>(source.getType());
+  assert(ty && ty.hasStaticShape() && offsets.size() == ty.getRank());
+
+  llvm::SmallVector<int64_t> staticOffsets;
+  llvm::SmallVector<Value> dynamicOffsets;
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+  build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */,
+        ValueRange({}) /* empty dynamic shape */,
+        ValueRange({}) /* empty dynamic strides */,
+        staticOffsets /* static offsets */);
+}
 
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type tdesc, TypedValue<IntegerType> source,
+                           llvm::ArrayRef<OpFoldResult> offsets,
+                           ValueRange shape, ValueRange stride) {
+  assert(shape.size() && offsets.size() && stride.size() &&
+         shape.size() == stride.size() && shape.size() == offsets.size());
 
-LogicalResult CreateNdDescOp::verify() {
-  // auto offsetRank = getEffectiveOffsets().size();
-  // auto shapeRank = getEffectiveShape().size();
-  // auto stridesRank = getEffectiveStrides().size();
-  // auto baseRank = getRankOf(getSource()) ? getRankOf(getSource()) : 2;
+  llvm::SmallVector<int64_t> staticOffsets;
+  llvm::SmallVector<Value> dynamicOffsets;
 
-  // if (offsetRank != shapeRank || shapeRank != stridesRank ||
-  //     shapeRank != baseRank)
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
 
-  //   return emitOpError(
-  //       "Expecting the rank of shape, strides, offsets and memref type "
-  //       "should match with each other (they currently should be 2D).");
+  build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets,
+        /* dynamic shape = */ shape , /* dynamic strides = */ stride,
+        /* static offsets = */ staticOffsets);
+}
+
+LogicalResult CreateNdDescOp::verify() {
+  auto rank = getMixedOffsets().size();
+  bool invalid = (rank != 2);
+  auto memrefTy = getSourceType().dyn_cast<MemRefType>();
+  if (memrefTy) {
+    invalid |= (memrefTy.getRank() != rank);
+  }
+  if (invalid) {
+    return emitOpError("Expecting the rank of shape, strides, offsets and "
+                       "memref type (if source is a memref) should match "
+                       "with each other. They currenlty are 2D.");
+  }
   return success();
 }
 
 //===----------------------------------------------------------------------===//
-// XeGPU_LoadNDOp
+// XeGPU_LoadNdOp
 //===----------------------------------------------------------------------===//
-LogicalResult LoadNDOp::verify() {
+LogicalResult LoadNdOp::verify() {
   auto tdescTy = getTensorDescType();
   auto valueTy = getType();
 
   if (tdescTy.getRank() != 2)
     return emitOpError(
-        "The TensorDesc for LoadNDOp should be a 2D TensorDesc.");
+        "The TensorDesc for LoadNdOp should be a 2D TensorDesc.");
 
   if (!valueTy)
     return emitOpError("Invalid result, it should be a VectorType.\n");
@@ -186,9 +142,9 @@ LogicalResult LoadNDOp::verify() {
 }
 
 //===----------------------------------------------------------------------===//
-// XeGPU_StoreNDOp
+// XeGPU_StoreNdOp
 //===----------------------------------------------------------------------===//
-LogicalResult StoreNDOp::verify() {
+LogicalResult StoreNdOp::verify() {
   auto dstTy = getTensorDesc().getType();               // Tile
   auto valTy = getValue().getType().cast<VectorType>(); // Vector
 

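For downstream users, the two typed builders restored here cover the common
construction paths. A hypothetical usage sketch (builder, loc, and descTy are
assumed to exist; src is a TypedValue<MemRefType>, addr a
TypedValue<IntegerType>, and h, w, c1 are index Values, mirroring the ui64
test case earlier in the series):

  llvm::SmallVector<mlir::OpFoldResult> offsets = {builder.getIndexAttr(0),
                                                   builder.getIndexAttr(0)};

  // Statically shaped memref source: offsets only; shape and strides are
  // taken from the memref type.
  auto desc0 = builder.create<mlir::xegpu::CreateNdDescOp>(
      loc, descTy, src, offsets);

  // Raw-address source: shape and strides must be explicit SSA values,
  // here an h x w row-major region with strides [w, 1].
  auto desc1 = builder.create<mlir::xegpu::CreateNdDescOp>(
      loc, descTy, addr, offsets,
      /*shape=*/mlir::ValueRange({h, w}),
      /*strides=*/mlir::ValueRange({w, c1}));
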
>From 2ca12a7bb74f03030aad0dff2f37469110786b6e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 12 Mar 2024 09:29:49 -0500
Subject: [PATCH 11/19] code format

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h |  1 -
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp |  5 ++--
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp     | 27 +++++++++++++---------
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 662fd7ef197414..87aabdc015fea5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -16,7 +16,6 @@
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
 
-
 namespace mlir {
 namespace xegpu {
 // placeholder
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index bd72d5c17b6ea1..43337a6ab43dcd 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -29,7 +29,6 @@ void XeGPUDialect::initialize() {
       >();
 }
 
-
 //===----------------------------------------------------------------------===//
 // XeGPU_TensorDescAttr
 //===----------------------------------------------------------------------===//
@@ -62,7 +61,8 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
   if (mlir::succeeded(parser.parseOptionalComma())) {
     encoding = mlir::FieldParser<mlir::Attribute>::parse(parser);
     if (mlir::failed(encoding)) {
-      parser.emitError(parser.getCurrentLocation(),
+      parser.emitError(
+          parser.getCurrentLocation(),
           "Failed to parse the attribute field for TensorDescType.\n");
       return {};
     }
@@ -96,7 +96,6 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const {
   printer << ">";
 }
 
-
 } // namespace xegpu
 } // namespace mlir
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 7b8f853827e41c..ee57f7a4f748ae 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -70,21 +70,24 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
 
   build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets,
-        /* dynamic shape = */ shape , /* dynamic strides = */ stride,
+        /* dynamic shape = */ shape, /* dynamic strides = */ stride,
         /* static offsets = */ staticOffsets);
 }
 
 LogicalResult CreateNdDescOp::verify() {
   auto rank = getMixedOffsets().size();
   bool invalid = (rank != 2);
+
   auto memrefTy = getSourceType().dyn_cast<MemRefType>();
-  if (memrefTy) {
+  if (memrefTy)
     invalid |= (memrefTy.getRank() != rank);
-  }
+
+  invalid |= (getTensorDescType().getRank() != rank);
+
   if (invalid) {
-    return emitOpError("Expecting the rank of shape, strides, offsets and "
-                       "memref type (if source is a memref) should match "
-                       "with each other. They currenlty are 2D.");
+    return emitOpError("Expecting the rank of shape, strides, offsets, "
+            "source memref type (if source is a memref) and TensorDesc "
+            "should match with each other. They currenlty are 2D.");
   }
   return success();
 }
@@ -135,9 +138,10 @@ LogicalResult LoadNdOp::verify() {
   }
 
   if (tdescShape != valueShape)
-    return emitOpError() <<"Result shape doesn't match TensorDesc shape."
-           << "The expected shape is " << makeString(tdescShape) << ". "
-           << "But the given shape is " << makeString(valueShape) << ".\n";
+    return emitOpError() << "Result shape doesn't match TensorDesc shape."
+                         << "The expected shape is " << makeString(tdescShape)
+                         << ". But the given shape is " << makeString(valueShape)
+                         << ".\n";
   return success();
 }
 
@@ -159,11 +163,12 @@ LogicalResult StoreNdOp::verify() {
 
   if (dstElemTy != valElemTy) {
     return emitOpError() << "The element type of the value should "
-                       "match the elementtype of the TensorDesc.\n";
+                            "match the elementtype of the TensorDesc.\n";
   }
 
   if (dstTy.getShape() != valTy.getShape())
-    return emitOpError() << "The result shape should match the TensorDesc shape.\n";
+    return emitOpError()
+           << "The result shape should match the TensorDesc shape.\n";
   return success();
 }
 

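One detail worth calling out in the verifier above: the checks accumulate
into a single flag with |=, so a failure from an earlier check is never
clobbered by a later assignment. As a minimal standalone illustration of the
pattern (the names and the fixed 2D rank are hypothetical, not PR code):

  // Sticky failure flag: once any rank check fails, it stays failed.
  static bool ranksConsistent(int64_t offsetRank, int64_t sourceRank,
                              int64_t tdescRank) {
    bool invalid = (offsetRank != 2);      // descriptors are currently 2D
    invalid |= (sourceRank != offsetRank); // source rank must match
    invalid |= (tdescRank != offsetRank);  // TensorDesc rank must match
    return !invalid;
  }
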
>From 9039b5fc36daaedfa2db8160486cb5d6bd795036 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 12 Mar 2024 09:31:00 -0500
Subject: [PATCH 12/19] code format

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index ee57f7a4f748ae..a21c7607c73b5d 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -85,9 +85,10 @@ LogicalResult CreateNdDescOp::verify() {
   invalid |= (getTensorDescType().getRank() != rank);
 
   if (invalid) {
-    return emitOpError("Expecting the rank of shape, strides, offsets, "
-            "source memref type (if source is a memref) and TensorDesc "
-            "should match with each other. They currenlty are 2D.");
+    return emitOpError(
+        "Expecting the rank of shape, strides, offsets, "
+        "source memref type (if source is a memref) and TensorDesc "
+        "should match with each other. They currenlty are 2D.");
   }
   return success();
 }
@@ -140,8 +141,8 @@ LogicalResult LoadNdOp::verify() {
   if (tdescShape != valueShape)
     return emitOpError() << "Result shape doesn't match TensorDesc shape."
                          << "The expected shape is " << makeString(tdescShape)
-                         << ". But the given shape is " << makeString(valueShape)
-                         << ".\n";
+                         << ". But the given shape is "
+                         << makeString(valueShape) << ".\n";
   return success();
 }
 

>From 632637eda0688ba31a2892f3df5fbc4893467c62 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 12 Mar 2024 10:29:58 -0500
Subject: [PATCH 13/19] sync viewlikeOpInterface and some updates

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 24 +++++++++++--------
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 23 +++++++++++++-----
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 5d0d6f359292d9..24fae9596994e3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -26,8 +26,8 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
           Op<XeGPU_Dialect, mnemonic, traits>;
 
 
-def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, 
-  AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface, 
+                        AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
 
   let summary = "create nd tensor descriptor operation";
   let description = [{
@@ -130,11 +130,12 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure,
     }
 
     /// wrapper for matching with OffsetSizeAndStrideOpInterface
-    /// If source is integer it will simply return an array of 
-    /// ShapedType::kDynamic representing dynamic shape encoded 
-    /// in the `shape` argument will be used.
+    /// If the source is an IntegerType or `shape` is non-empty, it
+    /// returns an array of ShapedType::kDynamic, meaning the dynamic
+    /// shape encoded in the `shape` argument will be used. The presence
+    /// of `shape` overrides the static shape from the source memref type.
     SmallVector<int64_t> getStaticSizes() {
-      if (getSourceType().dyn_cast<IntegerType>()) {
+      if (getSourceType().isa<IntegerType>() || getShape().size()) {
         auto dims = getMixedOffsets().size();
         return SmallVector<int64_t>(dims, ShapedType::kDynamic);
       }
@@ -143,11 +144,12 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure,
     }
 
     /// wrapper for matching with OffsetSizeAndStrideOpInterface
-    /// If source is integer it will simply return an array of 
-    /// ShapedType::kDynamic representing dynamic strides encoded 
-    /// in the `strides` argument will be used.
+    /// If the source is an IntegerType or `strides` is non-empty, it
+    /// returns an array of ShapedType::kDynamic, meaning the dynamic
+    /// strides encoded in the `strides` argument will be used. The presence
+    /// of `strides` overrides the static strides from the source memref type.
     SmallVector<int64_t> getStaticStrides() {
-      if (getSourceType().dyn_cast<IntegerType>()) {
+      if (getSourceType().isa<IntegerType>() || getStrides().size()) {
         auto dims = getMixedOffsets().size();
         return SmallVector<int64_t>(dims, ShapedType::kDynamic);
       }
@@ -171,6 +173,8 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure,
     /// Return the number of leading operands before the `offsets`, 
     /// `shape` and `strides` operands.
     static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
+
+    mlir::Value getViewSource() { return getSource(); }
   }];
 }
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index a21c7607c73b5d..5adf1c2a6b9849 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -76,20 +76,31 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
 
 LogicalResult CreateNdDescOp::verify() {
   auto rank = getMixedOffsets().size();
-  bool invalid = (rank != 2);
+  bool invalidRank = (rank != 2);
+  bool invalidElemTy = false;
 
+  // check source type matches the rank if it is a memref
   auto memrefTy = getSourceType().dyn_cast<MemRefType>();
-  if (memrefTy)
-    invalid |= (memrefTy.getRank() != rank);
+  if (memrefTy) {
+    invalidRank |= (memrefTy.getRank() != rank);
+    // TensorDesc should have the same element type with memref.
+    invalidElemTy != memrefTy.getElementType() != getElementType(); 
+  }
 
-  invalid = (getTensorDescType().getRank() != rank);
+  // check result type matches the rank
+  invalidRank = (getType().getRank() != rank);
 
-  if (invalid) {
+  if (invalidRank)
     return emitOpError(
         "Expecting the rank of shape, strides, offsets, "
         "source memref type (if source is a memref) and TensorDesc "
         "should match with each other. They currenlty are 2D.");
-  }
+
+  if (invalidElemTy)
+    return emitOpError("TensorDesc should have the same element "
+                       "type with the source if it is a memref.\n");
+
+
   return success();
 }
 

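A quick illustration of the `getStaticSizes`/`getStaticStrides` behavior above, sketched in XeGPU IR (the SSA names, shapes, and strides are invented for illustration and assume the assembly format declared in XeGPUOps.td):

```
// memref source with a static shape: no explicit `shape`/`strides`
// operands, so the static sizes/strides are taken from the memref type.
%0 = xegpu.create_nd_tdesc %src[0, 0]
    : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>

// integer (raw address) source: `shape` and `strides` must be supplied
// explicitly, so getStaticSizes()/getStaticStrides() report
// ShapedType::kDynamic for every dimension.
%1 = xegpu.create_nd_tdesc %addr[%x, %y], [%h, %w], [%w, %c1]
    : ui64 -> !xegpu.tensor_desc<8x16xf16>
```

Whether `ui64` is accepted as a base address depends on the XeGPU_BaseAddrType constraint, which is not shown in this patch.
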
>From 447d623cf501fd19f6f9b9188ee65eca0b9a6dc0 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 12 Mar 2024 15:46:30 +0000
Subject: [PATCH 14/19] fix a typo

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 5adf1c2a6b9849..077877cb444b47 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -44,8 +44,8 @@ static std::string makeString(T array, bool breakline = false) {
 void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
                            Type tdesc, TypedValue<MemRefType> source,
                            llvm::ArrayRef<OpFoldResult> offsets) {
-  auto ty = llvm::dyn_cast_if_present<MemRefType>(source.getType());
-  assert(ty && ty.hasStaticShape() && offsets.size() == ty.getRank());
+  auto ty = source.getType();
+  assert(ty && ty.hasStaticShape() && offsets.size() == (size_t)ty.getRank());
 
   llvm::SmallVector<int64_t> staticOffsets;
   llvm::SmallVector<Value> dynamicOffsets;
@@ -75,21 +75,24 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
 }
 
 LogicalResult CreateNdDescOp::verify() {
-  auto rank = getMixedOffsets().size();
+  auto rank = (int64_t)getMixedOffsets().size();
   bool invalidRank = (rank != 2);
   bool invalidElemTy = false;
 
-  // check source type matches the rank if it is a memref
+  // check source type matches the rank if it is a memref.
+  // It should also have the same ElementType as the TensorDesc.
   auto memrefTy = getSourceType().dyn_cast<MemRefType>();
   if (memrefTy) {
     invalidRank |= (memrefTy.getRank() != rank);
-    // TensorDesc should have the same element type with memref.
-    invalidElemTy != memrefTy.getElementType() != getElementType(); 
+    invalidElemTy |= memrefTy.getElementType() != getElementType(); 
   }
 
   // check result type matches the rank
   invalidRank = (getType().getRank() != rank);
 
+  // Mismatches among shape, strides, and offsets are
+  // already handled by OffsetSizeAndStrideOpInterface,
+  // so they are not checked here.
   if (invalidRank)
     return emitOpError(
         "Expecting the rank of shape, strides, offsets, "

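To make the two checks concrete, here are hypothetical inputs the verifier now rejects (invented examples, not taken from the test suite):

```
// Rejected via invalidElemTy: the TensorDesc element type (f16) does not
// match the source memref element type (f32).
%0 = xegpu.create_nd_tdesc %src[0, 0]
    : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf16>

// Rejected via invalidRank: rank-1 offsets and TensorDesc, while the op
// currently requires everything to be 2D.
%1 = xegpu.create_nd_tdesc %vec[0]
    : memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
```
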
>From 37a348d37dec5a8f3c0d0f85e41cdc7d008b2ece Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 12 Mar 2024 15:47:08 +0000
Subject: [PATCH 15/19] code format

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 077877cb444b47..08723d12c278c8 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -84,7 +84,7 @@ LogicalResult CreateNdDescOp::verify() {
   auto memrefTy = getSourceType().dyn_cast<MemRefType>();
   if (memrefTy) {
     invalidRank |= (memrefTy.getRank() != rank);
-    invalidElemTy |= memrefTy.getElementType() != getElementType(); 
+    invalidElemTy |= memrefTy.getElementType() != getElementType();
   }
 
   // check result type matches the rank
@@ -103,7 +103,6 @@ LogicalResult CreateNdDescOp::verify() {
     return emitOpError("TensorDesc should have the same element "
                        "type with the source if it is a memref.\n");
 
-
   return success();
 }
 

>From ff338280afdd024f7bc95417f947f64266f6b90e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 12 Mar 2024 16:19:14 +0000
Subject: [PATCH 16/19] add ViewLikeOpInterface and
 OffsetSizeAndStrideOpInterface to createNdDescOp

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h    |  11 +-
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 121 ++++++------
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |   8 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 174 ++++--------------
 4 files changed, 113 insertions(+), 201 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 8dc3ff78d25ede..87aabdc015fea5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -9,11 +9,12 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_H
 #define MLIR_DIALECT_XEGPU_IR_XEGPU_H
 
-#include <mlir/Bytecode/BytecodeOpInterface.h>
-#include <mlir/IR/BuiltinTypes.h>
-#include <mlir/IR/Dialect.h>
-#include <mlir/Interfaces/ShapedOpInterfaces.h>
-#include <mlir/Interfaces/SideEffectInterfaces.h>
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/Interfaces/ShapedOpInterfaces.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
 
 namespace mlir {
 namespace xegpu {
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 9d37d77e03a0c5..24fae9596994e3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -9,26 +9,13 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
 #define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
 
+include "mlir/IR/AttrTypeBase.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
-include "mlir/Interfaces/SideEffectInterfaces.td"
-include "mlir/IR/AttrTypeBase.td"
-
-
-include "mlir/IR/OpBase.td"
-include "mlir/IR/OpAsmInterface.td"
-include "mlir/IR/AttrTypeBase.td"
-include "mlir/IR/BuiltinTypes.td"
-include "mlir/IR/BuiltinTypeInterfaces.td"
+include "mlir/Interfaces/ShapedOpInterfaces.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/ViewLikeInterface.td"
-include "mlir/Interfaces/CastInterfaces.td"
-include "mlir/Interfaces/ControlFlowInterfaces.td"
-include "mlir/Interfaces/CopyOpInterface.td"
-include "mlir/Interfaces/InferTypeOpInterface.td"
-include "mlir/Interfaces/ShapedOpInterfaces.td"
-
 
 // Base class for dialect operations. This operation inherits from the base
 // `Op` class in OpBase.td, and provides:
@@ -39,12 +26,13 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
           Op<XeGPU_Dialect, mnemonic, traits>;
 
 
-def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> {
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface, 
+                        AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
 
   let summary = "create nd tensor descriptor operation";
   let description = [{
     The "create_nd_tdesc" operation creates a TensorDescType which represents
-    a sub-view of a 2D memory region (It can be extended to support N-D memory
+    a sub-view of a 2D memory region (It can be extended to support n-D memory
     region if needed in the future). Elements in the subview are contiguous in each 
     dimension. It encodes the following important information for supporting 
     Intel hardware features:
@@ -90,31 +78,27 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg
 
   let arguments = (ins 
     XeGPU_BaseAddrType: $source, 
-    Variadic<Index>: $dynamic_offsets, 
-    Variadic<Index>: $dynamic_shape, 
-    Variadic<Index>: $dynamic_strides,
+    Variadic<Index>: $offsets, 
+    Variadic<Index>: $shape, 
+    Variadic<Index>: $strides,
     DenseI64ArrayAttr: $static_offsets
   );
   let results = (outs XeGPU_TensorDesc: $TensorDesc);
 
   let assemblyFormat = [{
     $source ``
-    custom<DynamicIndexList>($dynamic_offsets, $static_offsets)
-    (`,` `[` $dynamic_shape^ `]` `,` `[` $dynamic_strides `]`)?
+    custom<DynamicIndexList>($offsets, $static_offsets)
+    (`,` `[` $shape^ `]` `,` `[` $strides `]`)?
     attr-dict `:` type($source) `->` qualified(type($TensorDesc))
   }];
-  let skipDefaultBuilders = 1;
+
   let hasVerifier = 1;
 
   let builders = [
-    OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, 
-                   "ValueRange": $shape, "ValueRange": $strides, 
-                   "llvm::ArrayRef<int64_t>": $static_offsets)>,
-
-    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
+    OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source, 
                    "llvm::ArrayRef<OpFoldResult>": $offsets)>,
 
-    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
+    OpBuilder<(ins "Type": $tdesc, "TypedValue<IntegerType> ": $source, 
                    "llvm::ArrayRef<OpFoldResult>": $offsets,
                    "ValueRange": $shape, "ValueRange": $stride)>
   ];
@@ -130,27 +114,6 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg
       return getTensorDesc().getType();
     }
 
-    /// Returns the offsets info to the source. It consolidates
-    /// information from both dynamic_offsets and static_offsets
-    /// parameters. static_offsets parameter always has the expected
-    /// ranks with some dim could have ShapeType::kDynamic value
-    /// indicating the corresponding value should be from dynamic_offsets.
-    llvm::SmallVector<OpFoldResult> getOffsets();
-
-    /// returns the shape info of the source. It is either from the
-    /// memref type, if source is a memref with static shape
-    /// information or from the dynamic_shape parameter. If both
-    /// exists, the dynamic_shape parameter will be used and the
-    /// shape information from  memref type will be ignored.
-    llvm::SmallVector<OpFoldResult> getShape();
-
-    /// returns the strides info of the source. It is either from the
-    /// memref type, if source is a memref with static shape
-    /// information or from the dynamic_stride parameter. If both
-    /// exists, the dynamic_strides parameter will be used and the
-    /// strides information from  memref type will be ignored.
-    llvm::SmallVector<OpFoldResult> getStrides();
-
     /// Return the element type of the TensorDesc
     Type getElementType() {
       return getType().getElementType();
@@ -160,6 +123,58 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg
     llvm::ArrayRef<int64_t> getTensorDescShape() {
       return getType().getShape();
     }
+
+    /// wrapper for matching with OffsetSizeAndStrideOpInterface
+    OperandRange getSizes() {
+      return getShape();
+    }
+
+    /// wrapper for matching with OffsetSizeAndStrideOpInterface
+    /// If the source is an IntegerType or `shape` is non-empty, it
+    /// returns an array of ShapedType::kDynamic, meaning the dynamic
+    /// shape encoded in the `shape` argument will be used. The presence
+    /// of `shape` overrides the static shape from the source memref type.
+    SmallVector<int64_t> getStaticSizes() {
+      if (getSourceType().isa<IntegerType>() || getShape().size()) {
+        auto dims = getMixedOffsets().size();
+        return SmallVector<int64_t>(dims, ShapedType::kDynamic);
+      }
+      auto memrefType = getSourceType().dyn_cast<MemRefType>();
+      return SmallVector<int64_t>(memrefType.getShape());
+    }
+
+    /// wrapper for matching with OffsetSizeAndStrideOpInterface
+    /// If the source is an IntegerType or `strides` is non-empty, it
+    /// returns an array of ShapedType::kDynamic, meaning the dynamic
+    /// strides encoded in the `strides` argument will be used. The presence
+    /// of `strides` overrides the static strides from the source memref type.
+    SmallVector<int64_t> getStaticStrides() {
+      if (getSourceType().isa<IntegerType>() || getStrides().size()) {
+        auto dims = getMixedOffsets().size();
+        return SmallVector<int64_t>(dims, ShapedType::kDynamic);
+      }
+      auto memrefType = getSourceType().dyn_cast<MemRefType>();
+      auto [strides, offset] = getStridesAndOffset(memrefType);
+      return strides;
+    }
+
+    /// Return the expected rank of each of the`static_offsets`, 
+    /// `static_shape` and `static_strides` attributes.
+    std::array<unsigned, 3> getArrayAttrMaxRanks() {
+      unsigned rank;
+      if (auto ty = getSourceType().dyn_cast<MemRefType>()) {
+        rank = ty.getRank();
+      } else {
+        rank = (unsigned)getMixedOffsets().size();
+      }
+      return {rank, rank, rank};
+    }
+    
+    /// Return the number of leading operands before the `offsets`, 
+    /// `shape` and `strides` operands.
+    static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
+
+    mlir::Value getViewSource() { return getSource(); }
   }];
 }
 
@@ -178,11 +193,11 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
 }
 
 
-def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
+def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
   let summary = "loads a n-D block from memory (represented by TensorDesc)" 
                 "to registers (represented by vector)";
   let description = [{
-    LoadNDOp essentially mimics the hardware block read instruction to read 
+    LoadNdOp essentially mimics the hardware block read instruction to read 
     a block of data from memory to register. It takes a set of cache hints 
     for each level of cache, L1, L2 and L3. If hardware does not have a 
     corresponding cache, the corresponding cache hint attribute will be masked.
@@ -218,7 +233,7 @@ def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
   let hasVerifier = 1;
 }
 
-def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> {
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
   let summary = "stores a n-D block register region back to memory, currently only supports 2D";
   let arguments = (ins XeGPU_ValueType: $value,
                        XeGPU_TensorDesc: $TensorDesc,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 36b04ea12bcad0..19ac1693712dd8 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -37,12 +37,12 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     TensorDesc is a type designed to describe regions of the interested data as well as some 
     features that are unique to Intel hardware. Different from the builtin tensor type in MLIR, 
     it essentially only contains the metadata and doesn't hold the data by itself. It is designed 
-    to mainly support 2d block load/store and DPAS (matrix multiplication instruction) on Intel GPU. 
+    to mainly support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPU. 
     It encodes the following information:
 
     * shape:  the sizes/shape of the interested data block, e.g., 8x16 means 8 rows
-              and each row contains 16 continious data element. The rows could be
-              either continuous or not, depends on whether the encoding attribute
+              and each row contains 16 contiguous data elements. The rows could be
+              either contiguous or not, depending on whether the encoding attribute
               is set or not.
     * element_type: the data type of the data element, e.g., f16, f32.
 
@@ -50,7 +50,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     the following information via the TensorDescAttr object:
     * memory_scope (xegpu::MemoryScope): [optional] where the data is located, 
                global memory or shared memory. It defaults to Global.
-    * array_length (int): [optional] The number of continuous blocks with size as `shape`,
+    * array_length (int): [optional] The number of contiguous blocks of size `shape`,
               that will be loaded by a block load at a time. It defaults to 1.
     * boundary_check (bool): [optional] indicates whether the operation detects the boundary 
                and pads with zero for out-of-boundary access. By default, the boundary check is performed.
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index be631c4678eacb..08723d12c278c8 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -9,23 +9,12 @@
 #include <mlir/Dialect/Utils/StaticValueUtils.h>
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h>
 #include <mlir/IR/Builders.h>
-#include <mlir/Interfaces/ViewLikeInterface.h>
 
 #define DEBUG_TYPE "xegpu"
 
 namespace mlir {
 namespace xegpu {
 
-static size_t getRankOf(Value value) {
-  if (value.getType().isIntOrIndexOrFloat())
-    return 0;
-  if (auto ty = llvm::dyn_cast_if_present<MemRefType>(value.getType()))
-    return ty.getRank();
-  if (auto ty = llvm::dyn_cast_if_present<VectorType>(value.getType()))
-    return ty.getRank();
-  llvm_unreachable("Unsupported value for getRankOf");
-}
-
 static void transpose(llvm::ArrayRef<int64_t> trans,
                       std::vector<int64_t> &shape) {
   std::vector<int64_t> old = shape;
@@ -53,41 +42,10 @@ static std::string makeString(T array, bool breakline = false) {
 // XeGPU_CreateNdDescOp
 //===----------------------------------------------------------------------===//
 void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
-                           Type TensorDesc, Value source, ValueRange offsets,
-                           ValueRange shape, ValueRange strides,
-                           llvm::ArrayRef<int64_t> static_offsets) {
-  auto offsetRank = static_offsets.size();
-  auto shapeRank = shape.size() ? shape.size() : getRankOf(source);
-
-  size_t dynOffsetRank =
-      std::count_if(static_offsets.begin(), static_offsets.end(),
-                    [](int64_t d) { return ShapedType::isDynamic(d); });
-
-  // shape and strides should exists at the same time
-  // and the final rank for shape and offset (dynamic + static)
-  // should be the same
-  assert(shape.size() == strides.size() && shapeRank == offsetRank &&
-         offsets.size() == dynOffsetRank);
-
-  state.addOperands(source);
-  state.addOperands(offsets);
-  state.addOperands(shape);
-  state.addOperands(strides);
-  state.addAttribute(
-      getOperandSegmentSizesAttrName(state.name),
-      builder.getDenseI32ArrayAttr({1, static_cast<int32_t>(offsets.size()),
-                                    static_cast<int32_t>(shape.size()),
-                                    static_cast<int32_t>(strides.size())}));
-  state.addAttribute(getStaticOffsetsAttrName(state.name),
-                     builder.getDenseI64ArrayAttr(static_offsets));
-  state.addTypes(TensorDesc);
-}
-
-void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
-                           Type tdesc, Value source,
+                           Type tdesc, TypedValue<MemRefType> source,
                            llvm::ArrayRef<OpFoldResult> offsets) {
-  auto ty = llvm::dyn_cast_if_present<MemRefType>(source.getType());
-  assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source));
+  auto ty = source.getType();
+  assert(ty && ty.hasStaticShape() && offsets.size() == (size_t)ty.getRank());
 
   llvm::SmallVector<int64_t> staticOffsets;
   llvm::SmallVector<Value> dynamicOffsets;
@@ -100,7 +58,7 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
 }
 
 void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
-                           Type tdesc, Value source,
+                           Type tdesc, TypedValue<IntegerType> source,
                            llvm::ArrayRef<OpFoldResult> offsets,
                            ValueRange shape, ValueRange stride) {
   assert(shape.size() && offsets.size() && stride.size() &&
@@ -117,108 +75,47 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
 }
 
 LogicalResult CreateNdDescOp::verify() {
-  auto offsetRank = getOffsets().size();
-  auto shapeRank = getShape().size();
-  auto stridesRank = getStrides().size();
-  auto baseRank = getRankOf(getSource()) ? getRankOf(getSource()) : 2;
-
-  if (offsetRank != shapeRank || shapeRank != stridesRank ||
-      shapeRank != baseRank)
-
-    return emitOpError(
-        "Expecting the rank of shape, strides, offsets and memref type "
-        "should match with each other (they currently should be 2D).");
-  return success();
-}
-
-// compute consolidated offsets from dynamic_offsets and static_offsets
-// parameters
-llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
-  llvm::SmallVector<OpFoldResult> offsets;
-  auto dynamicOffsets = getDynamicOffsets(); // dynamic_offsets variable
-  auto staticOffsets = getStaticOffsets();   // static_offsets attribute
-
-  // in case static_offsets is missing, dynamic_offsets will be used
-  if (staticOffsets.size() == 0) {
-    offsets.assign(dynamicOffsets.begin(), dynamicOffsets.end());
-    return offsets;
-  }
-
-  // use static offsets for each dim if it has valid value,
-  // othwise use the value from dynamic_offsets
-  for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) {
-    if (ShapedType::isDynamic(staticOffsets[i])) {
-      assert(j < dynamicOffsets.size());
-      offsets.push_back(dynamicOffsets[j++]);
-    } else {
-      auto ty = IndexType::get(getContext());
-      auto attr = IntegerAttr::get(ty, staticOffsets[i]);
-      offsets.push_back(attr);
-    }
-  }
-  return offsets;
-}
-
-// get the consolidated shape of the 2D memory region.
-// It prefer dynamic_shape than the static shape of
-// memref type.
-llvm::SmallVector<OpFoldResult> CreateNdDescOp::getShape() {
-  llvm::SmallVector<OpFoldResult> shape;
-  auto dynShape = getDynamicShape();
-  if (dynShape.size()) {
-    shape.append(dynShape.begin(), dynShape.end());
-    return shape;
-  }
-
-  auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
-  if (ty && ty.hasStaticShape()) {
-    for (auto dim : ty.getShape()) {
-      auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
-      shape.push_back(attr);
-    }
-    return shape;
+  auto rank = (int64_t)getMixedOffsets().size();
+  bool invalidRank = (rank != 2);
+  bool invalidElemTy = false;
+
+  // check source type matches the rank if it is a memref.
+  // It should also have the same ElementType as the TensorDesc.
+  auto memrefTy = getSourceType().dyn_cast<MemRefType>();
+  if (memrefTy) {
+    invalidRank |= (memrefTy.getRank() != rank);
+    invalidElemTy |= memrefTy.getElementType() != getElementType();
   }
 
-  this->emitError("The shape information of the memory is missing.\n");
-  return {};
-}
-
-// get the consolidated strides of the 2D memory region.
-// It prefer dynamic_stride than the static strides of
-// memref type.
-llvm::SmallVector<OpFoldResult> CreateNdDescOp::getStrides() {
-  llvm::SmallVector<OpFoldResult> strides;
+  // check result type matches the rank
+  invalidRank = (getType().getRank() != rank);
 
-  auto dynStrides = getDynamicStrides();
-  if (dynStrides.size()) {
-    strides.append(dynStrides.begin(), dynStrides.end());
-    return strides;
-  }
+  // Mismatches among shape, strides, and offsets are
+  // already handled by OffsetSizeAndStrideOpInterface,
+  // so they are not checked here.
+  if (invalidRank)
+    return emitOpError(
+        "Expecting the rank of shape, strides, offsets, "
+        "source memref type (if source is a memref) and TensorDesc "
+        "should match with each other. They currenlty are 2D.");
 
-  auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
-  if (ty && ty.hasStaticShape()) {
-    auto [staticStrides, offset] = getStridesAndOffset(ty);
-    for (auto dim : staticStrides) {
-      auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
-      strides.push_back(attr);
-    }
-    return strides;
-  }
+  if (invalidElemTy)
+    return emitOpError("TensorDesc should have the same element "
+                       "type with the source if it is a memref.\n");
 
-  this->emitError("The strides information of the memory is missing.\n");
-  return {};
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
-// XeGPU_LoadNDOp
+// XeGPU_LoadNdOp
 //===----------------------------------------------------------------------===//
-LogicalResult LoadNDOp::verify() {
+LogicalResult LoadNdOp::verify() {
   auto tdescTy = getTensorDescType();
   auto valueTy = getType();
 
   if (tdescTy.getRank() != 2)
     return emitOpError(
-        "The TensorDesc for LoadNDOp should be a 2D TensorDesc.");
+        "The TensorDesc for LoadNdOp should be a 2D TensorDesc.");
 
   if (!valueTy)
     return emitOpError("Invalid result, it should be a VectorType.\n");
@@ -257,16 +154,15 @@ LogicalResult LoadNDOp::verify() {
   if (tdescShape != valueShape)
     return emitOpError() << "Result shape doesn't match TensorDesc shape."
                          << "The expected shape is " << makeString(tdescShape)
-                         << ". "
-                         << "But the given shape is " << makeString(valueShape)
-                         << ".\n";
+                         << ". But the given shape is "
+                         << makeString(valueShape) << ".\n";
   return success();
 }
 
 //===----------------------------------------------------------------------===//
-// XeGPU_StoreNDOp
+// XeGPU_StoreNdOp
 //===----------------------------------------------------------------------===//
-LogicalResult StoreNDOp::verify() {
+LogicalResult StoreNdOp::verify() {
   auto dstTy = getTensorDesc().getType();               // Tile
   auto valTy = getValue().getType().cast<VectorType>(); // Vector
 

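For reference, the optional TensorDesc attributes documented above compose roughly as follows (a sketch with illustrative values; the exact printed form follows the `struct(params)` assembly format of tdesc_attr in XeGPUAttrs.td):

```
// An 8x16 block of f16 in shared local memory, loading two contiguous
// blocks at a time, with boundary checking disabled.
!xegpu.tensor_desc<8x16xf16,
  #xegpu.tdesc_attr<memory_scope = slm, array_length = 2, boundary_check = false>>
```
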
>From 8a9df4ba16cdaf422d7980f565f3c2046b141a90 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 12 Mar 2024 20:03:34 -0500
Subject: [PATCH 17/19] fix include format

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 8 ++++----
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp     | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 43337a6ab43dcd..0b3f4b9c9dbeae 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -6,10 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <llvm/ADT/TypeSwitch.h>
-#include <mlir/Dialect/XeGPU/IR/XeGPU.h>
-#include <mlir/IR/Builders.h>
-#include <mlir/IR/DialectImplementation.h>
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 namespace mlir {
 namespace xegpu {
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 08723d12c278c8..3a75b173b757c5 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -6,9 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <mlir/Dialect/Utils/StaticValueUtils.h>
-#include <mlir/Dialect/XeGPU/IR/XeGPU.h>
-#include <mlir/IR/Builders.h>
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/IR/Builders.h"
 
 #define DEBUG_TYPE "xegpu"
 

>From e3857bb9971b1570108ec55b97f2a18d44354cdc Mon Sep 17 00:00:00 2001
From: Chao Chen <116223022+chencha3 at users.noreply.github.com>
Date: Thu, 14 Mar 2024 10:13:19 -0500
Subject: [PATCH 18/19] Update mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td

Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 24fae9596994e3..cb768d5e6b9af3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -29,7 +29,7 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
 def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface, 
                         AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
 
-  let summary = "create nd tensor descriptor operation";
+  let summary = "Create nd-tensor descriptor operation";
   let description = [{
     The "create_nd_tdesc" operation creates a TensorDescType which represents
     a sub-view of a 2D memory region (It can be extended to support n-D memory

>From 5b6ebf8f8dcd4aef7fcca8ff44f6a88c7700580e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 15 Mar 2024 11:33:30 -0500
Subject: [PATCH 19/19] move example to description

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 70 ++++++++++++++-----
 1 file changed, 53 insertions(+), 17 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index cb768d5e6b9af3..02dc73ce7eb33d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -180,15 +180,25 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
 
 def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
   let summary = "prefetches a nD block to cache";
+  let description = [{
+    It issues an instruction to prefetch a block of data from memory to 
+    each level of the cache, according to the cache policy given by the hints.
+
+    Example:
+    ```
+      xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>, 
+                                l2_hint = #xegpu.cache_hint<cached>, 
+                                l3_hint = #xegpu.cache_hint<cached>}
+        : !xegpu.tensor_desc<8x16xf16>
+    ```
+
+  }];
+
   let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
 
-  // Format: xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>, 
-  //                                   l2_hint = #xegpu.cache_hint<cached>, 
-  //                                   l3_hint = #xegpu.cache_hint<cached>}
-  //         : !xegpu.tensor_desc<8x16xf16>
   let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc))";
 }
 
@@ -198,11 +208,27 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
                 "to registers (represented by vector)";
   let description = [{
     LoadNdOp essentially mimics the hardware block read instruction to read 
-    a block of data from memory to register. It takes a set of cache hints 
-    for each level of cache, L1, L2 and L3. If hardware does not have a 
+    a block of data from memory to register. It takes a set of optional cache 
+    hints for each level of cache, L1, L2 and L3. If hardware does not have a 
     corresponding cache, the corresponding cache hint attribute will be masked.
-    If both transpose and vnni_axis present at the same time. It assume to 
-    perform transpose first and then vnni transform.
+    The vnni transform is a hardware feature on Intel GPUs. It packs the 
+    data during the load for the B operand of a matrix operation when the 
+    bit width of the data type is less than 32 bits, e.g., fp16. Transpose 
+    is another Intel hardware feature, which transposes the data while 
+    loading it when the data type is fp32 or fp64. This implies that vnni 
+    and transpose cannot exist at the same time.
+
+    Example:
+    ```
+      xegpu.load_nd %1 {transpose = [1, 0],
+                        l1_hint = #xegpu.cache_hint<cached>, 
+                        l2_hint = #xegpu.cache_hint<uncached>, 
+                        l3_hint = #xegpu.cache_hint<streaming>}
+              : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+    ```
+
+
   }];
 
   let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
@@ -224,27 +250,37 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
     }
   }];
 
-  // Format: xegpu.load_nd %1 {transpose = [1, 0], 
-  //                l1_hint = #xegpu.cache_hint<cached>, 
-  //                l2_hint = #xegpu.cache_hint<uncached>, 
-  //                l3_hint = #xegpu.cache_hint<streaming>}
-  //         : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
   let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
   let hasVerifier = 1;
 }
 
 def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
   let summary = "stores a n-D block register region back to memory, currently only supports 2D";
+
+  let description = [{
+    StoreNdOp essentially mimics the hardware block write instruction to
+    write a block of data from register into the memory region as described 
+    by the TensorDesc. It takes a set of optional cache hints for each level 
+    of cache, L1, L2 and L3. If hardware does not have a corresponding cache, 
+    the corresponding cache hint attribute will be masked.
+
+    Example:
+    ```
+      xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+                             l2_hint = #xegpu.cache_hint<write_back>, 
+                             l3_hint = #xegpu.cache_hint<write_through>}
+                             : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+    ```
+
+
+  }];
+
   let arguments = (ins XeGPU_ValueType: $value,
                        XeGPU_TensorDesc: $TensorDesc,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
 
-  // Format: xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
-  //                                l2_hint = #xegpu.cache_hint<write_back>, 
-  //                                l3_hint = #xegpu.cache_hint<write_through>}
-  //                                : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
   let assemblyFormat = "$value `,` $TensorDesc attr-dict `:` type($value) `,` qualified(type($TensorDesc))";
   let hasVerifier = 1;
 }


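Putting the nd ops together, a sketch of the intended end-to-end usage (the vnni-packed result shape assumes the usual 2:1 packing of 16-bit data and is illustrative only; this patch does not pin it down):

```
%tdesc = xegpu.create_nd_tdesc %src[0, 0]
    : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>

// Load the B operand with vnni packing along axis 0; for f16 this is
// assumed to pack pairs of rows, yielding vector<4x16x2xf16>.
%b = xegpu.load_nd %tdesc {vnni_axis = 0}
    : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>

// Store an 8x16 result tile back through another descriptor.
xegpu.store_nd %val, %out {l1_hint = #xegpu.cache_hint<uncached>}
    : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
```
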
