[Mlir-commits] [mlir] Xegpu 2d block ops (PR #84692)

Chao Chen llvmlistbot at llvm.org
Sun Mar 10 13:45:15 PDT 2024


https://github.com/chencha3 created https://github.com/llvm/llvm-project/pull/84692

This adds XeGPU 2D block operators. It contains:
1. `TensorDescType` and `TensorDescAttr` definitions
2. `MemoryScopeAttr` and `CacheHintAttr` definitions which are used by `TensorDescAttr`. 
3. `CreateNdDescOp`, `PrefetchNdOp`, `LoadNdOp`, and `StoreNdOp` definitions, and their corresponding testcases for illustration. 

>From ad27a81fd0fddbffb7e5b3529017f3c532b0db7d Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Sun, 10 Mar 2024 20:35:47 +0000
Subject: [PATCH 1/2] add XeGPU 2D block operators

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h    |   4 +
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  61 ++++
 .../mlir/Dialect/XeGPU/IR/XeGPUDialect.td     |   4 +-
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 211 +++++++++++++
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       | 104 ++++++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  72 ++++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 276 +++++++++++++++++-
 mlir/test/Dialect/XeGPU/XeGPUOps.mlir         |  62 ++++
 8 files changed, 787 insertions(+), 7 deletions(-)
 create mode 100644 mlir/test/Dialect/XeGPU/XeGPUOps.mlir

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 7aaa4ecc7ee77a..8dc3ff78d25ede 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -9,7 +9,11 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_H
 #define MLIR_DIALECT_XEGPU_IR_XEGPU_H
 
+#include <mlir/Bytecode/BytecodeOpInterface.h>
+#include <mlir/IR/BuiltinTypes.h>
 #include <mlir/IR/Dialect.h>
+#include <mlir/Interfaces/ShapedOpInterfaces.h>
+#include <mlir/Interfaces/SideEffectInterfaces.h>
 
 namespace mlir {
 namespace xegpu {
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index bb325c272e3324..cd38549f1ccf43 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -10,6 +10,7 @@
 #define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
 
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/EnumAttr.td"
 
 class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
                 string baseCppClass = "::mlir::Attribute">
@@ -17,4 +18,64 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
   let mnemonic = attrMnemonic;
 }
 
+def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
+  let parameters = (ins
+    OptionalParameter<"MemoryScopeAttr">: $memory_scope,
+    OptionalParameter<"IntegerAttr", "1">: $array_length,
+    OptionalParameter<"BoolAttr", "true">: $boundary_check
+  );
+
+  let builders = [
+    AttrBuilder<(ins
+      CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
+      CArg<"int", "1">:$array_length,
+      CArg<"bool", "true">: $boundary_check
+    )>
+  ];
+
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Memory Scope Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
+def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
+def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope", 
+      "The address space of the memory the tensor descriptor is created for", 
+      [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_MemoryScopeAttr: 
+  EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
+    let assemblyFormat = "$value";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Cache Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_CachePolicyCached:        I32EnumAttrCase<"CACHED", 0, "cached">;                    // valid for read and write
+def XeGPU_CachePolicyUncached:      I32EnumAttrCase<"UNCACHED", 1, "uncached">;                // valid for read and write
+def XeGPU_CachePolicyStreaming:     I32EnumAttrCase<"STREAMING", 2, "streaming">;              // valid for read only
+def XeGPU_CachePolicyInvalid:       I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">;  // valid for read only
+def XeGPU_CachePolicyWriteBack:     I32EnumAttrCase<"WRITE_BACK", 4, "write_back">;            // valid for write only
+def XeGPU_CachePolicyWriteThrough:  I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">;      // valid for write only
+
+def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy", 
+  [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached, 
+   XeGPU_CachePolicyStreaming, XeGPU_CachePolicyInvalid,
+   XeGPU_CachePolicyWriteBack, XeGPU_CachePolicyWriteThrough]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_CacheHintAttr 
+  : EnumAttr<XeGPU_Dialect, XeGPU_CachePolicyEnums, "cache_hint"> {
+    let assemblyFormat = "`<` $value `>`";
+}
+
+
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index 3851275ad30a0a..c2f09319c790e0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -23,8 +23,8 @@ def XeGPU_Dialect : Dialect {
       the lower-level GPU compiler.
     }];
 
-    // let useDefaultTypePrinterParser = true;
-    // let useDefaultAttributePrinterParser = true;
+    let useDefaultTypePrinterParser = true;
+    let useDefaultAttributePrinterParser = true;
 }
 
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUDIALECT_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 5825ef9195b03f..9d37d77e03a0c5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -12,6 +12,22 @@
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/IR/AttrTypeBase.td"
+
+
+include "mlir/IR/OpBase.td"
+include "mlir/IR/OpAsmInterface.td"
+include "mlir/IR/AttrTypeBase.td"
+include "mlir/IR/BuiltinTypes.td"
+include "mlir/IR/BuiltinTypeInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/ViewLikeInterface.td"
+include "mlir/Interfaces/CastInterfaces.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/Interfaces/CopyOpInterface.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+include "mlir/Interfaces/ShapedOpInterfaces.td"
 
 
 // Base class for dialect operations. This operation inherits from the base
@@ -23,4 +39,199 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
           Op<XeGPU_Dialect, mnemonic, traits>;
 
 
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> {
+
+  let summary = "create nd tensor descriptor operation";
+  let description = [{
+    The "create_nd_tdesc" operation creates a TensorDescType which represents
+    a sub-view of a 2D memory region (It can be extended to support N-D memory
+    region if needed in future). Elements in the subview continuous in each 
+    dimention. It encodes the following important information for supporting 
+    Intel hardware features:
+
+    * source: an object representing (starting address/pointer of) a 2D memory region. 
+        It can be either a 2D memref object, or simply a pointer represented by uint64_t type.
+        for the later case, the shape and layout information of the 2D memory region should 
+        be explicitly passed via `dynamic_shape` and `dynamic_strides` parameters.
+    * offsets: two index values represent offsets from the "source" at each dimension 
+        at which the subview of the target memory will be created. It is encoded via two
+        variables, including "dynamic_offsets" and "static_offsets", such that it can
+        accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4])).
+    * shape: the shape information of the memory region pointed by the "source".  It is 
+        typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. 
+        But if "source" is simply a pointer represented as uint64_t type, or a memref 
+        type without shape information e.g., memref<?x?xf16>, the shape information has 
+        to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape" 
+        only accepts operands(e.g., [%c4096, %c4096]), not attributes(e.g., [4096, 4096]).
+    * strides: the strides of the memory region pointed by the "source". Similar to shape, 
+        it is typically encoded via the MemRefType of the source too. But if "source" is 
+        simply a pointer represented as uint64_t type, or a memref type without shape 
+        information e.g., memref<?x?xf16>, the strides information has to be explicitly 
+        passed via the "dynamic_strides" argument. It also currently only accepts operands, not attributes.
+
+    Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = memref.alloc() : memref<1024x1024xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0]: memref<1024x1024xf32> -> TensorDesc<8x16xf32>
+
+    Example 2 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = memref.alloc(%h, %w) : memref<?x?xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: memref<?x?xf32> -> TensorDesc<8x16xf32>
+
+    Example 3 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = ... : ui64
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
+  }];
+
+  let arguments = (ins 
+    XeGPU_BaseAddrType: $source, 
+    Variadic<Index>: $dynamic_offsets, 
+    Variadic<Index>: $dynamic_shape, 
+    Variadic<Index>: $dynamic_strides,
+    DenseI64ArrayAttr: $static_offsets
+  );
+  let results = (outs XeGPU_TensorDesc: $TensorDesc);
+
+  let assemblyFormat = [{
+    $source ``
+    custom<DynamicIndexList>($dynamic_offsets, $static_offsets)
+    (`,` `[` $dynamic_shape^ `]` `,` `[` $dynamic_strides `]`)?
+    attr-dict `:` type($source) `->` qualified(type($TensorDesc))
+  }];
+  let skipDefaultBuilders = 1;
+  let hasVerifier = 1;
+
+  let builders = [
+    OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, 
+                   "ValueRange": $shape, "ValueRange": $strides, 
+                   "llvm::ArrayRef<int64_t>": $static_offsets)>,
+
+    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
+                   "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+
+    OpBuilder<(ins "Type": $tdesc, "Value": $source, 
+                   "llvm::ArrayRef<OpFoldResult>": $offsets,
+                   "ValueRange": $shape, "ValueRange": $stride)>
+  ];
+
+  let extraClassDeclaration = [{
+    /// Returns the type of the source memref operand.
+    Type getSourceType() {
+      return getSource().getType();
+    }
+
+    /// Returns the type of the result TensorDesc.
+    xegpu::TensorDescType getType() {
+      return getTensorDesc().getType();
+    }
+
+    /// Returns the offsets info to the source. It consolidates
+    /// information from both dynamic_offsets and static_offsets
+    /// parameters. static_offsets parameter always has the expected
+    /// ranks with some dim could have ShapeType::kDynamic value
+    /// indicating the corresponding value should be from dynamic_offsets.
+    llvm::SmallVector<OpFoldResult> getOffsets();
+
+    /// returns the shape info of the source. It is either from the
+    /// memref type, if source is a memref with static shape
+    /// information or from the dynamic_shape parameter. If both
+    /// exists, the dynamic_shape parameter will be used and the
+    /// shape information from  memref type will be ignored.
+    llvm::SmallVector<OpFoldResult> getShape();
+
+    /// returns the strides info of the source. It is either from the
+    /// memref type, if source is a memref with static shape
+    /// information or from the dynamic_stride parameter. If both
+    /// exists, the dynamic_strides parameter will be used and the
+    /// strides information from  memref type will be ignored.
+    llvm::SmallVector<OpFoldResult> getStrides();
+
+    /// Return the element type of the TensorDesc
+    Type getElementType() {
+      return getType().getElementType();
+    }
+
+    /// Return the shape of the TensorDesc
+    llvm::ArrayRef<int64_t> getTensorDescShape() {
+      return getType().getShape();
+    }
+  }];
+}
+
+def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
+  let summary = "prefetches a nD block to cache";
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  // Format: xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>, 
+  //                                   l2_hint = #xegpu.cache_hint<cached>, 
+  //                                   l3_hint = #xegpu.cache_hint<cached>}
+  //         : !xegpu.tensor_desc<8x16xf16>
+  let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc))";
+}
+
+
+def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
+  let summary = "loads a n-D block from memory (represented by TensorDesc)" 
+                "to registers (represented by vector)";
+  let description = [{
+    LoadNDOp essentially mimics the hardware block read instruction to read 
+    a block of data from memory to register. It takes a set of cache hints 
+    for each level of cache, L1, L2 and L3. If the hardware does not have a
+    corresponding cache, the corresponding cache hint attribute will be masked.
+    If both transpose and vnni_axis are present at the same time, the transpose
+    is assumed to be performed first, followed by the vnni transform.
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<I64Attr>: $vnni_axis,
+                       OptionalAttr<DenseI64ArrayAttr>: $transpose,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let results = (outs XeGPU_ValueType: $value);
+
+  let extraClassDeclaration = [{
+    VectorType getType() {
+      return llvm::dyn_cast<VectorType>(getValue().getType());
+    }
+
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
+
+  // Format: xegpu.load_nd %1 {transpose = [1, 0], 
+  //                l1_hint = #xegpu.cache_hint<cached>, 
+  //                l2_hint = #xegpu.cache_hint<uncached>, 
+  //                l3_hint = #xegpu.cache_hint<streaming>}
+  //         : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+  let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
+  let hasVerifier = 1;
+}
+
+def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> {
+  let summary = "stores a n-D block register region back to memory, currently only supports 2D";
+  let arguments = (ins XeGPU_ValueType: $value,
+                       XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  // Format: xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+  //                                l2_hint = #xegpu.cache_hint<write_back>, 
+  //                                l3_hint = #xegpu.cache_hint<write_through>}
+  //                                : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+  let assemblyFormat = "$value `,` $TensorDesc attr-dict `:` type($value) `,` qualified(type($TensorDesc))";
+  let hasVerifier = 1;
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 1d75bb4e2906fe..36b04ea12bcad0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -9,9 +9,9 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
 #define MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
 
-include "mlir/IR/BuiltinTypes.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/BuiltinTypes.td"
 
 def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>;
 def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
@@ -30,4 +30,106 @@ class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
   let mnemonic = typeMnemonic;
 }
 
+def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
+        [ShapedTypeInterface], "::mlir::TensorType"> {
+  let summary = "TensorDesc describing regions of interested data.";
+  let description = [{
+    TensorDesc is a type designed to describe regions of interested data as well as some
+    features that are unique to Intel hardware. Different from the builtin tensor type in MLIR,
+    it essentially only contains the metadata, and doesn't hold the data by itself. It is designed
+    to mainly support 2d block load/store and DPAS (matrix multiplication instruction) on Intel GPU. 
+    It encodes the following information:
+
+    * shape:  the sizes/shape of the interested data block, e.g., 8x16 means 8 rows
+              and each row contains 16 contiguous data elements. The rows could be
+              either contiguous or not, depending on whether the encoding attribute
+              is set or not.
+    * element_type: the data type of the data element, e.g., f16, f32.
+
+    Similar to the builtin tensor, it also provides an optional attribute encoding
+    the following information via the TensorDescAttr object:
+    * memory_scope (xegpu::MemoryScope): [optional] where the data is located,
+                global memory or shared memory. It defaults to Global.
+    * array_length (int): [optional] The number of contiguous blocks with size as `shape`,
+               that will be loaded by a block load at a time. It defaults to 1.
+    * boundary_check (bool): [optional] indicates whether the operation detects the boundary
+                and pads with zero for out-of-boundary access. It defaults to doing boundary checks.
+    
+
+    Syntax:
+
+    ```
+    TensorDesc-type ::= `tensor_desc` `<` dim-list element-type (attr-list)? `>`
+    element-type ::= float-type | integer-type | index-type
+    dim-list := (static-dim-list `x`)?
+    static-dim-list ::= decimal-literal `x` decimal-literal
+    attr-list = (, memory_scope = value)? (, arr_len = value)? (, boundary_check = value)?
+    ```
+
+    Examples:
+
+    ```mlir
+    // A block TensorDesc with 8x16 i32 elements
+    xegpu.tensor_desc<8x16xi32>
+
+    // A block TensorDesc with 8x16 f32 elements
+    xegpu.tensor_desc<8x16xf32>
+
+    // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
+    xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+    ```
+  }];
+
+  let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
+                        "mlir::Type": $elementType,
+                        OptionalParameter<"mlir::Attribute">: $encoding);
+
+  let extraClassDeclaration = [{
+    using TensorType::clone;
+    using mlir::ShapedType::Trait<TensorDescType>::getElementTypeBitWidth;
+    using mlir::ShapedType::Trait<TensorDescType>::getRank;
+    using mlir::ShapedType::Trait<TensorDescType>::getNumElements;
+    using mlir::ShapedType::Trait<TensorDescType>::isDynamicDim;
+    using mlir::ShapedType::Trait<TensorDescType>::hasStaticShape;
+    using mlir::ShapedType::Trait<TensorDescType>::getNumDynamicDims;
+    using mlir::ShapedType::Trait<TensorDescType>::getDimSize;
+    using mlir::ShapedType::Trait<TensorDescType>::getDynamicDimIndex;
+
+    TensorDescType clone(::mlir::Type elementType) {
+      return llvm::cast<TensorDescType>(cloneWith(getShape(), elementType));
+    }
+
+    TensorDescAttr getEncodingAsTensorDescAttr() const {
+      return llvm::dyn_cast_if_present<TensorDescAttr>(getEncoding());
+    }
+
+    xegpu::MemoryScope getMemoryScope() const {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getMemoryScope())
+        return attr.getMemoryScope().getValue();
+      // return default value
+      return MemoryScope::Global;
+    }
+
+    int getArrayLength() {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getArrayLength())
+        return attr.getArrayLength().getInt();
+      // return default value
+      return 1; 
+    }
+
+    bool getBoundaryCheck() {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getBoundaryCheck())
+        return attr.getBoundaryCheck().getValue();
+      // return default value
+      return true;
+    }
+  }];
+
+  let hasCustomAssemblyFormat = true;
+  
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 4f839ee773476b..bd72d5c17b6ea1 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -6,7 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <llvm/ADT/TypeSwitch.h>
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h>
+#include <mlir/IR/Builders.h>
+#include <mlir/IR/DialectImplementation.h>
 
 namespace mlir {
 namespace xegpu {
@@ -26,8 +29,73 @@ void XeGPUDialect::initialize() {
       >();
 }
 
-// this file is for position occupation,
-// we will add functions in following PRs.
+
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescAttr
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescType
+//===----------------------------------------------------------------------===//
+mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
+  llvm::SmallVector<int64_t> shape;
+  mlir::Type elementType;
+  mlir::FailureOr<mlir::Attribute> encoding;
+
+  // Parse literal '<'
+  if (parser.parseLess())
+    return {};
+
+  auto shapeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseDimensionList(shape))) {
+    parser.emitError(shapeLoc, "failed to parse parameter 'shape'");
+    return {};
+  }
+
+  auto elemTypeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseType(elementType))) {
+    parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'");
+    return {};
+  }
+
+  // parse optional attributes
+  if (mlir::succeeded(parser.parseOptionalComma())) {
+    encoding = mlir::FieldParser<mlir::Attribute>::parse(parser);
+    if (mlir::failed(encoding)) {
+      parser.emitError(parser.getCurrentLocation(),
+          "Failed to parse the attribute field for TensorDescType.\n");
+      return {};
+    }
+  }
+
+  // Parse literal '>'
+  if (parser.parseGreater())
+    return {};
+
+  return TensorDescType::get(parser.getContext(), shape, elementType,
+                             encoding.value_or(mlir::Attribute()));
+}
+
+void TensorDescType::print(::mlir::AsmPrinter &printer) const {
+  printer << "<";
+
+  auto shape = getShape();
+  for (int64_t dim : shape) {
+    if (mlir::ShapedType::isDynamic(dim))
+      printer << '?';
+    else
+      printer << dim;
+    printer << 'x';
+  }
+
+  printer << getElementType();
+
+  if (auto encoding = getEncoding())
+    printer << ", " << encoding;
+
+  printer << ">";
+}
+
 
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 0e89ac4df6ef28..a388db4f5c2dc6 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -6,14 +6,286 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <mlir/Dialect/Utils/StaticValueUtils.h>
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h>
+#include <mlir/Interfaces/ViewLikeInterface.h>
+#include <mlir/IR/Builders.h>
 
 #define DEBUG_TYPE "xegpu"
 
 namespace mlir {
 namespace xegpu {
-// this file is for position occupation,
-// we will add functions in following PRs.
+
+static size_t getRankOf(Value value) {
+  if (value.getType().isIntOrIndexOrFloat())
+    return 0;
+  if (auto ty = llvm::dyn_cast_if_present<MemRefType>(value.getType()))
+    return ty.getRank();
+  if (auto ty = llvm::dyn_cast_if_present<VectorType>(value.getType()))
+    return ty.getRank();
+  llvm_unreachable("Unsupported value for getRankOf");
+}
+
+static void transpose(llvm::ArrayRef<int64_t> trans,
+                      std::vector<int64_t> &shape) {
+  std::vector<int64_t> old = shape;
+  for (size_t i = 0; i < trans.size(); i++)
+    shape[i] = old[trans[i]];
+}
+
+template <typename T>
+static std::string makeString(T array, bool breakline = false) {
+  std::string buf;
+  buf.clear();
+  llvm::raw_string_ostream os(buf);
+  os << "[";
+  for (size_t i = 1; i < array.size(); i++) {
+    os << array[i - 1] << ", ";
+    if (breakline)
+      os << "\n\t\t";
+  }
+  os << array.back() << "]";
+  os.flush();
+  return buf;
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_CreateNdDescOp
+//===----------------------------------------------------------------------===//
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type TensorDesc, Value source, ValueRange offsets,
+                           ValueRange shape, ValueRange strides,
+                           llvm::ArrayRef<int64_t> static_offsets) {
+  auto offsetRank = static_offsets.size();
+  auto shapeRank = shape.size() ? shape.size() : getRankOf(source);
+
+  size_t dynOffsetRank =
+      std::count_if(static_offsets.begin(), static_offsets.end(),
+                    [](int64_t d) { return ShapedType::isDynamic(d); });
+
+  // shape and strides should exists at the same time
+  // and the final rank for shape and offset (dynamic + static)
+  // should be the same
+  assert(shape.size() == strides.size() && shapeRank == offsetRank &&
+         offsets.size() == dynOffsetRank);
+
+  state.addOperands(source);
+  state.addOperands(offsets);
+  state.addOperands(shape);
+  state.addOperands(strides);
+  state.addAttribute(
+      getOperandSegmentSizesAttrName(state.name),
+      builder.getDenseI32ArrayAttr({1, static_cast<int32_t>(offsets.size()),
+                                    static_cast<int32_t>(shape.size()),
+                                    static_cast<int32_t>(strides.size())}));
+  state.addAttribute(getStaticOffsetsAttrName(state.name),
+                     builder.getDenseI64ArrayAttr(static_offsets));
+  state.addTypes(TensorDesc);
+}
+
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type tdesc, Value source,
+                           llvm::ArrayRef<OpFoldResult> offsets) {
+  auto ty = llvm::dyn_cast_if_present<MemRefType>(source.getType());
+  assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source));
+
+  llvm::SmallVector<int64_t> staticOffsets;
+  llvm::SmallVector<Value> dynamicOffsets;
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+  build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */,
+        ValueRange({}) /* empty dynamic shape */,
+        ValueRange({}) /* empty dynamic strides */,
+        staticOffsets /* static offsets */);
+}
+
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type tdesc, Value source,
+                           llvm::ArrayRef<OpFoldResult> offsets,
+                           ValueRange shape, ValueRange stride) {
+  assert(shape.size() && offsets.size() && stride.size() &&
+         shape.size() == stride.size() && shape.size() == offsets.size());
+
+  llvm::SmallVector<int64_t> staticOffsets;
+  llvm::SmallVector<Value> dynamicOffsets;
+
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+  build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets,
+        /* dynamic shape = */ shape , /* dynamic strides = */ stride,
+        /* static offsets = */ staticOffsets);
+}
+
+
+LogicalResult CreateNdDescOp::verify() {
+  auto offsetRank = getOffsets().size();
+  auto shapeRank = getShape().size();
+  auto stridesRank = getStrides().size();
+  auto baseRank = getRankOf(getSource()) ? getRankOf(getSource()) : 2;
+
+  if (offsetRank != shapeRank || shapeRank != stridesRank ||
+      shapeRank != baseRank)
+
+    return emitOpError(
+        "Expecting the rank of shape, strides, offsets and memref type "
+        "should match with each other (they currently should be 2D).");
+  return success();
+}
+
+// compute consolidated offsets from dynamic_offsets and static_offsets parameters
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
+  llvm::SmallVector<OpFoldResult> offsets;
+  auto dynamicOffsets = getDynamicOffsets(); // dynamic_offsets variable
+  auto staticOffsets = getStaticOffsets();   // static_offsets attribute
+
+  // in case static_offsets is missing, dynamic_offsets will be used
+  if (staticOffsets.size() == 0) {
+    offsets.assign(dynamicOffsets.begin(), dynamicOffsets.end());
+    return offsets;
+  }
+
+  // use the static offset for each dim if it has a valid value,
+  // otherwise use the value from dynamic_offsets
+  for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) {
+    if (ShapedType::isDynamic(staticOffsets[i])) {
+      assert(j < dynamicOffsets.size());
+      offsets.push_back(dynamicOffsets[j++]);
+    } else {
+      auto ty = IndexType::get(getContext());
+      auto attr = IntegerAttr::get(ty, staticOffsets[i]);
+      offsets.push_back(attr);
+    }
+  }
+  return offsets;
+}
+
+// get the consolidated shape of the 2D memory region.
+// It prefers dynamic_shape over the static shape of
+// the memref type.
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getShape() {
+  llvm::SmallVector<OpFoldResult> shape;
+  auto dynShape = getDynamicShape();
+  if (dynShape.size()) {
+    shape.append(dynShape.begin(), dynShape.end());
+    return shape;
+  }
+
+  auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
+  if (ty && ty.hasStaticShape()) {
+    for (auto dim : ty.getShape()) {
+      auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
+      shape.push_back(attr);
+    }
+    return shape;
+  }
+  
+  this->emitError("The shape information of the memory is missing.\n");
+  return {};
+}
+
+// get the consolidated strides of the 2D memory region.
+// It prefers dynamic_strides over the static strides of
+// the memref type.
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getStrides() {
+  llvm::SmallVector<OpFoldResult> strides;
+
+  auto dynStrides = getDynamicStrides();
+  if (dynStrides.size()) {
+    strides.append(dynStrides.begin(), dynStrides.end());
+    return strides;
+  }
+
+  auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
+  if (ty && ty.hasStaticShape()) {
+    auto [staticStrides, offset] = getStridesAndOffset(ty);
+    for (auto dim : staticStrides) {
+      auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
+      strides.push_back(attr);
+    }
+    return strides;
+  }
+
+  this->emitError("The strides information of the memory is missing.\n");
+  return {};
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_LoadNDOp
+//===----------------------------------------------------------------------===//
+LogicalResult LoadNDOp::verify() {
+  auto tdescTy = getTensorDescType();
+  auto valueTy = getType();
+
+  if (tdescTy.getRank() != 2)
+    return emitOpError(
+        "The TensorDesc for LoadNDOp should be a 2D TensorDesc.");
+
+  if (!valueTy)
+    return emitOpError("Invalid result, it should be a VectorType.\n");
+
+  auto tdescElemTy = tdescTy.getElementType();
+  auto valueElemTy = valueTy.getElementType();
+
+  if (tdescElemTy != valueElemTy)
+    return emitOpError(
+        "Value should have the same element type as TensorDesc.");
+
+  auto array_len = tdescTy.getArrayLength();
+  auto tdescShape = tdescTy.getShape().vec();
+  auto valueShape = valueTy.getShape().vec();
+
+  if (getTranspose()) {
+    auto trans = getTranspose().value();
+    if (tdescShape.size() >= trans.size())
+      transpose(trans, tdescShape);
+    else
+      emitWarning("Invalid transpose attr. It is ignored.");
+  }
+
+  if (getVnniAxis()) {
+    auto axis = getVnniAxis().value();
+    auto vnni_factor = valueShape.back();
+    tdescShape[axis] /= vnni_factor;
+    tdescShape.push_back(vnni_factor);
+  }
+
+  if (array_len > 1) {
+    auto it = tdescShape.begin();
+    tdescShape.insert(it, array_len);
+  }
+
+  if (tdescShape != valueShape)
+    return emitOpError() <<"Result shape doesn't match TensorDesc shape."
+           << "The expected shape is " << makeString(tdescShape) << ". "
+           << "But the given shape is " << makeString(valueShape) << ".\n";
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_StoreNDOp
+//===----------------------------------------------------------------------===//
+LogicalResult StoreNDOp::verify() {
+  auto dstTy = getTensorDesc().getType();               // Tile
+  auto valTy = getValue().getType().cast<VectorType>(); // Vector
+
+  if (dstTy.getRank() != 2)
+    return emitOpError("Expecting a 2D TensorDesc shape.\n");
+
+  if (!valTy)
+    return emitOpError("Exepcting a VectorType result.\n");
+
+  auto dstElemTy = dstTy.getElementType();
+  auto valElemTy = valTy.getElementType();
+
+  if (dstElemTy != valElemTy) {
+    return emitOpError() << "The element type of the value should "
+                       "match the elementtype of the TensorDesc.\n";
+  }
+
+  if (dstTy.getShape() != valTy.getShape())
+    return emitOpError() << "The result shape should match the TensorDesc shape.\n";
+  return success();
+}
 
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
new file mode 100644
index 00000000000000..f9b3510beb4335
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
@@ -0,0 +1,62 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: gpu.module @test {
+gpu.module @test {
+// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
+gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+  //CHECK: %[[C:.*]] = arith.constant 1 : index
+  %c1 = arith.constant 1 : index
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr<array_length = 2 : i64>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr<array_length = 2>>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // CHECK: xegpu.prefetch_nd %[[R0]] {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>} : !xegpu.tensor_desc<8x16xf16>
+  xegpu.prefetch_nd %1 {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}: !xegpu.tensor_desc<8x16xf16>
+  gpu.return
+}
+
+// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, vnni_axis = 0 : i64} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+  %2 = xegpu.load_nd %1 {vnni_axis = 0, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>} 
+       : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+  gpu.return
+}
+
+// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
+  // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16>
+  %1 = arith.constant dense<1.0>: vector<24x32xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  // CHECK: xegpu.store_nd %[[C]], %[[R0]] {l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>} : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+  xegpu.store_nd %1, %2 {l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>} : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+  gpu.return
+}
+
+}
\ No newline at end of file

>From 74bd038f61985874694c01023c16f04e070e1419 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Sun, 10 Mar 2024 20:38:46 +0000
Subject: [PATCH 2/2] run clang-format

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp |  5 ++--
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp     | 33 ++++++++++++----------
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index bd72d5c17b6ea1..43337a6ab43dcd 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -29,7 +29,6 @@ void XeGPUDialect::initialize() {
       >();
 }
 
-
 //===----------------------------------------------------------------------===//
 // XeGPU_TensorDescAttr
 //===----------------------------------------------------------------------===//
@@ -62,7 +61,8 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
   if (mlir::succeeded(parser.parseOptionalComma())) {
     encoding = mlir::FieldParser<mlir::Attribute>::parse(parser);
     if (mlir::failed(encoding)) {
-      parser.emitError(parser.getCurrentLocation(),
+      parser.emitError(
+          parser.getCurrentLocation(),
           "Failed to parse the attribute field for TensorDescType.\n");
       return {};
     }
@@ -96,7 +96,6 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const {
   printer << ">";
 }
 
-
 } // namespace xegpu
 } // namespace mlir
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index a388db4f5c2dc6..be631c4678eacb 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -8,8 +8,8 @@
 
 #include <mlir/Dialect/Utils/StaticValueUtils.h>
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h>
-#include <mlir/Interfaces/ViewLikeInterface.h>
 #include <mlir/IR/Builders.h>
+#include <mlir/Interfaces/ViewLikeInterface.h>
 
 #define DEBUG_TYPE "xegpu"
 
@@ -112,11 +112,10 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
 
   build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets,
-        /* dynamic shape = */ shape , /* dynamic strides = */ stride,
+        /* dynamic shape = */ shape, /* dynamic strides = */ stride,
         /* static offsets = */ staticOffsets);
 }
 
-
 LogicalResult CreateNdDescOp::verify() {
   auto offsetRank = getOffsets().size();
   auto shapeRank = getShape().size();
@@ -132,7 +131,8 @@ LogicalResult CreateNdDescOp::verify() {
   return success();
 }
 
-// compute consolidated offsets from dynamic_offsets and static_offsets parameters
+// compute consolidated offsets from dynamic_offsets and static_offsets
+// parameters
 llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
   llvm::SmallVector<OpFoldResult> offsets;
   auto dynamicOffsets = getDynamicOffsets(); // dynamic_offsets variable
@@ -144,7 +144,7 @@ llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
     return offsets;
   }
 
-  // use static offsets for each dim if it has valid value, 
+  // use static offsets for each dim if it has valid value,
   // othwise use the value from dynamic_offsets
   for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) {
     if (ShapedType::isDynamic(staticOffsets[i])) {
@@ -159,8 +159,8 @@ llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
   return offsets;
 }
 
-// get the consolidated shape of the 2D memory region. 
-// It prefer dynamic_shape than the static shape of 
+// get the consolidated shape of the 2D memory region.
+// It prefer dynamic_shape than the static shape of
 // memref type.
 llvm::SmallVector<OpFoldResult> CreateNdDescOp::getShape() {
   llvm::SmallVector<OpFoldResult> shape;
@@ -178,13 +178,13 @@ llvm::SmallVector<OpFoldResult> CreateNdDescOp::getShape() {
     }
     return shape;
   }
-  
+
   this->emitError("The shape information of the memory is missing.\n");
   return {};
 }
 
-// get the consolidated strides of the 2D memory region. 
-// It prefer dynamic_stride than the static strides of 
+// get the consolidated strides of the 2D memory region.
+// It prefer dynamic_stride than the static strides of
 // memref type.
 llvm::SmallVector<OpFoldResult> CreateNdDescOp::getStrides() {
   llvm::SmallVector<OpFoldResult> strides;
@@ -255,9 +255,11 @@ LogicalResult LoadNDOp::verify() {
   }
 
   if (tdescShape != valueShape)
-    return emitOpError() <<"Result shape doesn't match TensorDesc shape."
-           << "The expected shape is " << makeString(tdescShape) << ". "
-           << "But the given shape is " << makeString(valueShape) << ".\n";
+    return emitOpError() << "Result shape doesn't match TensorDesc shape."
+                         << "The expected shape is " << makeString(tdescShape)
+                         << ". "
+                         << "But the given shape is " << makeString(valueShape)
+                         << ".\n";
   return success();
 }
 
@@ -279,11 +281,12 @@ LogicalResult StoreNDOp::verify() {
 
   if (dstElemTy != valElemTy) {
     return emitOpError() << "The element type of the value should "
-                       "match the elementtype of the TensorDesc.\n";
+                            "match the elementtype of the TensorDesc.\n";
   }
 
   if (dstTy.getShape() != valTy.getShape())
-    return emitOpError() << "The result shape should match the TensorDesc shape.\n";
+    return emitOpError()
+           << "The result shape should match the TensorDesc shape.\n";
   return success();
 }
 



More information about the Mlir-commits mailing list