[Mlir-commits] [mlir] 61b24c6 - [MLIR][XeGPU] Adding XeGPU 2d block operators (#85804)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Mar 20 15:32:34 PDT 2024
Author: Chao Chen
Date: 2024-03-20T17:32:30-05:00
New Revision: 61b24c61a90802e06e40a7ab0aa5e2138486bd73
URL: https://github.com/llvm/llvm-project/commit/61b24c61a90802e06e40a7ab0aa5e2138486bd73
DIFF: https://github.com/llvm/llvm-project/commit/61b24c61a90802e06e40a7ab0aa5e2138486bd73.diff
LOG: [MLIR][XeGPU] Adding XeGPU 2d block operators (#85804)
This PR adds XeGPU 2D block operators. It contains:
1. TensorDescType and TensorDescAttr definitions
2. MemoryScopeAttr and CacheHintAttr definitions which are used by
TensorDescAttr.
3. CreateNdDescOp, PrefetchNdOp, LoadNdOp, and StoreNdOp definitions,
and their corresponding testcases for illustration.
It cherry-picks daebe5c4f27ba140ac8d13abf41e3fe4db72b91a with asan fix.
---------
Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
Added:
mlir/test/Dialect/XeGPU/XeGPUOps.mlir
Modified:
mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 7aaa4ecc7ee77a..87aabdc015fea5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -9,7 +9,12 @@
#ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_H
#define MLIR_DIALECT_XEGPU_IR_XEGPU_H
-#include <mlir/IR/Dialect.h>
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/Interfaces/ShapedOpInterfaces.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
namespace mlir {
namespace xegpu {
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index bb325c272e3324..cd38549f1ccf43 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -10,6 +10,7 @@
#define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/EnumAttr.td"
class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
string baseCppClass = "::mlir::Attribute">
@@ -17,4 +18,64 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
let mnemonic = attrMnemonic;
}
+// Attribute carrying the optional properties of a tensor descriptor. It is
+// spelled `#xegpu.tdesc_attr<...>` in the IR.
+def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
+ // All three parameters are optional and may be omitted in the textual form.
+ let parameters = (ins
+ OptionalParameter<"MemoryScopeAttr">: $memory_scope,
+ OptionalParameter<"IntegerAttr", "1">: $array_length,
+ OptionalParameter<"BoolAttr", "true">: $boundary_check
+ );
+
+ // Convenience builder taking plain C++ values; its arguments default to
+ // MemoryScope::Global, 1 and true respectively.
+ let builders = [
+ AttrBuilder<(ins
+ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
+ CArg<"int", "1">:$array_length,
+ CArg<"bool", "true">: $boundary_check
+ )>
+ ];
+
+ // Parameters are written as `name = value` pairs between `<` and `>`.
+ let assemblyFormat = "`<` struct(params) `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Memory Scope Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
+def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
+def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
+ "The address space of the memory the tensor descriptor is created for",
+ [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_MemoryScopeAttr:
+ EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
+ let assemblyFormat = "$value";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Cache Enums.
+//===----------------------------------------------------------------------===//
+// Individual cache policies. The trailing comment on each case states whether
+// the policy is meaningful for reads, writes, or both.
+def XeGPU_CachePolicyCached: I32EnumAttrCase<"CACHED", 0, "cached">; // valid for read and write
+def XeGPU_CachePolicyUncached: I32EnumAttrCase<"UNCACHED", 1, "uncached">; // valid for read and write
+def XeGPU_CachePolicyStreaming: I32EnumAttrCase<"STREAMING", 2, "streaming">; // valid for read only
+def XeGPU_CachePolicyInvalid: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">; // valid for read only
+def XeGPU_CachePolicyWriteBack: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only
+def XeGPU_CachePolicyWriteThrough: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only
+
+// The `CachePolicy` enum in namespace ::mlir::xegpu, grouping the six cases
+// above.
+def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy",
+ [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached,
+ XeGPU_CachePolicyStreaming, XeGPU_CachePolicyInvalid,
+ XeGPU_CachePolicyWriteBack, XeGPU_CachePolicyWriteThrough]> {
+ // No specialized attribute is generated for the enum itself; the attribute
+ // form is provided by XeGPU_CacheHintAttr below.
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::xegpu";
+}
+
+// Dialect attribute wrapping a CachePolicy value, written as
+// `#xegpu.cache_hint<policy>`.
+def XeGPU_CacheHintAttr
+ : EnumAttr<XeGPU_Dialect, XeGPU_CachePolicyEnums, "cache_hint"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index 3851275ad30a0a..c2f09319c790e0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -23,8 +23,8 @@ def XeGPU_Dialect : Dialect {
the lower-level GPU compiler.
}];
- // let useDefaultTypePrinterParser = true;
- // let useDefaultAttributePrinterParser = true;
+ let useDefaultTypePrinterParser = true;
+ let useDefaultAttributePrinterParser = true;
}
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUDIALECT_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 5825ef9195b03f..93c56ad05b432c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -9,10 +9,13 @@
#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
#define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
+include "mlir/IR/AttrTypeBase.td"
include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
-
+include "mlir/Interfaces/ShapedOpInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/ViewLikeInterface.td"
// Base class for dialect operations. This operation inherits from the base
// `Op` class in OpBase.td, and provides:
@@ -20,7 +23,305 @@ include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
// * The mnemonic for the operation, or the name without the dialect prefix.
// * A list of traits for the operation.
class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
- Op<XeGPU_Dialect, mnemonic, traits>;
+ Op<XeGPU_Dialect, mnemonic, traits> {
+
+  // C++ snippet shared by every XeGPU op that uses `prop-dict` in its
+  // assembly format: custom printer/parser hooks for the op properties.
+  code extraBaseClassDeclaration = [{
+    /// Prints the properties, when there are any, as `<` attribute `>`.
+    void printProperties(::mlir::MLIRContext *ctx,
+            ::mlir::OpAsmPrinter &p, const Properties &prop) {
+      Attribute propAttr = getPropertiesAsAttr(ctx, prop);
+      if (propAttr)
+        p << "<" << propAttr << ">";
+    }
+
+    /// Parses the optional `<` attribute `>` form written by printProperties.
+    /// The leading `<` is probed with parseOptionalLess so that an op written
+    /// without properties does not trigger an "expected '<'" diagnostic.
+    /// Once `<` has been consumed, the attribute and the closing `>` are
+    /// mandatory.
+    static ::mlir::ParseResult parseProperties(::mlir::OpAsmParser &parser,
+                                     ::mlir::OperationState &result) {
+      if (mlir::succeeded(parser.parseOptionalLess())) {
+        if (parser.parseAttribute(result.propertiesAttr) || parser.parseGreater())
+          return failure();
+      }
+      return success();
+    }
+
+  }];
+}
+
+
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface,
+ AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
+
+ let summary = "Create nd-tensor descriptor operation";
+ let description = [{
+ The "create_nd_tdesc" operation creates a TensorDescType which represents
+ a sub-view of a 2D memory region (it can be extended to support n-D memory
+ regions if needed in the future). Elements in the subview are contiguous in
+ each dimension. It encodes the following important information for supporting
+ Intel hardware features:
+
+ * source: an object representing (starting address/pointer of) a 2D memory region.
+ It can be either a 2D memref object, or simply a pointer represented by uint64_t type.
+ In the latter case, the shape and layout information of the 2D memory region should
+ be explicitly passed via the `dynamic_shape` and `dynamic_strides` parameters.
+ * offsets: two index values representing the offsets from the "source" in each dimension
+ at which the subview of the target memory will be created. They are encoded via two
+ variables, "dynamic_offsets" and "static_offsets", so that they can
+ accept various forms, such as operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]).
+ * shape: the shape information of the memory region pointed by the "source". It is
+ typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
+ But if "source" is simply a pointer represented as uint64_t type, or a memref
+ type without shape information e.g., memref<?x?xf16>, the shape information has
+ to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape"
+ only accepts operands(e.g., [%c4096, %c4096]), not attributes(e.g., [4096, 4096]).
+ * strides: the strides of the memory region pointed by the "source". Similar to shape,
+ it is typically encoded via the MemRefType of the source too. But if "source" is
+ simply a pointer represented as uint64_t type, or a memref type without shape
+ information e.g., memref<?x?xf16>, the strides information has to be explicitly
+ passed via the "dynamic_strides" argument. It, too, currently accepts only operands.
+
+ Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
+ %0 = memref.alloc() : memref<1024x1024xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.create_nd_tdesc %0[%c0, %c0]: memref<1024x1024xf32> -> TensorDesc<8x16xf32>
+
+ Example 2 (suppose the tensor shape inferred by the compiler is 8x16):
+ %0 = memref.alloc(%h, %w) : memref<?x?xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: memref<?x?xf32> -> TensorDesc<8x16xf32>
+
+ Example 3 (suppose the tensor shape inferred by the compiler is 8x16):
+ %0 = ... : ui64
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
+ }];
+
+ let arguments = (ins
+ XeGPU_BaseAddrType: $source,
+ Variadic<Index>: $offsets,
+ Variadic<Index>: $shape,
+ Variadic<Index>: $strides,
+ DenseI64ArrayAttr: $const_offsets,
+ OptionalAttr<DenseI64ArrayAttr>: $const_shape,
+ OptionalAttr<DenseI64ArrayAttr>: $const_strides
+ );
+ let results = (outs XeGPU_TensorDesc: $TensorDesc);
+
+ let assemblyFormat = [{
+ $source ``
+ custom<DynamicIndexList>($offsets, $const_offsets)
+ (`,` custom<DynamicIndexList>($shape, $const_shape)^
+ `,` custom<DynamicIndexList>($strides, $const_strides))?
+ attr-dict `:` type($source) `->` qualified(type($TensorDesc))
+ }];
+
+ let hasVerifier = 1;
+
+ let builders = [
+ OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source,
+ "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+
+ OpBuilder<(ins "Type": $tdesc, "TypedValue<IntegerType> ": $source,
+ "llvm::ArrayRef<OpFoldResult>": $offsets,
+ "llvm::ArrayRef<OpFoldResult>": $shape,
+ "llvm::ArrayRef<OpFoldResult>": $strides)>
+ ];
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ /// Returns the type of the source memref operand.
+ Type getSourceType() {
+ return getSource().getType();
+ }
+
+ /// Returns the type of the result TensorDesc.
+ xegpu::TensorDescType getType() {
+ return getTensorDesc().getType();
+ }
+
+ /// Return the element type of the TensorDesc
+ Type getElementType() {
+ return getType().getElementType();
+ }
+
+ /// Return the shape of the TensorDesc
+ llvm::ArrayRef<int64_t> getTensorDescShape() {
+ return getType().getShape();
+ }
+
+ /// wrapper for matching with OffsetSizeAndStrideOpInterface
+ OperandRange getSizes() {
+ return getShape();
+ }
+
+ ArrayRef<int64_t> getStaticOffsets(){
+ return getConstOffsets();
+ }
+
+ /// wrapper for matching with OffsetSizeAndStrideOpInterface
+ /// If source is IntegerType or `const_shape` is filled,
+ /// it will return `const_shape`, such that mixes of `shape`
+ /// and `const_shape` will be used to represent the shape of
+ /// source operand. They override the static shape from the source memref type.
+ ArrayRef<int64_t> getStaticSizes() {
+ auto attr = getConstShapeAttr();
+ if (getSourceType().isa<IntegerType>() || attr)
+ return attr;
+
+ auto memrefType = getSourceType().dyn_cast<MemRefType>();
+ assert(memrefType && "Incorrect use of getStaticSizes");
+ return memrefType.getShape();
+ }
+
+ /// wrapper for matching with OffsetSizeAndStrideOpInterface
+ /// If source is IntegerType or `const_strides` is filled, it
+ /// will return `const_strides`, such that mixes of `strides`
+ /// and `const_strides` will be used to represent the strides of
+ /// source operand. They override the static strides from the source memref type.
+ ArrayRef<int64_t> getStaticStrides() {
+ auto attr = getConstStridesAttr();
+ if (getSourceType().isa<IntegerType>() || attr)
+ return attr;
+
+ auto memrefType = getSourceType().dyn_cast<MemRefType>();
+ assert(memrefType && "Incorrect use of getStaticStrides");
+ auto [strides, offset] = getStridesAndOffset(memrefType);
+ // reuse the storage of ConstStridesAttr since the strides computed from
+ // the memref type are not persistent
+ setConstStrides(strides);
+ attr = getConstStridesAttr();
+ return attr;
+ }
+
+ /// Return the expected rank of each of the `static_offsets`,
+ /// `static_shape` and `static_strides` attributes.
+ std::array<unsigned, 3> getArrayAttrMaxRanks() {
+ unsigned rank;
+ if (auto ty = getSourceType().dyn_cast<MemRefType>()) {
+ rank = ty.getRank();
+ } else {
+ rank = (unsigned)getMixedOffsets().size();
+ }
+ return {rank, rank, rank};
+ }
+
+ /// Return the number of leading operands before the `offsets`,
+ /// `shape` and `strides` operands.
+ static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
+
+ mlir::Value getViewSource() { return getSource(); }
+ }];
+}
+
+def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
+ let summary = "prefetches a nD block to cache";
+ let description = [{
+ It issues an instruction to prefetch the data from memory to each
+ level of the cache based on their cache policy.
+
+ Example:
+ ```
+ xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<cached>,
+ l3_hint = #xegpu.cache_hint<cached>}
+ : !xegpu.tensor_desc<8x16xf16>
+ ```
+
+ }];
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let extraClassDeclaration = extraBaseClassDeclaration;
+
+ let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc))";
+}
+
+
+def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
+ let summary = "loads a n-D block from memory (represented by TensorDesc)"
+ "to registers (represented by vector)";
+ let description = [{
+ LoadNdOp essentially mimics the hardware block read instruction to read
+ a block of data from memory to register. It takes a set of optional cache
+ hints for each level of cache, L1, L2 and L3. If the hardware does not have a
+ corresponding cache, the corresponding cache hint attribute will be masked.
+ The vnni transform is a hardware feature of Intel GPUs, which is used to
+ do data packing during the load for the B operand of a matrix operation, if
+ the bit width of the data type is less than 32 bits, e.g., fp16. Transpose
+ is another Intel hardware feature, which performs a transpose
+ operation when loading the data if the data type is
+ fp32 or fp64. This implies that vnni and transpose cannot exist at the
+ same time.
+
+ Example:
+ ```
+ xegpu.load_nd %1 {transpose = [1, 0],
+ l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<streaming>}
+ : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+ ```
+
+
+ }];
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<I64Attr>: $vnni_axis,
+ OptionalAttr<DenseI64ArrayAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let results = (outs XeGPU_ValueType: $value);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ VectorType getType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
+ }
+
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
+
+ let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
+ let hasVerifier = 1;
+}
+
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
+ let summary = "stores a n-D block register region back to memory, currently only supports 2D";
+
+ let description = [{
+ StoreNdOp essentially mimics the hardware block write instruction to
+ write a block of data from registers into the memory region described
+ by the TensorDesc. It takes a set of optional cache hints for each level
+ of cache, L1, L2 and L3. If the hardware does not have a corresponding cache,
+ the corresponding cache hint attribute will be masked.
+
+ Example:
+ ```
+ xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+ ```
+
+
+ }];
+
+ let arguments = (ins XeGPU_ValueType: $value,
+ XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let extraClassDeclaration = extraBaseClassDeclaration;
+ let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
+ `:` type($value) `,` qualified(type($TensorDesc))}];
+ let hasVerifier = 1;
+}
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 1d75bb4e2906fe..19ac1693712dd8 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -9,9 +9,9 @@
#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
#define MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
-include "mlir/IR/BuiltinTypes.td"
include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/BuiltinTypes.td"
def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>;
def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
@@ -30,4 +30,106 @@ class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
let mnemonic = typeMnemonic;
}
+def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
+ [ShapedTypeInterface], "::mlir::TensorType"> {
+ let summary = "TensorDesc describing regions of interested data.";
+ let description = [{
+ TensorDesc is a type designed to describe regions of the data of interest as well as some
+ features that are unique to Intel hardware. Unlike the builtin tensor type in MLIR,
+ it essentially only contains the metadata, and doesn't hold the data itself. It is designed
+ to mainly support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPU.
+ It encodes the following information:
+
+ * shape: the sizes/shape of the data block of interest, e.g., 8x16 means 8 rows
+ and each row contains 16 contiguous data elements. The rows could be
+ either contiguous or not, depending on whether the encoding attribute
+ is set or not.
+ * element_type: the data type of the data element, e.g., f16, f32.
+
+ Similar to the builtin tensor, it also provides an optional attribute to encode
+ the following information via the TensorDescAttr object:
+ * memory_scope (xegpu::MemoryScope): [optional] where the data is located,
+ global memory or shared memory. It defaults to Global.
+ * array_length (int): [optional] The number of contiguous blocks with size as `shape`,
+ that will be loaded by block load at a time. It defaults to 1.
+ * boundary_check (bool): [optional] indicates whether the operation detects the boundary
+ and pads with zero for out-of-boundary access. Boundary checking is enabled by default.
+
+
+ Syntax:
+
+ ```
+ TensorDesc-type ::= `tensor_desc` `<` dim-list element-type (attr-list)? `>`
+ element-type ::= float-type | integer-type | index-type
+ dim-list := (static-dim-list `x`)?
+ static-dim-list ::= decimal-literal `x` decimal-literal
+ attr-list = (, memory_scope = value)? (, array_length = value)? (, boundary_check = value)?
+ ```
+
+ Examples:
+
+ ```mlir
+ // A block TensorDesc with 8x16 i32 elements
+ xegpu.tensor_desc<8x16xi32>
+
+ // A block TensorDesc with 8x16 f32 elements
+ xegpu.tensor_desc<8x16xf32>
+
+ // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
+ xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+ ```
+ }];
+
+ let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
+ "mlir::Type": $elementType,
+ OptionalParameter<"mlir::Attribute">: $encoding);
+
+ let extraClassDeclaration = [{
+ using TensorType::clone;
+ using mlir::ShapedType::Trait<TensorDescType>::getElementTypeBitWidth;
+ using mlir::ShapedType::Trait<TensorDescType>::getRank;
+ using mlir::ShapedType::Trait<TensorDescType>::getNumElements;
+ using mlir::ShapedType::Trait<TensorDescType>::isDynamicDim;
+ using mlir::ShapedType::Trait<TensorDescType>::hasStaticShape;
+ using mlir::ShapedType::Trait<TensorDescType>::getNumDynamicDims;
+ using mlir::ShapedType::Trait<TensorDescType>::getDimSize;
+ using mlir::ShapedType::Trait<TensorDescType>::getDynamicDimIndex;
+
+ TensorDescType clone(::mlir::Type elementType) {
+ return llvm::cast<TensorDescType>(cloneWith(getShape(), elementType));
+ }
+
+ TensorDescAttr getEncodingAsTensorDescAttr() const {
+ return llvm::dyn_cast_if_present<TensorDescAttr>(getEncoding());
+ }
+
+ xegpu::MemoryScope getMemoryScope() const {
+ auto attr = getEncodingAsTensorDescAttr();
+ if (attr && attr.getMemoryScope())
+ return attr.getMemoryScope().getValue();
+ // return default value
+ return MemoryScope::Global;
+ }
+
+ int getArrayLength() {
+ auto attr = getEncodingAsTensorDescAttr();
+ if (attr && attr.getArrayLength())
+ return attr.getArrayLength().getInt();
+ // return default value
+ return 1;
+ }
+
+ bool getBoundaryCheck() {
+ auto attr = getEncodingAsTensorDescAttr();
+ if (attr && attr.getBoundaryCheck())
+ return attr.getBoundaryCheck().getValue();
+ // return default value
+ return true;
+ }
+ }];
+
+ let hasCustomAssemblyFormat = true;
+
+}
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 4f839ee773476b..0b3f4b9c9dbeae 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -6,7 +6,10 @@
//
//===----------------------------------------------------------------------===//
-#include <mlir/Dialect/XeGPU/IR/XeGPU.h>
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/TypeSwitch.h"
namespace mlir {
namespace xegpu {
@@ -26,8 +29,72 @@ void XeGPUDialect::initialize() {
>();
}
-// this file is for position occupation,
-// we will add functions in following PRs.
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescAttr
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescType
+//===----------------------------------------------------------------------===//
+/// Parses the body of a TensorDesc type:
+///   `<` dimension-list element-type (`,` encoding-attribute)? `>`
+/// Returns a null Type on any syntactic failure.
+mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
+  // Opening '<'.
+  if (parser.parseLess())
+    return {};
+
+  // Shape dimensions.
+  llvm::SmallVector<int64_t> dims;
+  auto dimsLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseDimensionList(dims))) {
+    parser.emitError(dimsLoc, "failed to parse parameter 'shape'");
+    return {};
+  }
+
+  // Element type.
+  mlir::Type elemTy;
+  auto elemLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseType(elemTy))) {
+    parser.emitError(elemLoc, "failed to parse parameter 'elementType'");
+    return {};
+  }
+
+  // Optional trailing `, attribute`; stays null when it is absent.
+  mlir::Attribute encodingAttr;
+  if (mlir::succeeded(parser.parseOptionalComma())) {
+    mlir::FailureOr<mlir::Attribute> parsed =
+        mlir::FieldParser<mlir::Attribute>::parse(parser);
+    if (mlir::failed(parsed)) {
+      parser.emitError(
+          parser.getCurrentLocation(),
+          "Failed to parse the attribute field for TensorDescType.\n");
+      return {};
+    }
+    encodingAttr = *parsed;
+  }
+
+  // Closing '>'.
+  if (parser.parseGreater())
+    return {};
+
+  return TensorDescType::get(parser.getContext(), dims, elemTy, encodingAttr);
+}
+
+/// Prints the body of a TensorDesc type: `<`, every dimension followed by
+/// `x` (dynamic dimensions as `?`), the element type, an optional
+/// `, encoding`, and the closing `>`.
+void TensorDescType::print(::mlir::AsmPrinter &printer) const {
+  printer << "<";
+
+  for (int64_t extent : getShape()) {
+    if (!mlir::ShapedType::isDynamic(extent))
+      printer << extent;
+    else
+      printer << '?';
+    printer << 'x';
+  }
+
+  printer << getElementType();
+
+  // The encoding is printed only when one is attached to the type.
+  auto enc = getEncoding();
+  if (enc)
+    printer << ", " << enc;
+
+  printer << ">";
+}
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index b356c397fb8369..a0bed513567d29 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -6,15 +6,196 @@
//
//===----------------------------------------------------------------------===//
-#include <mlir/Dialect/XeGPU/IR/XeGPU.h>
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/IR/Builders.h"
#define DEBUG_TYPE "xegpu"
namespace mlir {
namespace xegpu {
-// this file is for position occupation,
-// we will add functions in following PRs.
+/// Permutes the leading `trans.size()` entries of `shape` in place so that
+/// entry i becomes the previous value of entry `trans[i]`. The caller must
+/// make sure every entry of `trans` is a valid index into `shape`.
+static void transpose(llvm::ArrayRef<int64_t> trans,
+                      std::vector<int64_t> &shape) {
+  // Snapshot the shape so reads are not affected by the writes below.
+  const std::vector<int64_t> original(shape);
+  size_t dst = 0;
+  for (int64_t src : trans)
+    shape[dst++] = original[src];
+}
+
+/// Formats an indexable container as "[a, b, c]" for diagnostics.
+///
+/// \param array     container providing size() and operator[].
+/// \param breakline when true, "\n\t\t" is emitted after every separator.
+/// \returns the formatted text; an empty container yields "[]" (the previous
+///          implementation called back() on it, which is undefined behaviour).
+template <typename T>
+static std::string makeString(T array, bool breakline = false) {
+  std::string buf;
+  llvm::raw_string_ostream os(buf);
+  os << "[";
+  for (size_t i = 0, e = array.size(); i < e; ++i) {
+    // The separator goes before every element except the first, which yields
+    // the same text as the original for non-empty input.
+    if (i != 0) {
+      os << ", ";
+      if (breakline)
+        os << "\n\t\t";
+    }
+    os << array[i];
+  }
+  os << "]";
+  os.flush();
+  return buf;
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_CreateNdDescOp
+//===----------------------------------------------------------------------===//
+// Builds a CreateNdDescOp from a memref source. The source type must have a
+// static shape and `offsets` must provide one entry per memref dimension
+// (checked by the assert). No shape or strides operands/attributes are
+// attached by this overload.
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+ Type tdesc, TypedValue<MemRefType> source,
+ llvm::ArrayRef<OpFoldResult> offsets) {
+ auto ty = source.getType();
+ assert(ty.hasStaticShape() && offsets.size() == (size_t)ty.getRank());
+
+ // Separate the mixed offsets into the Value operands and the constant
+ // entries expected by the builder invoked below.
+ llvm::SmallVector<int64_t> staticOffsets;
+ llvm::SmallVector<Value> dynamicOffsets;
+ dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+ build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */,
+ ValueRange({}) /* empty dynamic shape */,
+ ValueRange({}) /* empty dynamic strides */,
+ staticOffsets /* const offsets */, {} /* empty const shape*/,
+ {} /* empty const strides*/);
+}
+
+/// Builds a CreateNdDescOp whose source is an integer-typed value (a raw
+/// address). `offsets`, `shape` and `strides` must be non-empty and of equal
+/// length (checked by the assert). Each list is separated into its Value
+/// operands and its constant entries before being forwarded to the builder
+/// that takes them separately.
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type tdesc, TypedValue<IntegerType> source,
+                           llvm::ArrayRef<OpFoldResult> offsets,
+                           llvm::ArrayRef<OpFoldResult> shape,
+                           llvm::ArrayRef<OpFoldResult> strides) {
+  assert(shape.size() && offsets.size() && strides.size() &&
+         shape.size() == strides.size() && shape.size() == offsets.size());
+
+  llvm::SmallVector<int64_t> staticOffsets;
+  llvm::SmallVector<int64_t> staticShape;
+  llvm::SmallVector<int64_t> staticStrides;
+  llvm::SmallVector<Value> dynamicOffsets;
+  llvm::SmallVector<Value> dynamicShape;
+  llvm::SmallVector<Value> dynamicStrides;
+
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+  dispatchIndexOpFoldResults(shape, dynamicShape, staticShape);
+  // The constant stride entries must go to staticStrides. Writing them to
+  // staticOffsets (as before) appended them to the offsets and left the
+  // strides attribute empty.
+  dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides);
+
+  auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
+  auto staticShapeAttr = builder.getDenseI64ArrayAttr(staticShape);
+  auto staticStridesAttr = builder.getDenseI64ArrayAttr(staticStrides);
+
+  build(builder, state, tdesc, source, dynamicOffsets, dynamicShape,
+        dynamicStrides, staticOffsetsAttr, staticShapeAttr, staticStridesAttr);
+}
+
+/// Verifies a CreateNdDescOp:
+///  * the number of offsets, the rank of the source (when it is a memref)
+///    and the rank of the result TensorDesc must all be 2;
+///  * a memref source must have the same element type as the TensorDesc.
+LogicalResult CreateNdDescOp::verify() {
+  auto rank = (int64_t)getMixedOffsets().size();
+  bool invalidRank = (rank != 2);
+  bool invalidElemTy = false;
+
+  // check source type matches the rank if it is a memref.
+  // It also should have the same ElementType as TensorDesc.
+  auto memrefTy = getSourceType().dyn_cast<MemRefType>();
+  if (memrefTy) {
+    invalidRank |= (memrefTy.getRank() != rank);
+    invalidElemTy |= memrefTy.getElementType() != getElementType();
+  }
+
+  // check result type matches the rank. This must accumulate with `|=`:
+  // a plain assignment would discard the rank failures detected above.
+  invalidRank |= (getType().getRank() != rank);
+
+  // mismatches among shape, strides, and offsets are
+  // already handled by OffsetSizeAndStrideOpInterface.
+  // So they are not checked here.
+  if (invalidRank)
+    return emitOpError(
+        "Expecting the rank of shape, strides, offsets, "
+        "source memref type (if source is a memref) and TensorDesc "
+        "should match with each other. They currenlty are 2D.");
+
+  if (invalidElemTy)
+    return emitOpError("TensorDesc should have the same element "
+                       "type with the source if it is a memref.\n");
+
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_LoadNdOp
+//===----------------------------------------------------------------------===//
+/// Verifies a LoadNdOp:
+///  * the TensorDesc must be 2D and the result must be a VectorType with the
+///    same element type;
+///  * the result shape must equal the TensorDesc shape after applying, in
+///    order, the optional transpose, the optional vnni packing (the axis
+///    extent is divided by the innermost result dimension, which is then
+///    appended) and the array length (prepended when greater than 1).
+/// Attribute values that would index outside the shape are rejected instead
+/// of being used unchecked.
+LogicalResult LoadNdOp::verify() {
+  auto tdescTy = getTensorDescType();
+  auto valueTy = getType();
+
+  if (tdescTy.getRank() != 2)
+    return emitOpError(
+        "The TensorDesc for LoadNdOp should be a 2D TensorDesc.");
+
+  if (!valueTy)
+    return emitOpError("Invalid result, it should be a VectorType.\n");
+
+  auto tdescElemTy = tdescTy.getElementType();
+  auto valueElemTy = valueTy.getElementType();
+
+  if (tdescElemTy != valueElemTy)
+    return emitOpError(
+        "Value should have the same element type as TensorDesc.");
+
+  auto array_len = tdescTy.getArrayLength();
+  auto tdescShape = tdescTy.getShape().vec();
+  auto valueShape = valueTy.getShape().vec();
+
+  // A rank-0 result can never match a 2D descriptor; rejecting it here also
+  // keeps the back() calls below well defined.
+  if (valueShape.empty())
+    return emitOpError("Result vector should have at least one dimension.");
+
+  if (getTranspose()) {
+    auto trans = getTranspose().value();
+    if (tdescShape.size() >= trans.size()) {
+      // Every permutation entry is used as an index into the shape; the
+      // unsigned comparison also rejects negative entries.
+      for (int64_t dim : trans)
+        if (static_cast<uint64_t>(dim) >= tdescShape.size())
+          return emitOpError(
+              "Invalid transpose attr: dimension index is out of range.");
+      transpose(trans, tdescShape);
+    } else {
+      emitWarning("Invalid transpose attr. It is ignored.");
+    }
+  }
+
+  if (getVnniAxis()) {
+    auto axis = getVnniAxis().value();
+    if (static_cast<uint64_t>(axis) >= tdescShape.size())
+      return emitOpError("Invalid vnni_axis attr: axis is out of range.");
+    auto vnni_factor = valueShape.back();
+    // Guard the division below.
+    if (vnni_factor <= 0)
+      return emitOpError("Invalid vnni factor in the result shape.");
+    tdescShape[axis] /= vnni_factor;
+    tdescShape.push_back(vnni_factor);
+  }
+
+  if (array_len > 1) {
+    auto it = tdescShape.begin();
+    tdescShape.insert(it, array_len);
+  }
+
+  if (tdescShape != valueShape)
+    return emitOpError() << "Result shape doesn't match TensorDesc shape."
+                         << "The expected shape is " << makeString(tdescShape)
+                         << ". But the given shape is "
+                         << makeString(valueShape) << ".\n";
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_StoreNdOp
+//===----------------------------------------------------------------------===//
+/// Verifies a StoreNdOp: the destination TensorDesc must be 2D, the stored
+/// value must be a VectorType, and value and TensorDesc must agree on both
+/// element type and shape.
+LogicalResult StoreNdOp::verify() {
+  auto dstTy = getTensorDesc().getType(); // TensorDesc
+  // dyn_cast rather than cast: cast asserts on a non-vector type, which made
+  // the null check below unreachable.
+  auto valTy = llvm::dyn_cast<VectorType>(getValue().getType()); // Vector
+
+  if (dstTy.getRank() != 2)
+    return emitOpError("Expecting a 2D TensorDesc shape.\n");
+
+  if (!valTy)
+    return emitOpError("Exepcting a VectorType result.\n");
+
+  auto dstElemTy = dstTy.getElementType();
+  auto valElemTy = valTy.getElementType();
+
+  if (dstElemTy != valElemTy) {
+    return emitOpError() << "The element type of the value should "
+                            "match the elementtype of the TensorDesc.\n";
+  }
+
+  if (dstTy.getShape() != valTy.getShape())
+    return emitOpError()
+           << "The result shape should match the TensorDesc shape.\n";
+  return success();
+}
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
new file mode 100644
index 00000000000000..039346adbb851c
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
@@ -0,0 +1,62 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: gpu.module @test {
+gpu.module @test {
+// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
+gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+  // CHECK: %[[C:.*]] = arith.constant 1 : index
+  %c1 = arith.constant 1 : index
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr<array_length = 2 : i64>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr<array_length = 2>>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16>
+  xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, vnni_axis = 0 : i64}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+  %2 = xegpu.load_nd %1 <{vnni_axis = 0, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+       : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
+  // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16>
+  %1 = arith.constant dense<1.0>: vector<24x32xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+  gpu.return
+}
+
+}
\ No newline at end of file
More information about the Mlir-commits
mailing list