[Mlir-commits] [mlir] [MLIR][XeGPU] Add XeGPU scattered ops (PR #86594)
Chao Chen
llvmlistbot at llvm.org
Tue Mar 26 14:12:54 PDT 2024
https://github.com/chencha3 updated https://github.com/llvm/llvm-project/pull/86594
>From 0f86b93a11d05339ac6ebd44435f513fa0e519e0 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 25 Mar 2024 22:27:58 +0000
Subject: [PATCH 1/9] Add XeGPU scattered ops
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 1 +
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 18 +-
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 449 +++++++++++++++---
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 21 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 21 +
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 241 +++++++++-
mlir/test/Dialect/XeGPU/XeGPUOps.mlir | 62 +++
7 files changed, 723 insertions(+), 90 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 87aabdc015fea5..eca9255ff3974b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -12,6 +12,7 @@
#include "mlir/Bytecode/BytecodeOpInterface.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
+#include "mlir/IR/TypeUtilities.h"
#include "mlir/Interfaces/ShapedOpInterfaces.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Interfaces/ViewLikeInterface.h"
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index cd38549f1ccf43..5a05462b3579de 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -22,14 +22,16 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
let parameters = (ins
OptionalParameter<"MemoryScopeAttr">: $memory_scope,
OptionalParameter<"IntegerAttr", "1">: $array_length,
- OptionalParameter<"BoolAttr", "true">: $boundary_check
+ OptionalParameter<"BoolAttr", "true">: $boundary_check,
+ OptionalParameter<"BoolAttr", "false">: $scattered
);
let builders = [
AttrBuilder<(ins
CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
CArg<"int", "1">:$array_length,
- CArg<"bool", "true">: $boundary_check
+ CArg<"bool", "true">: $boundary_check,
+ CArg<"bool", "false">: $scattered
)>
];
@@ -41,14 +43,14 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
//===----------------------------------------------------------------------===//
def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
-def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
- "The address space of the memory the tensor descritor is created for",
+def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
+ "The address space of the memory the tensor descritor is created for",
[XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::xegpu";
}
-def XeGPU_MemoryScopeAttr:
+def XeGPU_MemoryScopeAttr:
EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
let assemblyFormat = "$value";
}
@@ -63,15 +65,15 @@ def XeGPU_CachePolicyInvalid: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_
def XeGPU_CachePolicyWriteBack: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only
def XeGPU_CachePolicyWriteThrough: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only
-def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy",
- [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached,
+def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy",
+ [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached,
XeGPU_CachePolicyStreaming, XeGPU_CachePolicyInvalid,
XeGPU_CachePolicyWriteBack, XeGPU_CachePolicyWriteThrough]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::xegpu";
}
-def XeGPU_CacheHintAttr
+def XeGPU_CacheHintAttr
: EnumAttr<XeGPU_Dialect, XeGPU_CachePolicyEnums, "cache_hint"> {
let assemblyFormat = "`<` $value `>`";
}
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 93c56ad05b432c..0380ff83581517 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -46,36 +46,35 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
}
-def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface,
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface,
AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
let summary = "Create nd-tensor descriptor operation";
let description = [{
The "create_nd_tdesc" operation creates a TensorDescType which represents
a sub-view of a 2D memory region (It can be extended to support n-D memory
- region if needed in future). Elements in the subview continuous in each
- dimention. It encodes the following important information for supporting
+ region if needed in future). Elements in the subview continuous in each
+ dimention. It encodes the following important information for supporting
Intel hardware features:
- * source: an object representing (starting address/pointer of) a 2D memory region.
+ * source: an object representing (starting address/pointer of) a 2D memory region.
It can be either a 2D memref object, or simply a pointer represented by uint64_t type.
- for the later case, the shape and layout information of the 2D memory region should
- be explicitly passed via `dynamic_shape` and `dynamic_strides` parameters.
- * offsets: two index values represents offsets from the "source" at the each dimension
+    for the latter case, the shape and layout information of the 2D memory region should
+ be explicitly passed via `shape` and `strides` parameters.
+  * offsets: two index values representing offsets from the "source" at each dimension
at which the subview of the target memory will be created. It is encoded via two
- variables, including "dynamic_offsets" and "static_offsets", such that it can
- accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4])).
- * shape: the shape information of the memory region pointed by the "source". It is
- typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
- But if "source" is simply a pointer represented as uint64_t type, or a memref
- type without shape information e.g., memref<?x?xf16>, the shape information has
- to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape"
- only accepts operands(e.g., [%c4096, %c4096]), not attributes(e.g., [4096, 4096]).
- * strides: the strides of the memory region pointed by the "source". Similar to shape,
- it is typically encoded via the MemRefType of the source too. But if "source" is
- simply a pointer represented as uint64_t type, or a memref type without shape
- information e.g., memref<?x?xf16>, the strides information has to be explicitly
- passed via the "dynamic_strides" argument. And it currently only accepts operands two.
+ variables, including "offsets" and "const_offsets", such that it can
+    accept various forms, such as operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]).
+  * shape: the shape information of the memory region pointed to by the "source". It is
+ typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
+ But if "source" is simply a pointer represented as uint64_t type, or a memref
+ type without shape information e.g., memref<?x?xf16>, the shape information has
+ to be explicitly passed via the "shape" and "const_shape" arguments.
+  * strides: the strides of the memory region pointed to by the "source". Similar to shape,
+    it is typically encoded via the MemRefType of the source too. But if "source" is
+    simply a pointer represented as uint64_t type, or a memref type without shape
+    information e.g., memref<?x?xf16>, the strides information has to be explicitly
+    passed via the "strides" and "const_strides" arguments.
Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
%0 = memref.alloc() : memref<1024x1024xf32>
@@ -96,10 +95,10 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
%1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
}];
- let arguments = (ins
- XeGPU_BaseAddrType: $source,
- Variadic<Index>: $offsets,
- Variadic<Index>: $shape,
+ let arguments = (ins
+ XeGPU_BaseAddrType: $source,
+ Variadic<Index>: $offsets,
+ Variadic<Index>: $shape,
Variadic<Index>: $strides,
DenseI64ArrayAttr: $const_offsets,
OptionalAttr<DenseI64ArrayAttr>: $const_shape,
@@ -118,12 +117,12 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
let hasVerifier = 1;
let builders = [
- OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source,
+ OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source,
"llvm::ArrayRef<OpFoldResult>": $offsets)>,
- OpBuilder<(ins "Type": $tdesc, "TypedValue<IntegerType> ": $source,
+ OpBuilder<(ins "Type": $tdesc, "TypedValue<IntegerType> ": $source,
"llvm::ArrayRef<OpFoldResult>": $offsets,
- "llvm::ArrayRef<OpFoldResult>": $shape,
+ "llvm::ArrayRef<OpFoldResult>": $shape,
"llvm::ArrayRef<OpFoldResult>": $strides)>
];
@@ -158,41 +157,41 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
}
/// wrapper for matching with OffsetSizeAndStrideOpInterface
- /// If source is IntegerType or `const_shape` is filled,
+ /// If source is IntegerType or `const_shape` is filled,
/// it will return `const_shape`, such that mixes of `shape`
- /// and `const_shape` will be used to represent the shape of
+ /// and `const_shape` will be used to represent the shape of
    /// source operand. They override static shape from source memref type.
ArrayRef<int64_t> getStaticSizes() {
auto attr = getConstShapeAttr();
if (getSourceType().isa<IntegerType>() || attr)
return attr;
-
+
auto memrefType = getSourceType().dyn_cast<MemRefType>();
assert(memrefType && "Incorrect use of getStaticSizes");
return memrefType.getShape();
}
/// wrapper for matching with OffsetSizeAndStrideOpInterface
- /// If source is IntegerType or `const_strides` is filled, it
+ /// If source is IntegerType or `const_strides` is filled, it
/// will return `const_strides`, such that mixes of `strides`
- /// and `const_strides` will be used to represent the strides of
+ /// and `const_strides` will be used to represent the strides of
    /// source operand. They override static strides from source memref type.
ArrayRef<int64_t> getStaticStrides() {
auto attr = getConstStridesAttr();
if (getSourceType().isa<IntegerType>() || attr)
return attr;
-
+
auto memrefType = getSourceType().dyn_cast<MemRefType>();
assert(memrefType && "Incorrect use of getStaticStrides");
auto [strides, offset] = getStridesAndOffset(memrefType);
- // reuse the storage of ConstStridesAttr since strides from
+ // reuse the storage of ConstStridesAttr since strides from
      // memref is not persistent
setConstStrides(strides);
attr = getConstStridesAttr();
return attr;
}
- /// Return the expected rank of each of the`static_offsets`,
+  /// Return the expected rank of each of the `static_offsets`,
/// `static_shape` and `static_strides` attributes.
std::array<unsigned, 3> getArrayAttrMaxRanks() {
unsigned rank;
@@ -203,8 +202,8 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
}
return {rank, rank, rank};
}
-
- /// Return the number of leading operands before the `offsets`,
+
+ /// Return the number of leading operands before the `offsets`,
/// `shape` and `strides` operands.
static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
@@ -213,15 +212,15 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
}
def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
- let summary = "prefetches a nD block to cache";
+ let summary = "prefetches a n-D block to cache";
let description = [{
- It issues an instruction to prefetch the data from memory to each
- level of the cache based on their cache policy.
+ It issues an instruction to prefetch a block of data from continuous
+ memory regions to each level of the cache based on their cache policy.
Example:
```
- xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<cached>,
+ xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<cached>,
l3_hint = #xegpu.cache_hint<cached>}
: !xegpu.tensor_desc<8x16xf16>
```
@@ -232,34 +231,41 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-
- let extraClassDeclaration = extraBaseClassDeclaration;
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc))";
+
+ let hasVerifier = 1;
}
-def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
- let summary = "loads a n-D block from memory (represented by TensorDesc)"
+def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "TensorDesc"]>,
+ AllElementCountsMatch<["value", "TensorDesc"]>]> {
+ let summary = "loads a n-D block from memory (represented by TensorDesc)"
"to registers (represented by vector)";
let description = [{
- LoadNdOp essentially mimics the hardware block read instruction to read
- a block of data from memory to register. It takes a set of optional cache
- hints for each level of cache, L1, L2 and L3. If hardware does not have a
+ LoadNdOp essentially mimics the hardware block read instruction to read
+ a block of data from memory to register. It takes a set of optional cache
+ hints for each level of cache, L1, L2 and L3. If hardware does not have a
    corresponding cache, the corresponding cache hint attribute will be masked.
- vnni transform is an hardware feature for Intel GPU, which is used to
- do data packing during the load for B operand of matrix operation, if
- the bit width of the data type is less then 32 bits, e.g., fp16. And
+    vnni transform is a hardware feature for Intel GPUs, which is used to
+    do data packing during the load for the B operand of a matrix operation, if
+    the bit width of the data type is less than 32 bits, e.g., fp16. And
transpose is another Intel hardware feature, which will do transpose
- operation when loading the data if the bit width of the data type is
- fp32 or fp64. It implies that vnni and transpose cannot exit at the
+ operation when loading the data if the bit width of the data type is
+    fp32 or fp64. It implies that vnni and transpose cannot exist at the
same time.
Example:
```
xegpu.load_nd %1 {transpose = [1, 0],
- l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
+ l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
l3_hint = #xegpu.cache_hint<streaming>}
: !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
```
@@ -290,20 +296,21 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
let hasVerifier = 1;
}
-def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllShapesMatch<["value", "TensorDesc"]>,
+ AllElementTypesMatch<["value", "TensorDesc"]>]> {
let summary = "stores a n-D block register region back to memory, currently only supports 2D";
let description = [{
    StoreNdOp essentially mimics the hardware block write instruction to
- write a block of data from register into the memory region as described
- by the TensorDesc. It takes a set of optional cache hints for each level
- of cache, L1, L2 and L3. If hardware does not have a correspoding cache,
+ write a block of data from register into the memory region as described
+ by the TensorDesc. It takes a set of optional cache hints for each level
+    of cache, L1, L2 and L3. If hardware does not have a corresponding cache,
    the corresponding cache hint attribute will be masked.
Example:
```
xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
+ l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
```
@@ -317,11 +324,327 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
- let extraClassDeclaration = extraBaseClassDeclaration;
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ VectorType getValueType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
+ }
+
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
- let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
+ let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
`:` type($value) `,` qualified(type($TensorDesc))}];
let hasVerifier = 1;
}
+def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
+ [AllTypesMatch<["TensorDesc", "result"]>]> {
+ let summary = "It updates the offsets for the TensorDesc.";
+ let description = [{The op updates the offset of the given TensorDesc.
+    The offsets are relative to the current position, expressed in number
+    of elements. It results in a TensorDesc of the same type as the input.
+
+    Example:
+ ```
+ %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
+ ```
+ }];
+
+ let arguments = (ins
+ XeGPU_TensorDesc: $TensorDesc,
+ Variadic<Index>: $offsets,
+ DenseI64ArrayAttr: $const_offsets);
+
+ let results = (outs XeGPU_TensorDesc: $result);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+
+ SmallVector<OpFoldResult> getMixedOffsets() {
+ Builder b(getContext());
+ return getMixedValues(getConstOffsets(), getOffsets(), b);
+ }
+
+ size_t getNumOffsets() {
+ return getMixedOffsets().size();
+ }
+
+ OpFoldResult getOffset(unsigned idx) {
+ assert(idx < getNumOffsets() && "Invalid out of bound access.");
+ return getMixedOffsets()[idx];
+ }
+ }];
+
+ let assemblyFormat = [{
+ $TensorDesc `,`
+ custom<DynamicIndexList>($offsets, $const_offsets)
+ attr-dict `:` qualified(type($result))
+ }];
+
+ let hasVerifier = 1;
+}
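
As a usage sketch (not part of the patch; `%src` and the concrete shapes are illustrative), `update_nd_offset` is typically used to advance a descriptor between block loads:

```
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
%v0 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
// shift by 16 elements along dim-1; the descriptor type is unchanged
%1 = xegpu.update_nd_offset %0, [0, 16] : !xegpu.tensor_desc<8x16xf32>
%v1 = xegpu.load_nd %1 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
```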
+
+def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
+ let summary = "create scattered tensor descriptors (TensorDesc).";
+ let description = [{
+ "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates
+ a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc"
+ is for creating continious subviews, "create_tdesc" is for creating non-continious
+ (scattered) subviews, allowing each work-item in a subgroup specifying their own offset.
+ It accepts the following parameters:
+
+    * source: a 1D memref or pointer (uint64_t) representing the flattened memory object.
+    * offsets: an array containing offsets of each access point. Its size
+      is fixed to the hardware-supported subgroup size, e.g., 16 on PVC,
+      implying each element in the array corresponds to a work-item (SIMT lane)
+      in the subgroup.
+    * chunk_size: [optional attribute] indicates the number of continuous
+      elements accessed for each offset, default is 1.
+
+ Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
+ %a = memref.alloc() : memref<1024xf32>
+ %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32>
+
+    Example 2. It assumes subgroup size is 4, and each work-item accesses 8 elements.
+      In total it accesses 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
+ %0 = memref.alloc() : memref<1024xf32>
+ %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
+ }];
+
+ let arguments = (ins XeGPU_BaseAddrType: $source,
+ Variadic<Index>: $offsets,
+ DenseI64ArrayAttr: $const_offsets,
+ DefaultValuedAttr<I64Attr, "1">: $chunk_size);
+ let results = (outs XeGPU_TensorDesc:$TensorDesc);
+
+ let builders = [
+ OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source,
+ "llvm::ArrayRef<OpFoldResult>": $offsets,
+ CArg<"uint32_t", "1"> : $chunk_size)>,
+ ];
+
+ let assemblyFormat = [{
+ $source
+ custom<DynamicIndexList>($offsets, $const_offsets)
+ attr-dict `:` type($source) `->` qualified(type($TensorDesc))
+ }];
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+
+ SmallVector<OpFoldResult> getMixedOffsets() {
+ Builder b(getContext());
+ return getMixedValues(getConstOffsets(), getOffsets(), b);
+ }
+
+ size_t getNumOffsets() {
+ return getMixedOffsets().size();
+ }
+
+ mlir::Value getViewSource() { return getSource(); }
+
+ OpFoldResult getOffset(unsigned idx) {
+ assert(idx < getNumOffsets() && "Invalid out of bound access.");
+ return getMixedOffsets()[idx];
+ }
+ }];
+
+ let hasVerifier = 1;
+}
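
The two examples above, written out in the concrete syntax used by the tests in this patch (a sketch; note the verifier requires the `scattered = true` encoding on the result type):

```
// one element per work-item (subgroup size 4)
%t0 = xegpu.create_tdesc %a [0, 16, 32, 64]
    : memref<1024xf32> -> !xegpu.tensor_desc<4xf32, #xegpu.tdesc_attr<scattered = true>>
// 8 continuous elements per work-item, giving a 4x8 descriptor
%t1 = xegpu.create_tdesc %a [0, 16, 32, 64] {chunk_size = 8}
    : memref<1024xf32> -> !xegpu.tensor_desc<4x8xf32, #xegpu.tdesc_attr<scattered = true>>
```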
+
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
+ let summary = "prefetches a set of scattered data points to cache";
+
+ let description = [{
+ It issues instructions to prefetch a set of scattered data points
+ from memory to each level of the cache based on their cache policy.
+    Unlike prefetch_nd, which works on a non-scattered TensorDesc,
+    it works on a scattered TensorDesc instead.
+
+ Example:
+ ```
+ xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<cached>,
+ l3_hint = #xegpu.cache_hint<cached>}
+ : !xegpu.tensor_desc<16xf16>
+ ```
+
+ }];
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
+
+ let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc))";
+
+ let hasVerifier = 1;
+}
+
+def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]>,
+ AllElementTypesMatch<["value", "TensorDesc"]>,
+ AllElementCountsMatch<["value", "TensorDesc"]>]> {
+ let summary = "load a set of scattered data points from memory.";
+
+  let description = [{ It (aka. load) loads data per work-item. The output
+    describes the data being loaded at the subgroup level, so its size is
+    consistent with the number of work-items in a subgroup. When the `chunk_size`
+    attribute of the TensorDesc is larger than 1, the output will be a 2D vector,
+    with dim-1 corresponding to the chunk size.
+
+    The mask operand masks out memory accesses so that it is safe to pass out-of-bounds
+    addresses/offsets as long as they are masked. One mask bit applies per SIMD lane.
+
+ Example:
+ ```
+ %2 = xegpu.load %1, %0 {transpose = [1, 0],
+ l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<uncached>}
+ : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1>
+ -> vector<16xf32>
+ ```
+
+ }];
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_MaskType: $mask,
+ OptionalAttr<DenseI64ArrayAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ let results = (outs XeGPU_ValueType: $value);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+
+ mlir::Type getElementType() {
+ auto type = getValue().getType();
+ return getElementTypeOrSelf(type);
+ }
+
+ Type getValueType() {
+ return getValue().getType();
+ }
+
+ Type getMaskType() {
+ return getMask().getType();
+ }
+
+ }];
+
+ let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict
+ `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}];
+
+ let hasVerifier = 1;
+}
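
A sketch of a masked gather using the ops above, mirroring the tests added in this patch (`%src` and `%mask` are illustrative); lanes whose mask bit is 0 perform no memory access:

```
%mask = arith.constant dense<1> : vector<4xi1>
%td = xegpu.create_tdesc %src [0, 8, 16, 24] {chunk_size = 2}
    : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
// dim-0 of the result equals the number of lanes, dim-1 the chunk size
%val = xegpu.load %td, %mask
    : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>, vector<4xi1> -> vector<4x2xf32>
```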
+
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>,
+ AllElementTypesMatch<["value", "TensorDesc"]>]> {
+ let summary = "store data to scattered memory locations.";
+ let description = [{ It (aka. store) stores data to scattered memory locations.
+    Its semantics are similar to `load_gather`.
+
+ Example:
+ ```
+ %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1>
+ ```
+ }];
+
+ let arguments = (ins
+ XeGPU_ValueType: $value,
+ XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_MaskType: $mask,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+
+ Type getValueType() {
+ return getValue().getType();
+ }
+
+ Type getMaskType() {
+ return getMask().getType();
+ }
+ }];
+
+ let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict
+ `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}];
+
+ let hasVerifier = 1;
+}
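
And the matching scatter store, using the same mask and descriptor shapes as the gather sketch above (`%base` is illustrative):

```
%ones = arith.constant dense<1> : vector<4xi1>
%data = arith.constant dense<2.9> : vector<4x2xf32>
%dst = xegpu.create_tdesc %base [0, 8, 16, 24] {chunk_size = 2}
    : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
xegpu.store %data, %dst, %ones
    : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>, vector<4xi1>
```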
+
+def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
+ [AllTypesMatch<["TensorDesc", "result"]>]> {
+ let summary = "It updates the offsets for the given tensor descriptor";
+
+  let description = [{It behaves similarly to `update_nd_offset` in that
+    it updates the offset of a TensorDesc; the offsets are relative to
+    the current position, expressed in number of elements. However, `update_nd_offset`
+    updates the start point of a 2D block, so its offset contains two
+    elements representing the shift in each dimension. `update_offset`
+    updates the offset per work-item, so its offsets contain one value per
+    work-item, each representing that lane's shift.
+
+ Example:
+ ```
+ %2 = xegpu.update_offset %1, [32, 32, 32, 32]
+ : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ ```
+ }];
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ Variadic<Index>: $offsets,
+ DenseI64ArrayAttr: $const_offsets);
+ let results = (outs XeGPU_TensorDesc: $result);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+
+ SmallVector<OpFoldResult> getMixedOffsets() {
+ Builder b(getContext());
+ return getMixedValues(getConstOffsets(), getOffsets(), b);
+ }
+
+ size_t getNumOffsets() {
+ return getMixedOffsets().size();
+ }
+
+ OpFoldResult getOffset(unsigned idx) {
+ assert(idx < getNumOffsets() && "Invalid out of bound access.");
+ return getMixedOffsets()[idx];
+ }
+ }];
+
+ let assemblyFormat = [{
+ $TensorDesc `,`
+ custom<DynamicIndexList>($offsets, $const_offsets)
+ attr-dict `:` qualified(type($TensorDesc))
+ }];
+}
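
A sketch of advancing a scattered descriptor (mirroring the test added in this patch); with four work-items, each of the four offset values shifts one lane forward by 32 elements:

```
%td = xegpu.create_tdesc %src [0, 8, 16, 24] {chunk_size = 2}
    : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
%td2 = xegpu.update_offset %td, [32, 32, 32, 32]
    : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
```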
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 19ac1693712dd8..0c62e513bee4f3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -63,7 +63,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
element-type ::= float-type | integer-type | index-type
dim-list := (static-dim-list `x`)?
static-dim-list ::= decimal-literal `x` decimal-literal
- attr-list = (, memory_scope = value)? (, arr_len = value)? (, boundary_check = value)?
+ attr-list = (, memory_scope = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)?
```
Examples:
@@ -84,6 +84,17 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
"mlir::Type": $elementType,
OptionalParameter<"mlir::Attribute">: $encoding);
+ let builders = [
+ TypeBuilder<(ins
+ "llvm::ArrayRef<int64_t>": $shape,
+ "mlir::Type": $elementType,
+ CArg<"bool", "false">: $scattered,
+ CArg<"int", "1">: $array_length,
+ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
+ CArg<"bool", "true">: $boundary_check
+ )>
+ ];
+
let extraClassDeclaration = [{
using TensorType::clone;
using mlir::ShapedType::Trait<TensorDescType>::getElementTypeBitWidth;
@@ -126,6 +137,14 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
// return default value
return true;
}
+
+ bool getScattered() {
+ auto attr = getEncodingAsTensorDescAttr();
+ if (attr && attr.getScattered())
+ return attr.getScattered().getValue();
+ // return default value
+ return false;
+ }
}];
let hasCustomAssemblyFormat = true;
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 0b3f4b9c9dbeae..858cda32013eae 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -32,6 +32,17 @@ void XeGPUDialect::initialize() {
//===----------------------------------------------------------------------===//
// XeGPU_TensorDescAttr
//===----------------------------------------------------------------------===//
+TensorDescAttr TensorDescAttr::get(mlir::MLIRContext *context,
+ xegpu::MemoryScope memory_scope,
+ int array_length, bool boundary_check,
+ bool scattered) {
+ auto scopeAttr = MemoryScopeAttr::get(context, memory_scope);
+ auto lengthAttr =
+ IntegerAttr::get(IntegerType::get(context, 64), array_length);
+ auto boundaryAttr = BoolAttr::get(context, boundary_check);
+ auto scatteredAttr = BoolAttr::get(context, scattered);
+ return Base::get(context, scopeAttr, lengthAttr, boundaryAttr, scatteredAttr);
+}
//===----------------------------------------------------------------------===//
// XeGPU_TensorDescType
@@ -96,6 +107,16 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const {
printer << ">";
}
+TensorDescType TensorDescType::get(mlir::MLIRContext *context,
+ llvm::ArrayRef<int64_t> shape,
+ mlir::Type elementType, bool scattered,
+ int array_length, MemoryScope memory_scope,
+ bool boundary_check) {
+ auto attr = TensorDescAttr::get(context, memory_scope, array_length,
+ boundary_check, scattered);
+ return Base::get(context, shape, elementType, attr);
+}
+
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 02106f221f3233..4efa46642aa78f 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -9,6 +9,9 @@
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
+#include "mlir/IR/TypeUtilities.h"
+
+#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "xegpu"
@@ -38,6 +41,38 @@ static std::string makeString(T array, bool breakline = false) {
return buf;
}
+static std::vector<int64_t> getShapeOf(Type type) {
+ std::vector<int64_t> shape;
+ if (auto ty = llvm::dyn_cast<ShapedType>(type))
+ shape = ty.getShape().vec();
+ else
+ shape.push_back(1);
+ return shape;
+}
+
+static int64_t getRankOf(Value val) {
+ auto type = val.getType();
+ if (auto ty = llvm::dyn_cast<ShapedType>(type))
+ return ty.getRank();
+ return (int64_t)0;
+};
+
+static bool isReadHintOrNone(const CachePolicyAttr &attr) {
+ if (!attr)
+ return true;
+ auto kind = attr.getValue();
+ return kind == CachePolicy::CACHED || kind == CachePolicy::UNCACHED ||
+ kind == CachePolicy::STREAMING || kind == CachePolicy::READ_INVALIDATE;
+}
+
+static bool isWriteHintOrNone(const CachePolicyAttr &attr) {
+ if (!attr)
+ return true;
+ auto kind = attr.getValue();
+ return kind == CachePolicy::CACHED || kind == CachePolicy::UNCACHED ||
+ kind == CachePolicy::WRITE_BACK || kind == CachePolicy::WRITE_THROUGH;
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_CreateNdDescOp
//===----------------------------------------------------------------------===//
@@ -114,6 +149,29 @@ LogicalResult CreateNdDescOp::verify() {
return emitOpError("TensorDesc should have the same element "
"type with the source if it is a memref.\n");
+ if (getType().getScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
+
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_PrefetchNdOp
+//===----------------------------------------------------------------------===//
+LogicalResult PrefetchNdOp::verify() {
+ auto tdescTy = getTensorDescType();
+ if (tdescTy.getScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
+
+ if (!isReadHintOrNone(getL1HintAttr()))
+ return emitOpError("invlid l1_hint: ") << getL1HintAttr();
+
+ if (!isReadHintOrNone(getL2HintAttr()))
+ return emitOpError("invlid l2_hint: ") << getL2HintAttr();
+
+ if (!isReadHintOrNone(getL3HintAttr()))
+ return emitOpError("invlid l3_hint: ") << getL3HintAttr();
+
return success();
}
@@ -125,18 +183,22 @@ LogicalResult LoadNdOp::verify() {
auto valueTy = getType();
if (tdescTy.getRank() != 2)
- return emitOpError(
- "The TensorDesc for LoadNdOp should be a 2D TensorDesc.");
+ return emitOpError("Expecting a 2D TensorDesc.\n");
+
+ if (tdescTy.getScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
if (!valueTy)
return emitOpError("Invalid result, it should be a VectorType.\n");
- auto tdescElemTy = tdescTy.getElementType();
- auto valueElemTy = valueTy.getElementType();
+ if (!isReadHintOrNone(getL1HintAttr()))
+ return emitOpError("invlid l1_hint: ") << getL1HintAttr();
- if (tdescElemTy != valueElemTy)
- return emitOpError(
- "Value should have the same element type as TensorDesc.");
+ if (!isReadHintOrNone(getL2HintAttr()))
+ return emitOpError("invlid l2_hint: ") << getL2HintAttr();
+
+ if (!isReadHintOrNone(getL3HintAttr()))
+ return emitOpError("invlid l3_hint: ") << getL3HintAttr();
auto array_len = tdescTy.getArrayLength();
auto tdescShape = tdescTy.getShape().vec();
@@ -174,26 +236,169 @@ LogicalResult LoadNdOp::verify() {
// XeGPU_StoreNdOp
//===----------------------------------------------------------------------===//
LogicalResult StoreNdOp::verify() {
- auto dstTy = getTensorDesc().getType(); // Tile
- auto valTy = getValue().getType().cast<VectorType>(); // Vector
+ auto dstTy = getTensorDescType(); // Tile
+ auto valTy = getValueType(); // Vector
if (dstTy.getRank() != 2)
- return emitOpError("Expecting a 2D TensorDesc shape.\n");
+ return emitOpError("Expecting a 2D TensorDesc.\n");
+
+ if (dstTy.getScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
if (!valTy)
return emitOpError("Exepcting a VectorType result.\n");
- auto dstElemTy = dstTy.getElementType();
- auto valElemTy = valTy.getElementType();
+ if (!isWriteHintOrNone(getL1HintAttr()))
+ return emitOpError("invlid l1_hint: ") << getL1HintAttr();
+
+ if (!isWriteHintOrNone(getL2HintAttr()))
+ return emitOpError("invlid l2_hint: ") << getL2HintAttr();
+
+ if (!isWriteHintOrNone(getL3HintAttr()))
+ return emitOpError("invlid l3_hint: ") << getL3HintAttr();
+
+ return success();
+}
- if (dstElemTy != valElemTy) {
- return emitOpError() << "The element type of the value should "
- "match the elementtype of the TensorDesc.\n";
+//===----------------------------------------------------------------------===//
+// XeGPU_UpdateNDOffsetOp
+//===----------------------------------------------------------------------===//
+LogicalResult UpdateNdOffsetOp::verify() {
+ // number of offsets specified must match the rank of the tensor descriptor
+ if (getTensorDescType().getRank() != (int64_t)getNumOffsets()) {
+ return emitOpError("Invalid number of offsets.");
}
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_CreateDescOp
+//===----------------------------------------------------------------------===//
+void CreateDescOp::build(OpBuilder &builder, OperationState &state,
+ TensorDescType TensorDesc, Value source,
+ llvm::ArrayRef<OpFoldResult> offsets,
+ uint32_t chunk_size) {
+ llvm::SmallVector<int64_t> staticOffsets;
+ llvm::SmallVector<Value> dynamicOffsets;
+ dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+ build(builder, state, TensorDesc, source, dynamicOffsets, staticOffsets,
+ chunk_size);
+}
+
+LogicalResult CreateDescOp::verify() {
+ auto tdescTy = getTensorDescType();
+ auto chunkSize = getChunkSize();
+
+ if (getRankOf(getSource()) > 2)
+ return emitOpError(
+ "Expecting the source is a 1D memref or pointer (uint64_t).");
+
+ if (!tdescTy.getScattered())
+ return emitOpError("Expects a scattered TensorDesc.\n");
+
+ std::vector<int64_t> shape({(int64_t)getNumOffsets()});
+ if (chunkSize != 1)
+ shape.push_back(chunkSize);
+
+ auto tdescShape = tdescTy.getShape();
+ if (shape != tdescShape.vec())
+ return emitOpError("Expecting the size of offsets matchs TensorDesc[0].");
+
+ return success();
+}
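
Concretely, the shape check above requires dim-0 of the TensorDesc to equal the number of offsets, and dim-1 (when present) to equal `chunk_size`. A sketch of one accepted and one rejected case (`%a` is illustrative):

```
// OK: 4 offsets with chunk_size 8 -> 4x8 descriptor
%ok = xegpu.create_tdesc %a [0, 16, 32, 64] {chunk_size = 8}
    : memref<1024xf32> -> !xegpu.tensor_desc<4x8xf32, #xegpu.tdesc_attr<scattered = true>>
// rejected by the verifier: dim-0 (8) does not match the 4 offsets
// %bad = xegpu.create_tdesc %a [0, 16, 32, 64] {chunk_size = 8}
//     : memref<1024xf32> -> !xegpu.tensor_desc<8x4xf32, #xegpu.tdesc_attr<scattered = true>>
```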
+
+//===----------------------------------------------------------------------===//
+// XeGPU_PrefetchOp
+//===----------------------------------------------------------------------===//
+LogicalResult PrefetchOp::verify() {
+ auto tdescTy = getTensorDescType();
+ if (!tdescTy.getScattered())
+ return emitOpError("Expects a scattered TensorDesc.\n");
+
+ if (!isReadHintOrNone(getL1HintAttr()))
+ return emitOpError("invlid l1_hint: ") << getL1HintAttr();
+
+ if (!isReadHintOrNone(getL2HintAttr()))
+ return emitOpError("invlid l2_hint: ") << getL2HintAttr();
+
+ if (!isReadHintOrNone(getL3HintAttr()))
+ return emitOpError("invlid l3_hint: ") << getL3HintAttr();
+
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_LoadGatherOp
+//===----------------------------------------------------------------------===//
+LogicalResult LoadGatherOp::verify() {
+ auto tdescTy = getTensorDescType();
+ auto maskTy = getMaskType();
+ auto valueTy = getValueType();
+
+ if (!tdescTy.getScattered())
+ return emitOpError("Expects a scattered TensorDesc.\n");
+
+ if (!isReadHintOrNone(getL1HintAttr()))
+ return emitOpError("invlid l1_hint: ") << getL1HintAttr();
+
+ if (!isReadHintOrNone(getL2HintAttr()))
+ return emitOpError("invlid l2_hint: ") << getL2HintAttr();
+
+ if (!isReadHintOrNone(getL3HintAttr()))
+ return emitOpError("invlid l3_hint: ") << getL3HintAttr();
+
+ auto tdescElemTy = tdescTy.getElementType();
+ auto valueElemTy = getElementType();
+ if (tdescElemTy != valueElemTy)
+ return emitOpError(
+ "Value should have the same element type as TensorDesc.");
+
+ std::vector<int64_t> maskShape = getShapeOf(maskTy);
+ std::vector<int64_t> valueShape = getShapeOf(valueTy);
+ std::vector<int64_t> tdescShape = getShapeOf(tdescTy);
+
+ if (tdescShape[0] != maskShape[0])
+ return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
+
+ if (getTransposeAttr()) {
+ auto trans = getTranspose().value();
+ if (tdescShape.size() < trans.size())
+ emitWarning("Invalid transpose attr. It is ignored.");
+ else
+ transpose(trans, tdescShape);
+ }
+
+ if (valueShape != tdescShape)
+ return emitOpError("Unexpected result shape")
+ << "(Expected shape: " << makeString(tdescShape)
+ << ", Given shape: " << makeString(valueShape) << ").\n";
+
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_StoreScatterOp
+//===----------------------------------------------------------------------===//
+LogicalResult StoreScatterOp::verify() {
+ auto tdescTy = getTensorDescType();
+ if (!tdescTy.getScattered())
+ return emitOpError("Expects a scattered TensorDesc.\n");
+
+ if (!isWriteHintOrNone(getL1HintAttr()))
+ return emitOpError("invlid l1_hint: ") << getL1HintAttr();
+
+ if (!isWriteHintOrNone(getL2HintAttr()))
+ return emitOpError("invlid l2_hint: ") << getL2HintAttr();
+
+ if (!isWriteHintOrNone(getL3HintAttr()))
+ return emitOpError("invlid l3_hint: ") << getL3HintAttr();
+
+ auto maskTy = getMaskType();
+ std::vector<int64_t> maskShape = getShapeOf(maskTy);
+ std::vector<int64_t> tdescShape = getShapeOf(tdescTy);
+ if (tdescShape[0] != maskShape[0])
+ return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
- if (dstTy.getShape() != valTy.getShape())
- return emitOpError()
- << "The result shape should match the TensorDesc shape.\n";
return success();
}
diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
index 039346adbb851c..f0945c79a94ac3 100644
--- a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
+++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
@@ -59,4 +59,66 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
gpu.return
}
+// CHECK: gpu.func @test_create_update_nd_tdesc_vc(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32>
+ %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_tdesc_vc(%[[arg0:.*]]: ui64) {
+gpu.func @test_create_tdesc_vc(%src: ui64) {
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_prefetch_vc(%[[arg0:.*]]: ui64) {
+gpu.func @test_prefetch_vc(%src: ui64) {
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_load_gather_vc(%[[arg0:.*]]: ui64) {
+gpu.func @test_load_gather_vc(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<true> : vector<4xi1>
+ %0 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ //CHECK-SAME: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>, vector<4xi1> -> vector<4x2xf32>
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>, vector<4xi1> -> vector<4x2xf32>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_store_scatter_vc(%[[arg0:.*]]: ui64) {
+gpu.func @test_store_scatter_vc(%src: ui64) {
+ //CHECK: %[[c0:.*]] = arith.constant dense<true> : vector<4xi1>
+ %0 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[c1:.*]] = arith.constant dense<2.900000e+00> : vector<4x2xf32>
+ %1 = arith.constant dense<2.9>: vector<4x2xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ %2 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ //CHECK: xegpu.store %[[c1]], %[[R0]], %[[c0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
+ //CHECK-SAME: vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>, vector<4xi1>
+ xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>, vector<4xi1>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_update_tdesc_vc(%[[arg0:.*]]: ui64) {
+gpu.func @test_create_update_tdesc_vc(%src: ui64) {
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ %2 = xegpu.update_offset %1, [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ gpu.return
+}
+
}
\ No newline at end of file
>From 89148e9f58d02795550f735bf350b63036cb442c Mon Sep 17 00:00:00 2001
From: Chao Chen <116223022+chencha3 at users.noreply.github.com>
Date: Tue, 26 Mar 2024 09:50:09 -0500
Subject: [PATCH 2/9] Update mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 4efa46642aa78f..dc18d8c9b40366 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -41,7 +41,7 @@ static std::string makeString(T array, bool breakline = false) {
return buf;
}
-static std::vector<int64_t> getShapeOf(Type type) {
+static SmallVector<int64_t> getShapeOf(Type type) {
std::vector<int64_t> shape;
if (auto ty = llvm::dyn_cast<ShapedType>(type))
shape = ty.getShape().vec();
>From 2c3bd1384f119a753953774ccd297a7c4cad8cb1 Mon Sep 17 00:00:00 2001
From: Chao Chen <116223022+chencha3 at users.noreply.github.com>
Date: Tue, 26 Mar 2024 09:50:41 -0500
Subject: [PATCH 3/9] Update mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
Co-authored-by: Adam Siemieniuk <adam.siemieniuk at intel.com>
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 0380ff83581517..5cea38a78be7de 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -54,7 +54,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
The "create_nd_tdesc" operation creates a TensorDescType which represents
a sub-view of a 2D memory region (It can be extended to support n-D memory
region if needed in future). Elements in the subview continuous in each
- dimention. It encodes the following important information for supporting
+ dimension. It encodes the following important information for supporting
Intel hardware features:
* source: an object representing (starting address/pointer of) a 2D memory region.
>From 6486c994d496b8291220e77e2442eb59bf21d4f1 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 26 Mar 2024 15:43:56 +0000
Subject: [PATCH 4/9] refine getShapeOf implementation
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index dc18d8c9b40366..972cee69c294d2 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -19,8 +19,8 @@ namespace mlir {
namespace xegpu {
static void transpose(llvm::ArrayRef<int64_t> trans,
- std::vector<int64_t> &shape) {
- std::vector<int64_t> old = shape;
+ SmallVector<int64_t> &shape) {
+ SmallVector<int64_t> old = shape;
for (size_t i = 0; i < trans.size(); i++)
shape[i] = old[trans[i]];
}
@@ -42,9 +42,9 @@ static std::string makeString(T array, bool breakline = false) {
}
static SmallVector<int64_t> getShapeOf(Type type) {
- std::vector<int64_t> shape;
+ SmallVector<int64_t> shape;
if (auto ty = llvm::dyn_cast<ShapedType>(type))
- shape = ty.getShape().vec();
+ shape = SmallVector<int64_t>(ty.getShape());
else
shape.push_back(1);
return shape;
@@ -201,8 +201,8 @@ LogicalResult LoadNdOp::verify() {
return emitOpError("invlid l3_hint: ") << getL3HintAttr();
auto array_len = tdescTy.getArrayLength();
- auto tdescShape = tdescTy.getShape().vec();
- auto valueShape = valueTy.getShape().vec();
+ auto tdescShape = getShapeOf(tdescTy);
+ auto valueShape = getShapeOf(valueTy);
if (getTranspose()) {
auto trans = getTranspose().value();
@@ -353,9 +353,9 @@ LogicalResult LoadGatherOp::verify() {
return emitOpError(
"Value should have the same element type as TensorDesc.");
- std::vector<int64_t> maskShape = getShapeOf(maskTy);
- std::vector<int64_t> valueShape = getShapeOf(valueTy);
- std::vector<int64_t> tdescShape = getShapeOf(tdescTy);
+ auto maskShape = getShapeOf(maskTy);
+ auto valueShape = getShapeOf(valueTy);
+ auto tdescShape = getShapeOf(tdescTy);
if (tdescShape[0] != maskShape[0])
return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
@@ -394,8 +394,8 @@ LogicalResult StoreScatterOp::verify() {
return emitOpError("invlid l3_hint: ") << getL3HintAttr();
auto maskTy = getMaskType();
- std::vector<int64_t> maskShape = getShapeOf(maskTy);
- std::vector<int64_t> tdescShape = getShapeOf(tdescTy);
+ auto maskShape = getShapeOf(maskTy);
+ auto tdescShape = getShapeOf(tdescTy);
if (tdescShape[0] != maskShape[0])
return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
>From ff28836cc06a52a2262410c674f0fcd391921180 Mon Sep 17 00:00:00 2001
From: Chao Chen <116223022+chencha3 at users.noreply.github.com>
Date: Tue, 26 Mar 2024 12:38:55 -0500
Subject: [PATCH 5/9] Update mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
Co-authored-by: Adam Siemieniuk <adam.siemieniuk at intel.com>
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 5cea38a78be7de..41fe0ea77e5e6c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -393,7 +393,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
let description = [{
"create_tdesc" is similar to "create_nd_tdesc" in terms that it creates
a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc"
- is for creating continious subviews, "create_tdesc" is for creating non-continious
+ is for creating continuous subviews, "create_tdesc" is for creating non-continuous
(scattered) subviews, allowing each work-item in a subgroup specifying their own offset.
It accepts the following parameters:
>From a375116a4c7938d7a3b812c871a0e26b37ff45f5 Mon Sep 17 00:00:00 2001
From: Chao Chen <116223022+chencha3 at users.noreply.github.com>
Date: Tue, 26 Mar 2024 12:39:11 -0500
Subject: [PATCH 6/9] Update mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index dc18d8c9b40366..4e12cc4f3857a9 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -54,7 +54,7 @@ static int64_t getRankOf(Value val) {
auto type = val.getType();
if (auto ty = llvm::dyn_cast<ShapedType>(type))
return ty.getRank();
- return (int64_t)0;
+ return 0;
};
static bool isReadHintOrNone(const CachePolicyAttr &attr) {
>From 4ca38ed33e5e6bcb8d483b2f22a7aed790217726 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 26 Mar 2024 17:40:40 +0000
Subject: [PATCH 7/9] improve doc
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 24 ++++++++++++++++---
1 file changed, 21 insertions(+), 3 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 5a05462b3579de..6579d07ec26215 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -19,6 +19,23 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
}
def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
+ let summary = [{a composite attribute for `TensorDescType`}];
+  let description = [{`TensorDescAttr` (or `tdesc_attr`) is a composite
+    attribute defined for `TensorDescType` for describing the following
+    properties of a `TensorDesc`.
+    1. `memory_scope`: It describes where the data block described by the
+      TensorDesc is located, `Global` device memory or `Shared` local memory.
+      It defaults to `Global`.
+    2. `array_length`: It describes how many horizontally consecutive blocks
+      will be loaded by a hardware load instruction. If the TensorDesc shape
+      is 8x16 with array_length = 2, the loaded block shape will actually be
+      8x32. Its default value is 1.
+    3. `boundary_check`: It indicates whether the hardware should perform
+      out-of-bounds checks. The default value is true.
+    4. `scattered`: It is used to differentiate TensorDescs created by
+      `create_nd_tdesc` vs. `create_tdesc`.
+ }];
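
For example (a sketch; the exact keyword spellings follow the grammar and custom printer in XeGPUTypes.td, and omitted parameters keep their defaults):

```
// block descriptor in shared local memory; array_length and boundary_check keep defaults
!xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm>>
// scattered descriptor as produced by create_tdesc
!xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>
```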
+
let parameters = (ins
OptionalParameter<"MemoryScopeAttr">: $memory_scope,
OptionalParameter<"IntegerAttr", "1">: $array_length,
@@ -52,6 +69,8 @@ def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
def XeGPU_MemoryScopeAttr:
EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
+  let summary = [{Describes the location of data described by a `TensorDesc`:
+ Global device memory (`Global`) or Shared local memory (`SLM`).}];
let assemblyFormat = "$value";
}
@@ -75,9 +94,8 @@ def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy",
def XeGPU_CacheHintAttr
: EnumAttr<XeGPU_Dialect, XeGPU_CachePolicyEnums, "cache_hint"> {
+  let summary = [{Describes the cache settings for prefetch/load/store operators}];
let assemblyFormat = "`<` $value `>`";
}
-
-
-#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
+#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
\ No newline at end of file
>From 253b96f12c377753f8f9383a20f8c1541fcce850 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 26 Mar 2024 20:58:18 +0000
Subject: [PATCH 8/9] add invalid test cases
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 17 ++-
mlir/test/Dialect/XeGPU/invalid.mlir | 159 +++++++++++++++++++++++++
2 files changed, 170 insertions(+), 6 deletions(-)
create mode 100644 mlir/test/Dialect/XeGPU/invalid.mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 6c644679fd1a9f..621986c54d492c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -264,8 +264,12 @@ LogicalResult StoreNdOp::verify() {
// XeGPU_UpdateNDOffsetOp
//===----------------------------------------------------------------------===//
LogicalResult UpdateNdOffsetOp::verify() {
+ auto ty = getTensorDescType();
+ if (ty.getScattered())
+ return emitOpError("Expects a non-scattered TensorDesc.\n");
+
// number of offsets specified must match the rank of the tensor descriptor
- if (getTensorDescType().getRank() != (int64_t)getNumOffsets()) {
+ if (ty.getRank() != (int64_t)getNumOffsets()) {
return emitOpError("Invalid number of offsets.");
}
return success();
@@ -289,20 +293,21 @@ LogicalResult CreateDescOp::verify() {
auto tdescTy = getTensorDescType();
auto chunkSize = getChunkSize();
- if (getRankOf(getSource()) > 2)
+ if (getRankOf(getSource()) > 1)
return emitOpError(
"Expecting the source is a 1D memref or pointer (uint64_t).");
if (!tdescTy.getScattered())
return emitOpError("Expects a scattered TensorDesc.\n");
- std::vector<int64_t> shape({(int64_t)getNumOffsets()});
+ SmallVector<int64_t> shape({(int64_t)getNumOffsets()});
if (chunkSize != 1)
shape.push_back(chunkSize);
- auto tdescShape = tdescTy.getShape();
- if (shape != tdescShape.vec())
- return emitOpError("Expecting the size of offsets matchs TensorDesc[0].");
+ auto tdescShape = getShapeOf(tdescTy);
+ if (shape != tdescShape)
+ return emitOpError("Incorrect TensorDesc shape. ")
+ << "Expected is " << makeString(shape) << "\n";
return success();
}
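To make the shape rule concrete: the verifier expects the TensorDesc shape to
be `[numOffsets]`, extended to `[numOffsets, chunkSize]` when chunk_size is
not 1. For example, four offsets with chunk_size = 2 require a 4x2
descriptor, as in this well-formed counterpart of the invalid cases below:

```
%1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2}
     : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
```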
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
new file mode 100644
index 00000000000000..5e29361ec69087
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -0,0 +1,159 @@
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics
+
+// -----
+func.func @test_create_nd_tdesc_vc_1(%src: memref<24xf32>) {
+  // expected-error @+1 {{Expecting the rank of shape, strides, offsets, source memref type (if source is a memref) and TensorDesc should match with each other. They currenlty are 2D.}}
+ %1 = xegpu.create_nd_tdesc %src[0] : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32>
+ return
+}
+
+// -----
+
+func.func @test_create_nd_tdesc_vc_2(%src: memref<24x32xf32>) {
+  // expected-error @+1 {{TensorDesc should have the same element type with the source if it is a memref}}
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf16>
+ return
+}
+
+// -----
+func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) {
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // expected-error @+1 {{invlid l1_hint: #xegpu.cache_hint<write_back>}}
+ xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<write_back>}>: !xegpu.tensor_desc<8x16xf16>
+ return
+}
+
+// -----
+func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) {
+ %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7]
+ : memref<24xf16> -> !xegpu.tensor_desc<8xf16, #xegpu.tdesc_attr<scattered=true>>
+  // expected-error @+1 {{Expects a non-scattered TensorDesc}}
+ xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>}>
+ : !xegpu.tensor_desc<8xf16, #xegpu.tdesc_attr<scattered=true>>
+ return
+}
+
+// -----
+func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) {
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // expected-error @+1 {{invlid l1_hint: #xegpu.cache_hint<write_back>}}
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<write_back>}>
+ : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+ return
+}
+
+// -----
+func.func @test_load_nd_vc_2(%src: memref<16xf16>) {
+ %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2}
+ : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr<scattered=true>>
+  // expected-error @+1 {{Expects a non-scattered TensorDesc.}}
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>}>
+ : !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr<scattered=true>> -> vector<8x2xf16>
+ return
+}
+
+// -----
+func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) {
+ %1 = arith.constant dense<1.0>: vector<24x32xf16>
+ %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  // expected-error @+1 {{invlid l1_hint: #xegpu.cache_hint<streaming>}}
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<streaming>}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+ return
+}
+
+// -----
+func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
+ %1 = arith.constant dense<1.0>: vector<8x2xf16>
+ %2 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2}
+ : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr<scattered=true>>
+  // expected-error @+1 {{Expects a non-scattered TensorDesc}}
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<streaming>}>
+ : vector<8x2xf16>, !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr<scattered=true>>
+ return
+}
+
+// -----
+func.func @test_update_nd_offset_1(%dst: memref<16xf16>) {
+ %1 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2}
+ : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr<scattered=true>>
+  // expected-error @+1 {{Expects a non-scattered TensorDesc}}
+ xegpu.update_nd_offset %1, [0, 2] : !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr<scattered=true>>
+ return
+}
+
+// -----
+func.func @test_create_tdesc_vc_1(%src: ui64) {
+  // expected-error @+1 {{Expects a scattered TensorDesc}}
+ %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2}
+ : ui64 -> !xegpu.tensor_desc<8x2xf16>
+ return
+}
+
+// -----
+func.func @test_create_tdesc_vc_2(%src: ui64) {
+  // expected-error @+1 {{Incorrect TensorDesc shape}}
+ %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2}
+ : ui64 -> !xegpu.tensor_desc<8x4xf16, #xegpu.tdesc_attr<scattered = true>>
+ return
+}
+
+// -----
+func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) {
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  // expected-error @+1 {{Expects a scattered TensorDesc}}
+ xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<write_back>}>: !xegpu.tensor_desc<24x32xf16>
+ return
+}
+
+// -----
+func.func @test_prefetch_vc_2(%src: ui64) {
+ %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+  // expected-error @+1 {{invlid l1_hint: #xegpu.cache_hint<write_back>}}
+ xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<write_back>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+ return
+}
+
+// -----
+func.func @test_load_gather_vc_1(%src: memref<24x32xf16>) {
+ %0 = arith.constant dense<1>: vector<4xi1>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<4x2xf16>
+  // expected-error @+1 {{Expects a scattered TensorDesc}}
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}>
+ : !xegpu.tensor_desc<4x2xf16>, vector<4xi1> -> vector<4x2xf16>
+ return
+}
+
+// -----
+func.func @test_load_gather_vc_2(%src: ui64) {
+ %0 = arith.constant dense<1>: vector<4xi1>
+ %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64
+ -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+  // expected-error @+1 {{invlid l1_hint: #xegpu.cache_hint<write_back>}}
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<write_back>}>
+ : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>, vector<4xi1>
+ -> vector<4x2xf32>
+ return
+}
+
+// -----
+func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) {
+ %0 = arith.constant dense<1>: vector<4xi1>
+ %1 = arith.constant dense<2.9>: vector<4x2xf32>
+ %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<4x2xf32>
+  // expected-error @+1 {{Expects a scattered TensorDesc}}
+ xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint<cached>}>
+ : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32>, vector<4xi1>
+ return
+}
+
+// -----
+func.func @test_store_scatter_vc_2(%src: ui64) {
+ %0 = arith.constant dense<1>: vector<4xi1>
+ %1 = arith.constant dense<2.9>: vector<4x2xf32>
+ %2 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2}
+ : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+  // expected-error @+1 {{invlid l1_hint: #xegpu.cache_hint<streaming>}}
+ xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint<streaming>}> : vector<4x2xf32>,
+ !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>, vector<4xi1>
+ return
+}
\ No newline at end of file
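For contrast with the invalid cases above, a well-formed scattered load pairs
a scattered TensorDesc with a read-legal cache hint; a sketch assembled from
the valid fragments of these tests:

```
func.func @test_load_gather_valid(%src: ui64) {
  %0 = arith.constant dense<1> : vector<4xi1>
  %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2}
       : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}>
       : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>, vector<4xi1> -> vector<4x2xf32>
  return
}
```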
>From ba62715a93a1a864b0ef8fd79468ae2b0714269f Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 26 Mar 2024 21:12:35 +0000
Subject: [PATCH 9/9] add an overlapping example for createDesc.
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 15 ++++++++++++
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 24 +++++++++----------
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 4 ++--
3 files changed, 29 insertions(+), 14 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 41fe0ea77e5e6c..a031a75984a536 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -406,13 +406,28 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
elements accessed for each offset, default is 1.
Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
+ ```
%a = memref.alloc() : memref<1024xf32>
%1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32>
+ ```
Example 2. It assumes subgroup size is 4, and each workitem access 8 elements.
It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
+ ```
%0 = memref.alloc() : memref<1024xf32>
%1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
+ ```
+
+    Example 3. It is similar to Example 2, but there is some overlap among workitems.
+ It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
+ ```
+ %0 = memref.alloc() : memref<1024xf32>
+ %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
+ ```
+
}];
let arguments = (ins XeGPU_BaseAddrType: $source,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 0c62e513bee4f3..4cd4e5411653c1 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -34,10 +34,10 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
[ShapedTypeInterface], "::mlir::TensorType"> {
let summary = "TensorDesc describing regions of interested data.";
let description = [{
- TensorDesc is a type designed to describe regions of the interested data as well as some
- features that are unique to Intel hardware. Different with the builtin tensor type in MLIR,
- it essentially only contains the meta data, and doesn't hold the data by itself. It is designed
- to mainly support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPU.
+    TensorDesc is a type designed to describe regions of the data of interest, as well as some
+    features that are unique to Intel hardware. Unlike the builtin tensor type in MLIR,
+    it essentially only contains the metadata and doesn't hold the data by itself. It is designed
+    mainly to support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPUs.
It encodes the following information:
* shape: the sizes/shape of the intereted data block, e.g., 8x16 means 8 rows
@@ -46,15 +46,15 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
is set or not.
* element_type: the data type of the data element, e.g., f16, f32.
- Similar to the builtin tensor, it also provides an optinal attribute to encoding
+    Similar to the builtin tensor, it also provides an optional attribute to encode
the following information via the TensorDescAttr object:
- * memory_scope (xegpu::MemoryScope): [optional] where the data is located,
+ * memory_scope (xegpu::MemoryScope): [optional] where the data is located,
global memory or shared memory. It is default to Global.
* array_length (int): [optional] The number of contiguous blocks with size as `shape`,
that will be loaded by block load at a time. It is default to 1.
- * boundary_check (bool): [optional] indicates whether the operation detects the boundary
+ * boundary_check (bool): [optional] indicates whether the operation detects the boundary
and pads with zero for out-of-boundary access. It is default to do boundary check.
-
+
Syntax:
@@ -85,8 +85,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
OptionalParameter<"mlir::Attribute">: $encoding);
let builders = [
- TypeBuilder<(ins
- "llvm::ArrayRef<int64_t>": $shape,
+ TypeBuilderWithInferredContext<(ins
+ "llvm::ArrayRef<int64_t>": $shape,
"mlir::Type": $elementType,
CArg<"bool", "false">: $scattered,
CArg<"int", "1">: $array_length,
@@ -127,7 +127,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
if (attr && attr.getArrayLength())
return attr.getArrayLength().getInt();
// return default value
- return 1;
+ return 1;
}
bool getBoundaryCheck() {
@@ -148,7 +148,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
}];
let hasCustomAssemblyFormat = true;
-
+
}
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 858cda32013eae..24719fe748fe4f 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -107,11 +107,11 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const {
printer << ">";
}
-TensorDescType TensorDescType::get(mlir::MLIRContext *context,
- llvm::ArrayRef<int64_t> shape,
+TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
mlir::Type elementType, bool scattered,
int array_length, MemoryScope memory_scope,
bool boundary_check) {
+ auto context = elementType.getContext();
auto attr = TensorDescAttr::get(context, memory_scope, array_length,
boundary_check, scattered);
return Base::get(context, shape, elementType, attr);