[Mlir-commits] [mlir] [MLIR][XeGPU] Add XeGPU scattered ops (PR #86594)
llvmlistbot at llvm.org
Mon Mar 25 15:38:47 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-mlir
Author: Chao Chen (chencha3)
Changes:
This PR adds definitions of the XeGPU scattered ops, along with some miscellaneous updates:
- Extended TensorDescAttr with a scattered attribute
- Added the scattered ops: CreateDescOp, PrefetchOp, LoadGatherOp, StoreScatterOp, UpdateOffsetOp
- Added a block op: UpdateNdOffsetOp
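For orientation, a minimal sketch of how the new scattered ops could compose. The scattered-op assembly formats are truncated in this excerpt, so the exact syntax below (the offsets list, the scattered flag inside tdesc_attr, the update_offset mnemonic) is an illustrative assumption pieced together from the op descriptions in the diff, not verbatim PR syntax:

```mlir
// Illustrative sketch only; the scattered-op assembly syntax is assumed,
// since it is truncated in this excerpt. One offset per work-item (SIMT
// lane); a subgroup size of 4 is used for brevity (PVC uses 16).
%tdesc = xegpu.create_tdesc %src[0, 8, 16, 24]
    : ui64 -> !xegpu.tensor_desc<4xf32, #xegpu.tdesc_attr<scattered = true>>
// Each lane advances its own offset by 4 elements; the result has the same
// TensorDesc type, mirroring update_nd_offset on the block side.
%tdesc2 = xegpu.update_offset %tdesc, [4, 4, 4, 4]
    : !xegpu.tensor_desc<4xf32, #xegpu.tdesc_attr<scattered = true>>
```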
---
Patch is 47.64 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/86594.diff
7 Files Affected:
- (modified) mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h (+1)
- (modified) mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td (+10-8)
- (modified) mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td (+386-63)
- (modified) mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td (+20-1)
- (modified) mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp (+21)
- (modified) mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp (+223-18)
- (modified) mlir/test/Dialect/XeGPU/XeGPUOps.mlir (+62)
``````````diff
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 87aabdc015fea5..eca9255ff3974b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -12,6 +12,7 @@
#include "mlir/Bytecode/BytecodeOpInterface.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
+#include "mlir/IR/TypeUtilities.h"
#include "mlir/Interfaces/ShapedOpInterfaces.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Interfaces/ViewLikeInterface.h"
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index cd38549f1ccf43..5a05462b3579de 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -22,14 +22,16 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
let parameters = (ins
OptionalParameter<"MemoryScopeAttr">: $memory_scope,
OptionalParameter<"IntegerAttr", "1">: $array_length,
- OptionalParameter<"BoolAttr", "true">: $boundary_check
+ OptionalParameter<"BoolAttr", "true">: $boundary_check,
+ OptionalParameter<"BoolAttr", "false">: $scattered
);
let builders = [
AttrBuilder<(ins
CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
CArg<"int", "1">:$array_length,
- CArg<"bool", "true">: $boundary_check
+ CArg<"bool", "true">: $boundary_check,
+ CArg<"bool", "false">: $scattered
)>
];
@@ -41,14 +43,14 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
//===----------------------------------------------------------------------===//
def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
-def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
- "The address space of the memory the tensor descritor is created for",
+def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
+    "The address space of the memory the tensor descriptor is created for",
[XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::xegpu";
}
-def XeGPU_MemoryScopeAttr:
+def XeGPU_MemoryScopeAttr:
EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
let assemblyFormat = "$value";
}
@@ -63,15 +65,15 @@ def XeGPU_CachePolicyInvalid: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_
def XeGPU_CachePolicyWriteBack: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only
def XeGPU_CachePolicyWriteThrough: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only
-def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy",
- [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached,
+def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy",
+ [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached,
XeGPU_CachePolicyStreaming, XeGPU_CachePolicyInvalid,
XeGPU_CachePolicyWriteBack, XeGPU_CachePolicyWriteThrough]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::xegpu";
}
-def XeGPU_CacheHintAttr
+def XeGPU_CacheHintAttr
: EnumAttr<XeGPU_Dialect, XeGPU_CachePolicyEnums, "cache_hint"> {
let assemblyFormat = "`<` $value `>`";
}
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 93c56ad05b432c..0380ff83581517 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -46,36 +46,35 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
}
-def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface,
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface,
AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
let summary = "Create nd-tensor descriptor operation";
let description = [{
The "create_nd_tdesc" operation creates a TensorDescType which represents
a sub-view of a 2D memory region (It can be extended to support n-D memory
- region if needed in future). Elements in the subview continuous in each
- dimention. It encodes the following important information for supporting
+    region if needed in the future). Elements in the subview are contiguous in
+    each dimension. It encodes the following important information for supporting
Intel hardware features:
- * source: an object representing (starting address/pointer of) a 2D memory region.
+ * source: an object representing (starting address/pointer of) a 2D memory region.
It can be either a 2D memref object, or simply a pointer represented by uint64_t type.
- for the later case, the shape and layout information of the 2D memory region should
- be explicitly passed via `dynamic_shape` and `dynamic_strides` parameters.
- * offsets: two index values represents offsets from the "source" at the each dimension
+    for the latter case, the shape and layout information of the 2D memory region should
+ be explicitly passed via `shape` and `strides` parameters.
+  * offsets: two index values representing offsets from the "source" in each dimension
at which the subview of the target memory will be created. It is encoded via two
- variables, including "dynamic_offsets" and "static_offsets", such that it can
- accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4])).
- * shape: the shape information of the memory region pointed by the "source". It is
- typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
- But if "source" is simply a pointer represented as uint64_t type, or a memref
- type without shape information e.g., memref<?x?xf16>, the shape information has
- to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape"
- only accepts operands(e.g., [%c4096, %c4096]), not attributes(e.g., [4096, 4096]).
- * strides: the strides of the memory region pointed by the "source". Similar to shape,
- it is typically encoded via the MemRefType of the source too. But if "source" is
- simply a pointer represented as uint64_t type, or a memref type without shape
- information e.g., memref<?x?xf16>, the strides information has to be explicitly
- passed via the "dynamic_strides" argument. And it currently only accepts operands two.
+ variables, including "offsets" and "const_offsets", such that it can
+    accept various forms, such as operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]).
+  * shape: the shape information of the memory region pointed to by the "source". It is
+    typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
+    But if "source" is simply a pointer represented as a uint64_t type, or a memref
+    type without shape information, e.g., memref<?x?xf16>, the shape information has
+    to be explicitly passed via the "shape" and "const_shape" arguments.
+  * strides: the strides of the memory region pointed to by the "source". Similar to shape,
+    it is typically encoded via the MemRefType of the source too. But if "source" is
+    simply a pointer represented as a uint64_t type, or a memref type without shape
+    information, e.g., memref<?x?xf16>, the strides information has to be explicitly
+    passed via the "strides" and "const_strides" arguments.
Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
%0 = memref.alloc() : memref<1024x1024xf32>
@@ -96,10 +95,10 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
%1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
}];
- let arguments = (ins
- XeGPU_BaseAddrType: $source,
- Variadic<Index>: $offsets,
- Variadic<Index>: $shape,
+ let arguments = (ins
+ XeGPU_BaseAddrType: $source,
+ Variadic<Index>: $offsets,
+ Variadic<Index>: $shape,
Variadic<Index>: $strides,
DenseI64ArrayAttr: $const_offsets,
OptionalAttr<DenseI64ArrayAttr>: $const_shape,
@@ -118,12 +117,12 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
let hasVerifier = 1;
let builders = [
- OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source,
+ OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source,
"llvm::ArrayRef<OpFoldResult>": $offsets)>,
- OpBuilder<(ins "Type": $tdesc, "TypedValue<IntegerType> ": $source,
+ OpBuilder<(ins "Type": $tdesc, "TypedValue<IntegerType> ": $source,
"llvm::ArrayRef<OpFoldResult>": $offsets,
- "llvm::ArrayRef<OpFoldResult>": $shape,
+ "llvm::ArrayRef<OpFoldResult>": $shape,
"llvm::ArrayRef<OpFoldResult>": $strides)>
];
@@ -158,41 +157,41 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
}
/// wrapper for matching with OffsetSizeAndStrideOpInterface
- /// If source is IntegerType or `const_shape` is filled,
+ /// If source is IntegerType or `const_shape` is filled,
/// it will return `const_shape`, such that mixes of `shape`
- /// and `const_shape` will be used to represent the shape of
+ /// and `const_shape` will be used to represent the shape of
/// source operand. They overide static shape from source memref type.
ArrayRef<int64_t> getStaticSizes() {
auto attr = getConstShapeAttr();
if (getSourceType().isa<IntegerType>() || attr)
return attr;
-
+
auto memrefType = getSourceType().dyn_cast<MemRefType>();
assert(memrefType && "Incorrect use of getStaticSizes");
return memrefType.getShape();
}
/// wrapper for matching with OffsetSizeAndStrideOpInterface
- /// If source is IntegerType or `const_strides` is filled, it
+ /// If source is IntegerType or `const_strides` is filled, it
/// will return `const_strides`, such that mixes of `strides`
- /// and `const_strides` will be used to represent the strides of
+ /// and `const_strides` will be used to represent the strides of
/// source operand. They overide static strides from source memref type.
ArrayRef<int64_t> getStaticStrides() {
auto attr = getConstStridesAttr();
if (getSourceType().isa<IntegerType>() || attr)
return attr;
-
+
auto memrefType = getSourceType().dyn_cast<MemRefType>();
assert(memrefType && "Incorrect use of getStaticStrides");
auto [strides, offset] = getStridesAndOffset(memrefType);
- // reuse the storage of ConstStridesAttr since strides from
+ // reuse the storage of ConstStridesAttr since strides from
// memref is not persistant
setConstStrides(strides);
attr = getConstStridesAttr();
return attr;
}
- /// Return the expected rank of each of the`static_offsets`,
+ /// Return the expected rank of each of the`static_offsets`,
/// `static_shape` and `static_strides` attributes.
std::array<unsigned, 3> getArrayAttrMaxRanks() {
unsigned rank;
@@ -203,8 +202,8 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
}
return {rank, rank, rank};
}
-
- /// Return the number of leading operands before the `offsets`,
+
+ /// Return the number of leading operands before the `offsets`,
/// `shape` and `strides` operands.
static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
@@ -213,15 +212,15 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
}
def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
- let summary = "prefetches a nD block to cache";
+  let summary = "prefetches an n-D block to cache";
let description = [{
- It issues an instruction to prefetch the data from memory to each
- level of the cache based on their cache policy.
+    It issues an instruction to prefetch a block of data from contiguous
+    memory regions to each level of the cache, based on the corresponding cache policy.
Example:
```
- xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<cached>,
+ xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<cached>,
l3_hint = #xegpu.cache_hint<cached>}
: !xegpu.tensor_desc<8x16xf16>
```
@@ -232,34 +231,41 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-
- let extraClassDeclaration = extraBaseClassDeclaration;
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc))";
+
+ let hasVerifier = 1;
}
-def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
- let summary = "loads a n-D block from memory (represented by TensorDesc)"
+def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "TensorDesc"]>,
+ AllElementCountsMatch<["value", "TensorDesc"]>]> {
+  let summary = "loads an n-D block from memory (represented by TensorDesc)"
"to registers (represented by vector)";
let description = [{
- LoadNdOp essentially mimics the hardware block read instruction to read
- a block of data from memory to register. It takes a set of optional cache
- hints for each level of cache, L1, L2 and L3. If hardware does not have a
+ LoadNdOp essentially mimics the hardware block read instruction to read
+ a block of data from memory to register. It takes a set of optional cache
+ hints for each level of cache, L1, L2 and L3. If hardware does not have a
correspoding cache, Corresponding cache hint attribute will be masked.
- vnni transform is an hardware feature for Intel GPU, which is used to
- do data packing during the load for B operand of matrix operation, if
- the bit width of the data type is less then 32 bits, e.g., fp16. And
+    vnni transform is a hardware feature on Intel GPUs, used to do data
+    packing during the load for the B operand of a matrix operation, if
+    the bit width of the data type is less than 32 bits, e.g., fp16. And
transpose is another Intel hardware feature, which will do transpose
- operation when loading the data if the bit width of the data type is
- fp32 or fp64. It implies that vnni and transpose cannot exit at the
+    operation when loading the data if the data type is
+    fp32 or fp64. It implies that vnni and transpose cannot exist at the
same time.
Example:
```
xegpu.load_nd %1 {transpose = [1, 0],
- l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
+ l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
l3_hint = #xegpu.cache_hint<streaming>}
: !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
```
@@ -290,20 +296,21 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
let hasVerifier = 1;
}
-def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllShapesMatch<["value", "TensorDesc"]>,
+ AllElementTypesMatch<["value", "TensorDesc"]>]> {
let summary = "stores a n-D block register region back to memory, currently only supports 2D";
let description = [{
StoreNdOp essentially mimics the hardware block write instruction io
- write a block of data from register into the memory region as described
- by the TensorDesc. It takes a set of optional cache hints for each level
- of cache, L1, L2 and L3. If hardware does not have a correspoding cache,
+ write a block of data from register into the memory region as described
+ by the TensorDesc. It takes a set of optional cache hints for each level
+    of cache, L1, L2 and L3. If hardware does not have a corresponding cache,
Corresponding cache hint attribute will be masked.
Example:
```
xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
+ l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
```
@@ -317,11 +324,327 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
- let extraClassDeclaration = extraBaseClassDeclaration;
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ VectorType getValueType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
+ }
+
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
- let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
+ let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
`:` type($value) `,` qualified(type($TensorDesc))}];
let hasVerifier = 1;
}
+def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
+ [AllTypesMatch<["TensorDesc", "result"]>]> {
+  let summary = "updates the offsets for the given TensorDesc";
+  let description = [{The op updates the offsets of the given TensorDesc.
+    The offsets are relative to the current position, expressed in numbers
+    of elements. The result is a TensorDesc of the same type as the input.
+
+    Example:
+ ```
+ %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
+ ```
+ }];
+
+ let arguments = (ins
+ XeGPU_TensorDesc: $TensorDesc,
+ Variadic<Index>: $offsets,
+ DenseI64ArrayAttr: $const_offsets);
+
+ let results = (outs XeGPU_TensorDesc: $result);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+
+ SmallVector<OpFoldResult> getMixedOffsets() {
+ Builder b(getContext());
+ return getMixedValues(getConstOffsets(), getOffsets(), b);
+ }
+
+ size_t getNumOffsets() {
+ return getMixedOffsets().size();
+ }
+
+ OpFoldResult getOffset(unsigned idx) {
+ assert(idx < getNumOffsets() && "Invalid out of bound access.");
+ return getMixedOffsets()[idx];
+ }
+ }];
+
+ let assemblyFormat = [{
+ $TensorDesc `,`
+ custom<DynamicIndexList>($offsets, $const_offsets)
+ attr-dict `:` qualified(type($result))
+ }];
+
+ let hasVerifier = 1;
+}
+
+def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
+ let summary = "create scattered tensor descriptors (TensorDesc).";
+ let description = [{
+    "create_tdesc" is similar to "create_nd_tdesc" in that it creates
+    a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc"
+    is for creating contiguous subviews, "create_tdesc" is for creating non-contiguous
+    (scattered) subviews, allowing each work-item in a subgroup to specify its own offset.
+ It accepts the following parameters:
+
+  * source: a 1D memref or pointer (uint64_t) representing the flattened memory object.
+  * offsets: an array containing the offset of each access point. Its size
+    is fixed to the hardware-supported subgroup size, e.g., 16 on PVC,
+    implying that each element in the array corresponds to a work-item (SIMT lane)
+    in the subgroup.
+ * chunk_size: [optional attribute] indicates number...
[truncated]
``````````
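The TensorDescAttr extension at the top of the diff adds a scattered flag (default false) alongside memory_scope, array_length, and boundary_check. A hedged sketch of how it could appear on a type, assuming the attribute's parameters print with the usual key = value spelling (the excerpt does not show tdesc_attr's assembly format):

```mlir
// Hypothetical spellings; tdesc_attr's assembly format is not shown in this
// excerpt. Defaults per the diff: memory_scope = global, array_length = 1,
// boundary_check = true, scattered = false.
// A block (nd) descriptor in shared local memory:
!xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm>>
// A scattered descriptor flips the new flag:
!xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>
```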
https://github.com/llvm/llvm-project/pull/86594
More information about the Mlir-commits mailing list