[Mlir-commits] [mlir] [MLIR] Add XeGPU dialect for Intel GPU (PR #78483)
Mehdi Amini
llvmlistbot at llvm.org
Wed Feb 14 18:31:27 PST 2024
================
@@ -0,0 +1,505 @@
+//===- XeGPUOps.td - XeGPU dialect operations definition ----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
+#define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
+
+include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
+include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
+
+
+// Base class for dialect operations. This operation inherits from the base
+// `Op` class in OpBase.td, and provides:
+// * The parent dialect of the operation.
+// * The mnemonic for the operation, or the name without the dialect prefix.
+// * A list of traits for the operation.
+class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
+ Op<XeGPU_Dialect, mnemonic, traits>;
+
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> {
+
+ let summary = "create nd tensor descriptor operation";
+ let description = [{
+ The "create_nd_tdesc" operation creates a TensorDescType which represents
+ a sub-view of a 2D memory region (It can be extended to support N-D memory
+ region if needed in future). Elements in the subview continuous in each
+ dimention. It encodes the following important information for supporting
+ Intel hardware features:
+
+ * source: an object representing (starting address/pointer of) a 2D memory reagion.
+ It can be either a 2D memref object, or simply a pointer represented by uint64_t type.
+ * offsets: two index values represents offsets from the "source" at the each dimension
+ at which the subview of the target memory will be created. It is encoded via two
+ variables, including "dynamic_offsets" and "static_offsets", such that it can
+ accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4])).
+ * shape: the shape information of the memory region pointed by the "source". It is
+ typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
+ But if "source" is simply a pointer represented as uint64_t type, or a memref
+ type without shape information e.g., memref<?x?xf16>, the shape information has
+ to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape"
+ only accepts operands(e.g., [%c4096, %c4096]), not attributes(e.g., [4096, 4096]).
+ * strides: the strides of the memory region pointed by the "source". Similar to shape,
+ it is typically encoded via the MemRefType of the source too. But if "source" is
+ simply a pointer represented as uint64_t type, or a memref type without shape
+ information e.g., memref<?x?xf16>, the strides information has to be explicitly
+ passed via the "dynamic_strides" argument. And it currently only accepts operands two.
+
+ Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
+ %0 = memref.alloc() : memref<32x24xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.create_nd_tdesc %0[%c0, %c1]: memref<32x24xf32> -> TensorDesc<8x16xf32>
+
+ Example 2 (suppose the tensor shape inferred by the compiler is 8x16):
+ %0 = memref.alloc(%h, %w) : memref<?x?xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.create_nd_tdesc %0[%c0, %c1], [%h, %w], [%w, %c1]: memref<?x?xf32> -> TensorDesc<8x16xf32>
+
+ Example 3 (suppose the tensor shape inferred by the compiler is 8x16):
+ %0 = ... : ui64
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.create_nd_tdesc %0[%c0, %c1], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
+ }];
+
+ let arguments = (ins XeGPU_BaseAddrType: $source,
+ Variadic<Index>: $dynamic_offsets,
+ Variadic<Index>: $dynamic_shape,
+ Variadic<Index>: $dynamic_strides,
+ DenseI64ArrayAttr: $static_offsets,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+ let results = (outs XeGPU_TensorDesc:$TensorDesc);
+
+ let hasCustomAssemblyFormat = 1;
+ let skipDefaultBuilders = 1;
+ let hasVerifier = 1;
+
+ let builders = [
+ OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets,
+ "ValueRange": $shape, "ValueRange": $strides,
+ "llvm::ArrayRef<int64_t>": $static_offsets,
+ CArg<"xegpu::ModeKind", "xegpu::ModeKind::SIMT">: $mode)>,
+
+ OpBuilder<(ins "Type": $tdesc, "Value": $source,
+ "llvm::ArrayRef<OpFoldResult>": $offsets,
+ CArg<"xegpu::ModeKind", "xegpu::ModeKind::SIMT">: $mode)>,
+
+ OpBuilder<(ins "Type": $tdesc, "Value": $source,
+ "llvm::ArrayRef<OpFoldResult>": $offsets,
+ "ValueRange": $shape, "ValueRange": $stride,
+ CArg<"xegpu::ModeKind", "xegpu::ModeKind::SIMT">: $mode)>
+ ];
+
+ let extraClassDeclaration = [{
+ /// Returns the type of the source memref operand.
+ Type getSourceType() {
+ return getSource().getType();
+ }
+
+ /// Returns the type of the result TensorDesc.
+ xegpu::TensorDescType getTensorDescType();
+
+ /// Returns the offsets info to the source. It consolidates
+ /// information from both dynamic_offsets and static_offsets
+ /// parameters. static_offsets parameter always has the expected
+ /// ranks with some dim could have ShapeType::kDynamic value
+ /// indicating the corresponding value should be from dynamic_offsets.
+ llvm::SmallVector<OpFoldResult> getOffsets();
+
+ /// returns the shape info of the source. It is either from the
+ /// memref type, if source is a memref with static shape
+ /// information or from the dynamic_shape parameter. If both
+ /// exists, the dynamic_shape parameter will be used and the
+ /// shape information from memref type will be ignored.
+ llvm::SmallVector<OpFoldResult> getShape();
+
+ /// returns the strides info of the source. It is either from the
+ /// memref type, if source is a memref with static shape
+ /// information or from the dynamic_stride parameter. If both
+ /// exists, the dynamic_strides parameter will be used and the
+ /// strides information from memref type will be ignored.
+ llvm::SmallVector<OpFoldResult> getStrides();
+
+ /// return the shape embeded in the memref type of the source.
+ /// If source is not memref type. array of kDynamic will be returned.
+ llvm::ArrayRef<int64_t> getStaticShape();
+
+ /// return the strides embeded in the memref type of the source.
+ /// If source is not memref type. array of kDynamic will be returned.
+ llvm::ArrayRef<int64_t> getStaticStrides();
+
+ /// Return the element type of the TensorDesc
+ Type getElementType();
+
+ /// Return the shape of the TensorDesc
+ llvm::ArrayRef<int64_t> getTensorDescShape();
+ }];
+
+}
+
+def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
+ let summary = "loads a n-D block from memory (represented by TensorDesc)"
+ "to registers (represented by vector)";
+ let description = [{
+ LoadNDOp essentially mimics the hardware block read instruction to read
+ a block of data from memory to register. It takes a set of cache hints
+ for each level of cache, L1, L2 and L3. If hardware does not have a
+ correspoding cache, Corresponding cache hint attribute will be masked.
+ If both transpose and vnni_axis present at the same time. It assume to
+ perform transpose first and then vnni transform.
+ }];
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<I64Attr>: $vnni_axis,
+ OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l3_hint,
+ OptionalAttr<DenseI64ArrayAttr>: $transpose,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+ let results = (outs XeGPU_ValueType: $value);
+
+ let extraClassDeclaration = [{
+ VectorType getValueType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
+ }
+
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
+
+ // Format: xegpu.load_nd %1 {transpose = [1, 0], l1_hint = cached, l2_hint = uncached, l3_hint=streaming}
+ // : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> {
+ let summary = "stores a n-D block register region back to memory, currently only supports 2D";
+ let arguments = (ins XeGPU_ValueType: $value,
+ XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l3_hint,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+
+ // Format: xegpu.store_nd %3, %2 {l1_hint = write_back, l2_hint = uncached}
+ // : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_PrefetchNDOp : XeGPU_Op<"prefetch_nd", []> {
+ let summary = "prefetches a nD block to cache";
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l3_hint,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+
+ // Format: xegpu.prefetch_nd %tdesc {l1_hint = cached, l2_hint = uncached}:
+ // !xegpu.tensor_desc<8x16xf16>
+ let hasCustomAssemblyFormat = 1;
+}
+
+def XeGPU_UpdateNDOffsetOp : XeGPU_Op<"update_nd_offset", []> {
+ let summary = "update the offsets for the given tensor descriptor";
+
+ let arguments = (ins
+ XeGPU_TensorDesc: $TensorDesc,
+ Variadic<Index>: $offsets,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+
+ let results = (outs XeGPU_TensorDesc: $result);
+
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure]> {
+ let summary = "create scattered tensor descritors (TensorDesc).";
+ let description = [{
+ "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates
+ a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc"
+ is for creating continious subviews, "create_tdesc" is for creating non-continious
+ (scattered) subviews. It is designed only works with VectorCompute (VC) mode and
+ accepts the following parameters:
+
+ * source: a 1D memref or pointer (uint64_t) represents the memory object.
+ * offsets: It is a 1D vector containing offsets of each access point, the supportted
+ group size, e.g., vector<16xindex>. And each element in the vector corresponds
+ to a work item (SIMT lane) in the subgroup.
+ * chunk_size_per_lane: [optional attribute] indicates number of continious elements
+ accessed for each offset, default is 1.
+
+ Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
+ %a = memref.alloc() : memref<1024xf32>
+ %c0 = arith.constant dense<0, 16, 32, 64> : vector<4xindex>
+ %1 = xegpu.create_tdesc %a, %c0: memref<1024xf32> -> TensorDesc<4xf32>
+
+ Example 2. It assumes subgroup size is 4, and each workitem access 8 elements.
+ It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
+ %0 = memref.alloc() : memref<1024xf32>
+ %c0 = arith.constant dense<0, 16, 32, 64> : vector<4xindex>
+ %1 = xegpu.create_tdesc %0, %c0 {chunk_size_per_lane = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
+ }];
+
+ let arguments = (ins XeGPU_BaseAddrType: $source,
+ XeGPU_OffsetType: $offsets,
+ DefaultValuedAttr<I64Attr, "1">: $chunk_size_per_lane,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+ let results = (outs XeGPU_TensorDesc:$TensorDesc);
+
+ let builders = [
+ OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source,
+ "Value": $offsets, CArg<"uint32_t", "1"> : $chunk_size_per_lane)>,
+ OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source,
+ "Value": $offsets, "IntegerAttr": $chunk_size_per_lane)>
+ ];
+ let skipDefaultBuilders = 1;
+
+ // Format: xegpu.create_tdesc %src, %offsets {mode=simt, chunk_size_per_lane=1}
+ // : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_LoadGatherOp : XeGPU_Op<"load"> {
+ let summary = "load a scalar at source[offset].";
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_MaskType: $mask,
+ OptionalAttr<I64Attr>: $vnni_axis,
+ OptionalAttr<DenseI64ArrayAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l3_hint,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+ let results = (outs XeGPU_ValueType: $value);
+
+ let builders = [
+ OpBuilder<(ins "mlir::Type": $value, "mlir::Value": $TensorDesc,
+ "mlir::Value": $mask, "mlir::IntegerAttr": $vnni_axis,
+ CArg<"mlir::DenseI64ArrayAttr", "mlir::DenseI64ArrayAttr()">: $transpose,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l1_hint,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l2_hint,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l3_hint)>,
+
+ OpBuilder<(ins "mlir::Type": $value, "mlir::Value": $TensorDesc,
+ "mlir::Value": $mask, "mlir::IntegerAttr": $vnni_axis,
+ CArg<"DenseI64ArrayAttr", "DenseI64ArrayAttr()">: $transpose,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l1_hint,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l2_hint,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l3_hint)>
+ ];
+ let skipDefaultBuilders = 1;
+
+ // Format: %2 = xegpu.load %1, %0 {transpose = [1, 0], l1_hint = cached, l2_hint = uncached}
+ // : !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32>
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", []> {
+ let summary = "store a scalar to source[offset].";
+
+ let arguments = (ins
+ XeGPU_ValueType: $value,
+ XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_MaskType: $mask,
+ OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l3_hint,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode
+ );
+
+ let builders = [
+ OpBuilder<(ins "Value": $value, "Value": $TensorDesc, "Value": $mask,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l1_hint,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l2_hint,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l3_hint)>,
+ OpBuilder<(ins "Value": $value, "Value": $TensorDesc, "Value": $mask,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::WRITE_BACK">: $l1_hint,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::WRITE_BACK">: $l2_hint,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::WRITE_BACK">: $l3_hint)>
+ ];
+ let skipDefaultBuilders = 1;
+
+ // Format: %3 = xegpu.load %1, %0 {l1_hint = cached, l2_hint = uncached}
+ // : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
+ let summary = "prefetches a nD block to cache";
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l3_hint,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+
+ let builders = [
+ OpBuilder<(ins "Value": $TensorDesc,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l1_hint,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l2_hint,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l3_hint)>,
+ OpBuilder<(ins "Value": $TensorDesc,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l1_hint,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l2_hint,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l3_hint)>
+ ];
+
+ let skipDefaultBuilders = 1;
+ let hasVerifier = 1;
+
+ // Format: xegpu.prefetch %tdesc {l1_hint = cached, l2_hint = uncached}:
+ // !xegpu.tensor_desc<8x16xf16>
+ let hasCustomAssemblyFormat = 1;
+}
+
+def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", []> {
+ let summary = "update the offsets for the given tensor descriptor";
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_OffsetType: $offsets,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+ let results = (outs XeGPU_TensorDesc: $result);
+
+ let builders = [
+ OpBuilder<(ins "Type": $result, "Value": $TensorDesc, "Value": $offsets)>
+ ];
+
+ let skipDefaultBuilders = 1;
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_DpasOp : XeGPU_Op<"dpas"> {
+ let summary = "performs dpas computation";
+ let arguments = (ins
+ XeGPU_DpasOpType : $lhs,
+ XeGPU_DpasOpType : $rhs,
+ Optional<XeGPU_Vector2DType>: $acc,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode
+ );
+ let results = (outs XeGPU_Vector2DType: $result);
+ let hasCustomAssemblyFormat = 1;
+
+ let extraClassDeclaration = [{
+ VectorType getLhsType() {
+ return ::llvm::cast<VectorType>(getLhs().getType());
+ }
+
+ VectorType getRhsType() {
+ return ::llvm::cast<VectorType>(getRhs().getType());
+ }
+
+ VectorType getAccType() {
+ return ::llvm::cast<VectorType>(getAcc().getType());
+ }
+
+ VectorType getResultType() {
+ return getResult().getType();
+ }
+ }];
+
+ let hasVerifier = 1;
+}
+
+def XeGPU_InvokeSIMDOp : XeGPU_Op<"invoke_SIMD", []> {
+ let summary = "Invoke_SIMD operation";
+ let description = [{
+ The `xegpu.invoke_SIMD` operation works similar to a direct call to a function.
+ But it is special to Intel GPU.
----------------
joker-eph wrote:
The description "it is like call, but special" would deserve more details.
https://github.com/llvm/llvm-project/pull/78483
More information about the Mlir-commits
mailing list