[Mlir-commits] [mlir] [mlir][nvgpu] separate ops, types, attributes definitions in NVGPU dialect. (PR #129846)
lonely eagle
llvmlistbot at llvm.org
Wed Mar 5 01:02:55 PST 2025
https://github.com/linuxlonelyeagle updated https://github.com/llvm/llvm-project/pull/129846
>From 797a4f1ba85e45b65a1b66083416e6abf21c80af Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038 at qq.com>
Date: Wed, 5 Mar 2025 15:47:07 +0800
Subject: [PATCH 1/2] separate ops, types, attributes definitions in NVGPU
dialect.
---
.../mlir/Dialect/NVGPU/IR/CMakeLists.txt | 13 +-
mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td | 721 +-----------------
.../mlir/Dialect/NVGPU/IR/NVGPUDialect.h | 4 +-
.../include/mlir/Dialect/NVGPU/IR/NVGPUOps.td | 633 +++++++++++++++
.../mlir/Dialect/NVGPU/IR/NVGPUTypes.td | 112 +++
mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt | 2 +-
mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp | 8 +-
7 files changed, 762 insertions(+), 731 deletions(-)
create mode 100644 mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
create mode 100644 mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
index 13d754ca06316..ecdaae7f24d93 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
@@ -1,5 +1,10 @@
add_mlir_dialect(NVGPU nvgpu)
-add_mlir_doc(NVGPU NVGPU Dialects/ -gen-dialect-doc)
+add_mlir_doc(NVGPUOps NVGPU Dialects/ -gen-dialect-doc)
+
+set(LLVM_TARGET_DEFINITIONS NVGPUOps.td)
+mlir_tablegen(NVGPUOps.h.inc -gen-op-decls)
+mlir_tablegen(NVGPUOps.cpp.inc -gen-op-defs)
+add_public_tablegen_target(MLIRNVGPUOpsIncGen)
set(LLVM_TARGET_DEFINITIONS NVGPU.td)
mlir_tablegen(NVGPUEnums.h.inc -gen-enum-decls)
@@ -11,7 +16,7 @@ mlir_tablegen(NVGPUAttrDefs.h.inc -gen-attrdef-decls)
mlir_tablegen(NVGPUAttrDefs.cpp.inc -gen-attrdef-defs)
add_public_tablegen_target(MLIRNVGPUAttributesIncGen)
-set(LLVM_TARGET_DEFINITIONS NVGPU.td)
-mlir_tablegen(NVGPUAttrTypes.h.inc -gen-typedef-decls)
-mlir_tablegen(NVGPUAttrTypes.cpp.inc -gen-typedef-decls)
+set(LLVM_TARGET_DEFINITIONS NVGPUTypes.td)
+mlir_tablegen(NVGPUTypeDefs.h.inc -gen-typedef-decls)
+mlir_tablegen(NVGPUTypeDefs.cpp.inc -gen-typedef-defs)
add_public_tablegen_target(MLIRNVGPUTypesIncGen)
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index f48fa9976da12..6b5470310e4a1 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -1,21 +1,10 @@
-//===-- NVGPU.td - NVGPU dialect operation definitions *- tablegen -*------===//
+//===-- NVGPU.td - Attribute defs for NVGPU dialect *- tablegen -*---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// This file defines the basic operations for the NVGPU dialect.
-//
-// This NVGPU provides a bridge between the target agnostic GPU and Vector
-// dialects and lower level NVVM dialect. This allow representing PTX specific
-// operations while using MLIR high level concepts like memref and 2-D vector.
-//
-// Ops semantic are going to be based on vendor specific PTX defintion:
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html
-//
-//===----------------------------------------------------------------------===//
#ifndef NVGPU
#define NVGPU
@@ -127,712 +116,4 @@ def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
def RcpRoundingModeAttr : EnumAttr<NVGPU_Dialect, RcpRoundingMode, "rcp_rounding_mode">;
-//===----------------------------------------------------------------------===//
-// NVGPU Type Definitions
-//===----------------------------------------------------------------------===//
-
-class NVGPU_Type<string name, string typeMnemonic,
- list<Trait> traits = []> : TypeDef<NVGPU_Dialect, name, traits> {
- let mnemonic = typeMnemonic;
-}
-
-def NVGPU_DeviceAsyncToken : NVGPU_Type<"DeviceAsyncToken",
- "device.async.token", []> {
- let summary = "device async token type";
- let description = [{
- `nvgpu.device.async.token` is a type returned by an asynchronous operation
- that runs on the GPU (device). It is used to establish an SSA-based link
- between the async operation (e.g. DeviceAsyncCopy) and operations that
- group or synchronize the async operations (e.g. DeviceAsyncCreateGroupOp,
- DeviceAsyncWaitOp).
- }];
-}
-
-def NVGPU_MBarrierGroup : NVGPU_Type<"MBarrierGroup", "mbarrier.group", []> {
- let summary = "mbarrier barrier type";
- let description = [{
- This is the type for one or more mbarrier object in shared memory that is
- used to synchronize a variable number of threads.
-
- If `num_barriers` is not set, the number of mbarrier objects is 1.
-
- A mbarrier object is 64 bit with 8 byte alignment. The mbarrier object
- can be initiated and invalidated.
-
- [See for more details in PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object)
- }];
- let parameters = (ins "Attribute":$memorySpace, DefaultValuedParameter<"unsigned", "1">:$num_barriers);
- let assemblyFormat = "`<` struct(params) `>`";
- let builders = [
- TypeBuilder<(ins "Attribute":$memorySpace), [{
- return $_get($_ctxt, memorySpace, 1);
- }]>
- ];
-}
-
-def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map
-def NVGPU_TensorMapDescriptor : NVGPU_Type<"TensorMapDescriptor", "tensormap.descriptor", []> {
- let summary = "TensorMap descriptor";
- let parameters = (ins "MemRefType":$tensor,
- EnumParameter<TensorMapSwizzleKind>:$swizzle,
- EnumParameter<TensorMapL2PromoKind>:$l2promo,
- EnumParameter<TensorMapOOBKind>:$oob,
- EnumParameter<TensorMapInterleaveKind>:$interleave);
- let description = [{
- `nvgpu.tma.descriptor` is a type that represents a TMA descriptor. It is
- 128-byte object either in constant space or kernel paramater.
- }];
- let assemblyFormat = "`<` struct(params) `>`";
-}
-
-def NVGPU_WarpgroupMatrixDescriptor : NVGPU_Type<"WarpgroupMatrixDescriptor", "warpgroup.descriptor", []> {
- let summary = "Warpgroup matrix descriptor type";
- let description = [{
- The descriptor specifies the properties of the matrix in shared memory that
- is a multiplicand in the matrix multiply and accumulate operation.
-
- The descriptor is a 64-bit value contained in a register with the following:
- ```
- +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
- | 0-13 |14-15| 16-29 |30-31| 32-45 |46-48|49-51| 52-61 |62-63|
- +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
- | 14bits |2bits| 14bits |2bits| 14bits |2bits|3bits| 10bits |2bits|
- +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
- | BaseAddr| 0 | LeadingDim| 0 | Stride | 0 |Offst| 0 |Swzle|
- +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
- ```
-
- [See for more details in PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-shared-memory-layout-matrix-descriptor)
-
- }];
- let parameters = (ins "MemRefType":$tensor);
- let assemblyFormat = "`<` struct(params) `>`";
-}
-
-def NVGPU_WarpgroupAccumulator : NVGPU_Type<"WarpgroupAccumulator", "warpgroup.accumulator", []> {
- let parameters = (ins "VectorType":$fragmented);
- let assemblyFormat = "`<` struct(params) `>`";
- let description = [{
- This type represents the result matrix obtained from `nvgpu.warpgroup.mma`.
- The `$fragmented` type signifies the distributed or fragmented result
- vector that is collectively owned by all the threads in the warp-group
- that executed `nvgpu.warpgroup.mma`.
- [See the details of register fragment layout for accumulator matrix D]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
- }];
-}
-
-//===----------------------------------------------------------------------===//
-// NVGPU Op Definitions
-//===----------------------------------------------------------------------===//
-
-class NVGPU_Op<string mnemonic, list<Trait> traits = []> :
- Op<NVGPU_Dialect, mnemonic, traits> {}
-
-def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
- MemoryEffects<[MemRead]>,
- PredOpTrait<"srcMemref and res have same element type",
- TCresVTEtIsSameAsOp<0, 0>>]> {
- let description = [{
- The `nvgpu.ldmatrix` op represents loading a matrix fragment from
- memory to registers. The source and result type must be compatible
- with lowering to the `nvvm.ldmatrix` instruction. This op represents
- the distributed version of a `vector.transfer_read` as an intermediate
- step between lowering from `vector.transfer_read` to `nvvm.ldmatrix`.
-
- This operation is meant to follow the semantic of described here:
- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
-
- Example:
- ```mlir
- %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false} :
- memref<?x?xf16, 3> -> vector<4x2xf16>
- ```
- }];
-
- let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$srcMemref,
- Variadic<Index>:$indices, BoolAttr:$transpose,
- I32Attr:$numTiles);
- let results = (outs AnyVectorOfNonZeroRank:$res);
- let assemblyFormat = [{
- $srcMemref`[` $indices `]` attr-dict `:` type($srcMemref) `->` type($res)
- }];
-
- let hasVerifier = 1;
-}
-
-class NVGPU_MmaSyncOp<string mnemonic> :
- NVGPU_Op<mnemonic, [Pure,
- PredOpTrait<"matrixA and matrixB have same element type",
- TCopVTEtIsSameAs<0, 1>>]> {
- code extraBaseClassDeclaration = [{
- std::array<int64_t, 3> getMmaShapeAsArray() {
- ArrayAttr mmaShape = this->getMmaShape();
- assert(mmaShape.size() == 3 && "mmaShape should be three integers");
- return {::llvm::cast<IntegerAttr>(mmaShape[0]).getInt(),
- ::llvm::cast<IntegerAttr>(mmaShape[1]).getInt(),
- ::llvm::cast<IntegerAttr>(mmaShape[2]).getInt()};
- }
- }];
-
- let hasVerifier = 1;
-}
-
-def NVGPU_MmaSyncOp : NVGPU_MmaSyncOp<"mma.sync"> {
- let description = [{
- The `nvgpu.mma.sync` op represents the warp-level matrix-multiply-and-
- accumulate (mma) operation that is compatible with `nvvm.mma.sync`.
- The operands and results vector sizes are thread-level onwership to
- the warp-level mma operation shape. `mmaShape` attribute holds the
- warp-level matrix-multiply shape.
-
- The `nvgpu.mma.sync` op serves as an intermediate point between lowering from
- `vector.contract` to `nvvm.mma.sync`.
-
- This operation is meant to follow the semantic of described here:
- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
-
- Example:
-
- ```mlir
- %res = nvgpu.mma.sync (%matrixA, %matrixB, %matrixC) {mmaShape = [16, 8, 16]} :
- (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>
- ```
- }];
- let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
- AnyVectorOfNonZeroRank:$matrixB,
- AnyVectorOfNonZeroRank:$matrixC,
- I64ArrayAttr:$mmaShape,
- OptionalAttr<UnitAttr>:$tf32Enabled);
-
- let results = (outs AnyVectorOfNonZeroRank:$res);
-
- let builders = [
- OpBuilder<(ins "Value":$matrixA,
- "Value":$matrixB,
- "Value":$matrixC,
- "ArrayAttr":$mmaShape)>,
- OpBuilder<(ins "Value":$matrixA,
- "Value":$matrixB,
- "Value":$matrixC,
- "ArrayRef<int64_t>":$mmaShape,
- CArg<"bool", "false">:$tf32Enabled)>
- ];
-
- let assemblyFormat = [{
- `(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict
- `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
- }];
-
- let extraClassDeclaration = extraBaseClassDeclaration;
-}
-
-def NVGPU_MmaSparseSyncMetadataType : FixedVectorOfLengthAndType<[2], [I16]>,
- BuildableType<"::mlir::VectorType::get("
- "{2},$_builder.getI16Type())">;
-
-def NVGPU_MmaSparseSyncOp : NVGPU_MmaSyncOp<"mma.sp.sync"> {
- let description = [{
- The `nvgu.mma.sp.sync` operation performs a warp-distributed MMA operation
- where operand A is "structured sparse". In this case, the `matrixA` operand
- represents the (warp-distributed) non-zero values of operand A, and the
- `sparse_metadata` operand provides the indices.
-
- The full description of the sparsity storage format and distribution scheme is
- described in the PTX docs. This operation is meant to follow the semantic
- described in the PTX documentation here:
- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-sparse-mma
-
- The way the indices are distributed among the threads in a warp is controlled
- by the optional `sparsity_selector` operand, which is `0` by default. For
- more information, please consult the PTX documentation linked above.
-
- Example (targetingthe f16 16x8x32 `mma.sp` PTX instruction):
-
- ```mlir
- nvgpu.mma.sp.sync (%a, %b, %c) metadata (%meta) {mmaShape = [16, 8, 32]} :
- (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
- ```
- }];
-
- let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
- AnyVectorOfNonZeroRank:$matrixB,
- AnyVectorOfNonZeroRank:$matrixC,
- NVGPU_MmaSparseSyncMetadataType:$sparseMetadata,
- I64ArrayAttr:$mmaShape,
- DefaultValuedAttr<I32Attr, "0">:$sparsitySelector,
- OptionalAttr<UnitAttr>:$tf32Enabled
- );
-
- let results = (outs AnyVectorOfNonZeroRank:$res);
-
- let builders = [
- OpBuilder<(ins "Value":$matrixA,
- "Value":$matrixB,
- "Value":$matrixC,
- "Value":$sparseMetadata,
- "ArrayRef<int64_t>":$mmaShape)>
- ];
-
- let assemblyFormat = [{
- `(` $matrixA`,` $matrixB`,` $matrixC `)` `metadata` `(` $sparseMetadata `)` attr-dict
- `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
- }];
-
- let extraClassDeclaration = extraBaseClassDeclaration;
-}
-
-def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
- AttrSizedOperandSegments]> {
- let summary = "device-side asynchronous copy";
- let description = [{
- The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of
- elements from source (global memory) to the destination (shared memory)
- without blocking the thread. The async copy is added to a group.
-
- This op is meant to be used with `nvgpu.device_async_create_group` and
- `nvgpu.device_async_wait` to synchronize copies as explained in those ops
- descriptions.
-
- `bypassL1` attribute is hint to the hardware to bypass the L1 cache during
- async copy, this hint may be ignored by the hardware.
-
- `dstElements` attribute is the total number of elements written to
- destination (shared memory).
-
- `srcElements` argument is the total number of elements read from
- source (global memory).
-
- `srcElements` is an optional argument and when present the op only reads
- `srcElements` number of elements from the source (global memory) and zero fills
- the rest of the elements in the destination (shared memory).
-
- In order to do a copy and wait for the result we need the following
- combination:
- ```
- // copy 1.
- %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 :memref<16xf32> to memref<16xf32, 3>
- // copy 2.
- %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
- // group 1 contains copy 1 and copy 2.
- %token1 = nvgpu.device_async_create_group %cp1, %cp2
- // copy 3.
- %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
- // group 2 contains copy 3.
- %token2 = nvgpu.device_async_create_group %cp3
- // after the wait copy 1 and copy 2 are complete.
- nvgpu.device_async_wait %token1
- // after the wait copy 3 is complete.
- nvgpu.device_async_wait %token2
- ```
-
- Example:
-
- ```mlir
- %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
- memref<4x5xf32> to memref<2x7x5xf32, 3>
- ```
- }];
- let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
- let arguments = (ins Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
- Variadic<Index>:$dstIndices,
- Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
- Variadic<Index>:$srcIndices,
- IndexAttr:$dstElements,
- Optional<Index>:$srcElements,
- OptionalAttr<UnitAttr>:$bypassL1);
- let assemblyFormat = [{
- $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $dstElements (`,` $srcElements^)?
- attr-dict `:` type($src) `to` type($dst)
- }];
- let hasVerifier = 1;
-}
-
-def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
- let summary = "device side asynchronous create group operation";
- let description = [{
- The `nvgpu.device_async_create_group` op creates a group of memory accesses
- containing all the pending `device_async_copy` operations associated with
- argument tokens. Each token can only be part of one group.
-
- It returns a token that can be use to wait until the group fully completes.
-
- This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
- as explained in those ops descriptions.
-
- Groups are executed in the order they are created.
-
- Example:
-
- ```mlir
- %0 = nvgpu.device_async_create_group
- ```
- }];
- let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
- let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
- let assemblyFormat = [{
- $inputTokens attr-dict
- }];
-}
-
-def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
- let summary = "Wait for async gpu ops to complete.";
- let description = [{
- The `nvgpu.device_async_wait` op will block the execution thread until the group
- associated with the source token is fully completed.
-
- The optional `$numGroups` attribute gives an upper bound of the number of
- groups uncompleted when the wait can unblock the thread. For example, if
- 16 async groups are pushe and `$numGroups` is set to 12, then the thread
- will unblock when 12 groups or fewer are in flight (4 groups have
- completed).
-
- Example:
-
- ```mlir
- nvgpu.device_async_wait %0
- ```
- }];
- let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
- OptionalAttr<I32Attr>:$numGroups);
- let assemblyFormat = [{
- $asyncDependencies attr-dict
- }];
-}
-
-def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
- let summary = "Creates a `nvgpu.mbarrier` object.";
- let description = [{
- The Op generates one or more `mbarrier` object, which is a barrier created in
- shared memory and supports various synchronization behaviors for threads.
-
- The `mbarrier` object has the following type and alignment requirements:
- Type: .b64, Alignment: 8, Memory space: .shared
-
- Example:
- ```mlir
- %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
- ```
- }];
- let arguments = (ins);
- let results = (outs NVGPU_MBarrierGroup:$barriers);
- let assemblyFormat = [{
- attr-dict `->` type($barriers)
- }];
-}
-
-def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
- let summary = "Initialize the `nvgpu.mbarrier`.";
- let description = [{
- The Op initializes the `mbarrier` object with the given number of threads.
-
- Example:
- ```mlir
- %num_threads = gpu.block_dim x
- %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
- nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
- ```
- }];
- let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$count, Index:$mbarId, Optional<I1>:$predicate);
- let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
-}
-
-def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
- let summary = "Checks if the `nvgpu.mbarrier` has completed its current phase.";
- let description = [{
- Checks whether the mbarrier object has completed the phase. It is is a
- non-blocking instruction which tests for the completion of the phase.
-
- Example:
- ```mlir
- %isComplete = nvgpu.mbarrier.test.wait %barrier, %token : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token
- ```
- }];
- let arguments = (ins NVGPU_MBarrierGroup:$barriers, NVGPU_MBarrierToken:$token, Index:$mbarId);
- let results = (outs I1:$waitComplete);
- let assemblyFormat = "$barriers `[` $mbarId `]` `,` $token attr-dict `:` type($barriers) `,` type($token)";
-}
-
-def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
- let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive`.";
- let description = [{
- The Op performs arrive-on operation on the `mbarrier` object and returns a
- `nvgpu.mbarrier.token`.
-
- For more information, see
- https://docs.nvidia.com/cuda/parallel-thread-execution/#arrive-on-operation-on-mbarrier-object
-
- Example:
- ```mlir
- %token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
- ```
- }];
- let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId);
- let results = (outs NVGPU_MBarrierToken:$token);
-let assemblyFormat = "$barriers `[` $mbarId `]` attr-dict `:` type($barriers) `->` type($token)";
-}
-
-def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> {
- let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive.nocomplete` as non-blocking.";
- let description = [{
- The Op performs arrive-on operation on the `mbarrier` object and returns a
- `nvgpu.mbarrier.token`.
-
- The Op does not cause the `nvgpu.mbarrier` to complete its current phase.
-
- Example:
- ```mlir
- %token = nvgpu.mbarrier.arrive.noComplete %barrier, %count : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
- ```
- }];
- let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId,
- Index:$count);
- let results = (outs NVGPU_MBarrierToken:$token);
- let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers) `->` type($token)";
-}
-
-def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> {
- let summary = "Performs expect_tx operation on the `nvgpu.mbarrier.arrive`";
- let description = [{
- A thread executing the Op performs an expect-tx operation on the mbarrier
- object at the location specified by the address operand $barrier. The
- expect-tx operation, with an $txcount argument, increases the tx-count of
- an mbarrier object by the value specified by $txcount. This makes the
- current phase of the mbarrier object to expect and track the completion of
- additional asynchronous transactions.
-
- The `$txCount` specifies the number of element to the expect-tx operation.
-
- Example:
- ```mlir
- nvgpu.mbarrier.arrive.expect_tx %barrier, %ic0 : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
- ```
- }];
- let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$txcount, Index:$mbarId, Optional<I1>:$predicate);
- let assemblyFormat = "$barriers `[` $mbarId `]` `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
-}
-
-def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
- let summary = "Waits for the `nvgpu.mbarrier` to complete its current phase.";
- let description = [{
- Checks whether the mbarrier object has completed the phase. It is is a
- potentially blocking instruction which tests for the completion of the
- phase. Suspended thread resumes execution when the specified phase completes
- OR before the phase completes following a system-dependent time limit.
-
- The `$phaseParity` specifies either even phase (0) or odd phase (1) to
- wait.
-
- Example:
- ```mlir
- nvgpu.mbarrier.try_wait.parity %barrier, %phaseParity, %ticks : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
- ```
- }];
- let arguments = (ins NVGPU_MBarrierGroup:$barriers, I1:$phaseParity, Index:$ticks, Index:$mbarId);
- let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phaseParity `,` $ticks attr-dict `:` type($barriers)";
-}
-
-def NVGPU_TmaPrefetchOp : NVGPU_Op<"tma.prefetch.descriptor", []> {
- let summary = "Prefetch given `nvgpu.tensormap.descriptor` ";
- let description = [{
- The Op brings the cache line containing the given `$tmaDescriptor` for
- subsequent use by the `tma.async.load` instruction.
- }];
- let arguments = (ins NVGPU_TensorMapDescriptor:$tensorMapDescriptor, Optional<I1>:$predicate);
- let assemblyFormat = [{
- $tensorMapDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type($tensorMapDescriptor)
- }];
-}
-
-def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", [AttrSizedOperandSegments]> {
- let summary = "TMA asynchronous load";
- let description = [{
- The Op loads a tile memory region from global memory to shared memory by
- Tensor Memory Access (TMA).
-
- `$tensorMapDescriptor` is tensor map descriptor which has information about
- tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
-
- The Op uses `$barrier` mbarrier based completion mechanism.
- }];
- let arguments = (ins Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
- NVGPU_MBarrierGroup:$barriers,
- NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
- Variadic<Index>:$coordinates,
- Index:$mbarId,
- Optional<I16>:$multicastMask,
- Optional<I1>:$predicate);
- let assemblyFormat = [{
- $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]`
- `to` $dst
- (`multicast_mask` `=` $multicastMask^ )?
- (`,` `predicate` `=` $predicate^)?
- attr-dict `:` type($tensorMapDescriptor) `,` type($barriers)
- `->` type($dst)
- }];
- let hasVerifier = 1;
-
-}
-
-def NVGPU_TmaAsyncStoreOp : NVGPU_Op<"tma.async.store", [AttrSizedOperandSegments]> {
- let summary = "TMA asynchronous store";
- let description = [{
- The Op store a tile memory region from global memory to shared memory by
- Tensor Memory Access (TMA).
-
- `$tensorMapDescriptor` is tensor map descriptor which has information about
- tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
- }];
- let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
- Arg<NVGPU_TensorMapDescriptor, "", [MemWriteAt<0, FullEffect>]>:$tensorMapDescriptor,
- Variadic<Index>:$coordinates,
- Optional<I1>:$predicate);
- let assemblyFormat = [{
- $src `to` $tensorMapDescriptor `[` $coordinates `]`
- (`,` `predicate` `=` $predicate^)?
- attr-dict `:` type($src)
- `->` type($tensorMapDescriptor)
- }];
- let hasVerifier = 1;
-}
-
-def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
- let summary = "TMA create descriptor";
- let description = [{
- The Op creates a tensor map descriptor object representing tiled memory
- region. To do that it calls CUDA Driver's `cuTensorMapEncodeTiled`. The
- descriptor is used by Tensor Memory Access (TMA).
-
- The `tensor` is the source tensor to be tiled.
-
- The `boxDimensions` is the size of the tiled memory region in each dimension.
-
- For more information see below:
- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
- }];
-
- let arguments = (ins AnyUnrankedMemRef:$tensor,
- Variadic<Index>:$boxDimensions);
- let results = (outs NVGPU_TensorMapDescriptor:$tensorMap);
- let assemblyFormat = [{
- $tensor `box` `[` $boxDimensions `]` attr-dict `:` type($tensor) `->` type($tensorMap)
- }];
- let hasVerifier = 1;
-}
-
-def NVGPU_WarpgroupGenerateDescriptorOp : NVGPU_Op<"warpgroup.generate.descriptor", []> {
- let summary = "Generate a warpgroup matrix descriptor";
- let description = [{
- This Op builds a `nvgpu.warpgroup.descriptor` that is used by
- `nvgpu.warpgroup.mma` to perform warpgroup-level matrix multiply and
- accumulate.
-
- The descriptor specifies the properties of the matrix in shared memory that
- is a multiplicand in the matrix multiply and accumulate operation.
- }];
- let results = (outs NVGPU_WarpgroupMatrixDescriptor:$descriptor);
- let arguments = (ins Arg<AnyMemRef, "", [MemRead]>:$tensor,
- NVGPU_TensorMapDescriptor:$tensorMap);
- let assemblyFormat = [{$tensor `,` $tensorMap attr-dict `:` type($tensor) `,` type($tensorMap) `->` type($descriptor)}];
- let hasVerifier = 1;
-}
-
-def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> {
- let description = [{
- The `nvgpu.warpgroup.mma` op performs the warpgroup-level (4 warps)
- matrix-multiply-and-accumulate (mma) operation that results in
- `nvvm.wgmma.mma_async`.
-
- The operands are `descriptorA` and `descriptorB` that are wgmma matrix
- descriptors that shows the properties of the matrix in shared memory. The
- results are thread-level ownership to the warpgroup-level mma operation
- shape. The shape is deduced from the descriptor types and output vector.
-
- The Op encapsulates multiple `nvvm.wgmma.mma_async` operations to complete
- the given shape. As `nvvm.wgmma.async` Op, or its corresponding PTX
- instruction, is asynchronous, this Op groups the `nvvm.wgmma.async` and
- surrounds them between `wgmma.fence.aligned` and
- `wgmma.commit.group.sync.aligned`, `wgmma.wait.group.sync.aligned` Ops.
-
- Example:
- ```mlir
- %r1,%r2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc2:
- !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>,
- !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>,
- !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
- !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
- ->
- !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
- !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
- ```
- }];
-
- let arguments = (ins NVGPU_WarpgroupMatrixDescriptor:$descriptorA,
- NVGPU_WarpgroupMatrixDescriptor:$descriptorB,
- DefaultValuedOptionalAttr<I64Attr, "1">:$waitGroup,
- OptionalAttr<UnitAttr>:$transposeA,
- OptionalAttr<UnitAttr>:$transposeB,
- NVGPU_WarpgroupAccumulator:$matrixC);
- let results = (outs NVGPU_WarpgroupAccumulator:$matrixD);
- let assemblyFormat = [{
- $descriptorA`,` $descriptorB`,` $matrixC attr-dict
- `:` type($descriptorA) `,` type($descriptorB) `,` type($matrixC) `->` type($matrixD)
- }];
- let hasVerifier = 1;
-}
-
-def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> {
- let description = [{
- The `nvgpu.warpgroup.mma.store` op performs the store of fragmented result
- in $matrixD to given memref.
-
- [See the details of register fragment layout for accumulator matrix D]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
-
- Note that, the op must be run with warp group.
- }];
-
- let arguments = (ins NVGPU_WarpgroupAccumulator:$matrixD,
- Arg<AnyMemRef, "", [MemWrite]>:$dstMemref);
-
- let assemblyFormat = [{
- $matrixD `,` $dstMemref attr-dict `:` type($matrixD) `to` type($dstMemref)
- }];
- let hasVerifier = 1;
-}
-
-def NVGPU_WarpgroupMmaInitAccumulatorOp : NVGPU_Op<"warpgroup.mma.init.accumulator"> {
- let summary = "Initializes the accumulator matrix";
-
- let description = [{
- This Op generates and initializes the accumulator matrix for
- `nvgpu.warpgroup.mma` op to perform matrix-multiply-and-accumulate.
- }];
- let results = (outs NVGPU_WarpgroupAccumulator:$matrixC);
- let assemblyFormat = "attr-dict `->` type($matrixC)";
- let hasVerifier = 1;
-}
-
-def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
- SameOperandsAndResultType]> {
- let summary = "The reciprocal calculation for vector types";
- let description = [{
- Reciprocal calculation for `vector` types using `nvvm.rcp` OPs.
-
- Currently, only the `approx` rounding mode and `ftz` are supported, and only for the `f32` type.
-
- The input and output must be of the same vector type and shape.
- }];
- let arguments = (ins VectorOfNonZeroRankOf<[F32]>:$in,
- DefaultValuedAttr<RcpRoundingModeAttr, "RcpRoundingMode::APPROX">:$rounding,
- UnitAttr:$ftz);
- let results = (outs VectorOfNonZeroRankOf<[F32]>:$out);
- let assemblyFormat = [{
- $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}`
- attr-dict `:` type($out)
- }];
- let hasVerifier = 1;
-}
#endif // NVGPU
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
index db4c63b3390eb..61a57fb60bda4 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
@@ -53,11 +53,11 @@ constexpr unsigned kMaxTMALastdimByte = 128;
#include "mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.h.inc"
#define GET_TYPEDEF_CLASSES
-#include "mlir/Dialect/NVGPU/IR/NVGPUTypes.h.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.h.inc"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h.inc"
#define GET_OP_CLASSES
-#include "mlir/Dialect/NVGPU/IR/NVGPU.h.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUOps.h.inc"
#endif // MLIR_DIALECT_NVGPU_NVGPUDIALECT_H_
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
new file mode 100644
index 0000000000000..ab2ddf278b4a4
--- /dev/null
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -0,0 +1,633 @@
+//===-- NVGPUOps.td - NVGPU dialect operation definitions *- tablegen -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the basic operations for the NVGPU dialect.
+//
+// This NVGPU provides a bridge between the target agnostic GPU and Vector
+// dialects and lower level NVVM dialect. This allow representing PTX specific
+// operations while using MLIR high level concepts like memref and 2-D vector.
+//
+// Ops semantic are going to be based on vendor specific PTX defintion:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html
+//
+//===----------------------------------------------------------------------===//
+
+include "mlir/Dialect/NVGPU/IR/NVGPU.td"
+include "mlir/Dialect/NVGPU/IR/NVGPUTypes.td"
+
+//===----------------------------------------------------------------------===//
+// NVGPU Op Definitions
+//===----------------------------------------------------------------------===//
+
+class NVGPU_Op<string mnemonic, list<Trait> traits = []> :
+ Op<NVGPU_Dialect, mnemonic, traits> {}
+
+def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
+ MemoryEffects<[MemRead]>,
+ PredOpTrait<"srcMemref and res have same element type",
+ TCresVTEtIsSameAsOp<0, 0>>]> {
+ let description = [{
+ The `nvgpu.ldmatrix` op represents loading a matrix fragment from
+ memory to registers. The source and result type must be compatible
+ with lowering to the `nvvm.ldmatrix` instruction. This op represents
+ the distributed version of a `vector.transfer_read` as an intermediate
+ step between lowering from `vector.transfer_read` to `nvvm.ldmatrix`.
+
+ This operation is meant to follow the semantic of described here:
+ https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
+
+ Example:
+ ```mlir
+ %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false} :
+ memref<?x?xf16, 3> -> vector<4x2xf16>
+ ```
+ }];
+
+ let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$srcMemref,
+ Variadic<Index>:$indices, BoolAttr:$transpose,
+ I32Attr:$numTiles);
+ let results = (outs AnyVectorOfNonZeroRank:$res);
+ let assemblyFormat = [{
+ $srcMemref`[` $indices `]` attr-dict `:` type($srcMemref) `->` type($res)
+ }];
+
+ let hasVerifier = 1;
+}
+
+class NVGPU_MmaSyncOp<string mnemonic> :
+ NVGPU_Op<mnemonic, [Pure,
+ PredOpTrait<"matrixA and matrixB have same element type",
+ TCopVTEtIsSameAs<0, 1>>]> {
+ code extraBaseClassDeclaration = [{
+ std::array<int64_t, 3> getMmaShapeAsArray() {
+ ArrayAttr mmaShape = this->getMmaShape();
+ assert(mmaShape.size() == 3 && "mmaShape should be three integers");
+ return {::llvm::cast<IntegerAttr>(mmaShape[0]).getInt(),
+ ::llvm::cast<IntegerAttr>(mmaShape[1]).getInt(),
+ ::llvm::cast<IntegerAttr>(mmaShape[2]).getInt()};
+ }
+ }];
+
+ let hasVerifier = 1;
+}
+
+def NVGPU_MmaSyncOp : NVGPU_MmaSyncOp<"mma.sync"> {
+ let description = [{
+ The `nvgpu.mma.sync` op represents the warp-level matrix-multiply-and-
+ accumulate (mma) operation that is compatible with `nvvm.mma.sync`.
+ The operands and results vector sizes are thread-level onwership to
+ the warp-level mma operation shape. `mmaShape` attribute holds the
+ warp-level matrix-multiply shape.
+
+ The `nvgpu.mma.sync` op serves as an intermediate point between lowering from
+ `vector.contract` to `nvvm.mma.sync`.
+
+ This operation is meant to follow the semantic of described here:
+ https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
+
+ Example:
+
+ ```mlir
+ %res = nvgpu.mma.sync (%matrixA, %matrixB, %matrixC) {mmaShape = [16, 8, 16]} :
+ (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>
+ ```
+ }];
+ let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
+ AnyVectorOfNonZeroRank:$matrixB,
+ AnyVectorOfNonZeroRank:$matrixC,
+ I64ArrayAttr:$mmaShape,
+ OptionalAttr<UnitAttr>:$tf32Enabled);
+
+ let results = (outs AnyVectorOfNonZeroRank:$res);
+
+ let builders = [
+ OpBuilder<(ins "Value":$matrixA,
+ "Value":$matrixB,
+ "Value":$matrixC,
+ "ArrayAttr":$mmaShape)>,
+ OpBuilder<(ins "Value":$matrixA,
+ "Value":$matrixB,
+ "Value":$matrixC,
+ "ArrayRef<int64_t>":$mmaShape,
+ CArg<"bool", "false">:$tf32Enabled)>
+ ];
+
+ let assemblyFormat = [{
+ `(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict
+ `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
+ }];
+
+ let extraClassDeclaration = extraBaseClassDeclaration;
+}
+
+def NVGPU_MmaSparseSyncMetadataType : FixedVectorOfLengthAndType<[2], [I16]>,
+ BuildableType<"::mlir::VectorType::get("
+ "{2},$_builder.getI16Type())">;
+
+def NVGPU_MmaSparseSyncOp : NVGPU_MmaSyncOp<"mma.sp.sync"> {
+ let description = [{
+ The `nvgu.mma.sp.sync` operation performs a warp-distributed MMA operation
+ where operand A is "structured sparse". In this case, the `matrixA` operand
+ represents the (warp-distributed) non-zero values of operand A, and the
+ `sparse_metadata` operand provides the indices.
+
+ The full description of the sparsity storage format and distribution scheme is
+ described in the PTX docs. This operation is meant to follow the semantic
+ described in the PTX documentation here:
+ https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-sparse-mma
+
+ The way the indices are distributed among the threads in a warp is controlled
+ by the optional `sparsity_selector` operand, which is `0` by default. For
+ more information, please consult the PTX documentation linked above.
+
+ Example (targetingthe f16 16x8x32 `mma.sp` PTX instruction):
+
+ ```mlir
+ nvgpu.mma.sp.sync (%a, %b, %c) metadata (%meta) {mmaShape = [16, 8, 32]} :
+ (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
+ ```
+ }];
+
+ let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
+ AnyVectorOfNonZeroRank:$matrixB,
+ AnyVectorOfNonZeroRank:$matrixC,
+ NVGPU_MmaSparseSyncMetadataType:$sparseMetadata,
+ I64ArrayAttr:$mmaShape,
+ DefaultValuedAttr<I32Attr, "0">:$sparsitySelector,
+ OptionalAttr<UnitAttr>:$tf32Enabled
+ );
+
+ let results = (outs AnyVectorOfNonZeroRank:$res);
+
+ let builders = [
+ OpBuilder<(ins "Value":$matrixA,
+ "Value":$matrixB,
+ "Value":$matrixC,
+ "Value":$sparseMetadata,
+ "ArrayRef<int64_t>":$mmaShape)>
+ ];
+
+ let assemblyFormat = [{
+ `(` $matrixA`,` $matrixB`,` $matrixC `)` `metadata` `(` $sparseMetadata `)` attr-dict
+ `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
+ }];
+
+ let extraClassDeclaration = extraBaseClassDeclaration;
+}
+
+def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
+ AttrSizedOperandSegments]> {
+ let summary = "device-side asynchronous copy";
+ let description = [{
+ The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of
+ elements from source (global memory) to the destination (shared memory)
+ without blocking the thread. The async copy is added to a group.
+
+ This op is meant to be used with `nvgpu.device_async_create_group` and
+ `nvgpu.device_async_wait` to synchronize copies as explained in those ops
+ descriptions.
+
+ `bypassL1` attribute is hint to the hardware to bypass the L1 cache during
+ async copy, this hint may be ignored by the hardware.
+
+ `dstElements` attribute is the total number of elements written to
+ destination (shared memory).
+
+ `srcElements` argument is the total number of elements read from
+ source (global memory).
+
+ `srcElements` is an optional argument and when present the op only reads
+ `srcElements` number of elements from the source (global memory) and zero fills
+ the rest of the elements in the destination (shared memory).
+
+ In order to do a copy and wait for the result we need the following
+ combination:
+ ```
+ // copy 1.
+ %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 :memref<16xf32> to memref<16xf32, 3>
+ // copy 2.
+ %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+ // group 1 contains copy 1 and copy 2.
+ %token1 = nvgpu.device_async_create_group %cp1, %cp2
+ // copy 3.
+ %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+ // group 2 contains copy 3.
+ %token2 = nvgpu.device_async_create_group %cp3
+ // after the wait copy 1 and copy 2 are complete.
+ nvgpu.device_async_wait %token1
+ // after the wait copy 3 is complete.
+ nvgpu.device_async_wait %token2
+ ```
+
+ Example:
+
+ ```mlir
+ %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
+ memref<4x5xf32> to memref<2x7x5xf32, 3>
+ ```
+ }];
+ let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
+ let arguments = (ins Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
+ Variadic<Index>:$dstIndices,
+ Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
+ Variadic<Index>:$srcIndices,
+ IndexAttr:$dstElements,
+ Optional<Index>:$srcElements,
+ OptionalAttr<UnitAttr>:$bypassL1);
+ let assemblyFormat = [{
+ $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $dstElements (`,` $srcElements^)?
+ attr-dict `:` type($src) `to` type($dst)
+ }];
+ let hasVerifier = 1;
+}
+
+def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
+ let summary = "device side asynchronous create group operation";
+ let description = [{
+ The `nvgpu.device_async_create_group` op creates a group of memory accesses
+ containing all the pending `device_async_copy` operations associated with
+ argument tokens. Each token can only be part of one group.
+
+ It returns a token that can be use to wait until the group fully completes.
+
+ This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
+ as explained in those ops descriptions.
+
+ Groups are executed in the order they are created.
+
+ Example:
+
+ ```mlir
+ %0 = nvgpu.device_async_create_group
+ ```
+ }];
+ let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
+ let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
+ let assemblyFormat = [{
+ $inputTokens attr-dict
+ }];
+}
+
+def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
+ let summary = "Wait for async gpu ops to complete.";
+ let description = [{
+ The `nvgpu.device_async_wait` op will block the execution thread until the group
+ associated with the source token is fully completed.
+
+ The optional `$numGroups` attribute gives an upper bound of the number of
+ groups uncompleted when the wait can unblock the thread. For example, if
+ 16 async groups are pushe and `$numGroups` is set to 12, then the thread
+ will unblock when 12 groups or fewer are in flight (4 groups have
+ completed).
+
+ Example:
+
+ ```mlir
+ nvgpu.device_async_wait %0
+ ```
+ }];
+ let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
+ OptionalAttr<I32Attr>:$numGroups);
+ let assemblyFormat = [{
+ $asyncDependencies attr-dict
+ }];
+}
+
+def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
+ let summary = "Creates a `nvgpu.mbarrier` object.";
+ let description = [{
+ The Op generates one or more `mbarrier` object, which is a barrier created in
+ shared memory and supports various synchronization behaviors for threads.
+
+ The `mbarrier` object has the following type and alignment requirements:
+ Type: .b64, Alignment: 8, Memory space: .shared
+
+ Example:
+ ```mlir
+ %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ ```
+ }];
+ let arguments = (ins);
+ let results = (outs NVGPU_MBarrierGroup:$barriers);
+ let assemblyFormat = [{
+ attr-dict `->` type($barriers)
+ }];
+}
+
+def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
+ let summary = "Initialize the `nvgpu.mbarrier`.";
+ let description = [{
+ The Op initializes the `mbarrier` object with the given number of threads.
+
+ Example:
+ ```mlir
+ %num_threads = gpu.block_dim x
+ %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$count, Index:$mbarId, Optional<I1>:$predicate);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
+}
+
+def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
+ let summary = "Checks if the `nvgpu.mbarrier` has completed its current phase.";
+ let description = [{
+ Checks whether the mbarrier object has completed the phase. It is is a
+ non-blocking instruction which tests for the completion of the phase.
+
+ Example:
+ ```mlir
+ %isComplete = nvgpu.mbarrier.test.wait %barrier, %token : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, NVGPU_MBarrierToken:$token, Index:$mbarId);
+ let results = (outs I1:$waitComplete);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $token attr-dict `:` type($barriers) `,` type($token)";
+}
+
+def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
+ let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive`.";
+ let description = [{
+ The Op performs arrive-on operation on the `mbarrier` object and returns a
+ `nvgpu.mbarrier.token`.
+
+ For more information, see
+ https://docs.nvidia.com/cuda/parallel-thread-execution/#arrive-on-operation-on-mbarrier-object
+
+ Example:
+ ```mlir
+ %token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId);
+ let results = (outs NVGPU_MBarrierToken:$token);
+let assemblyFormat = "$barriers `[` $mbarId `]` attr-dict `:` type($barriers) `->` type($token)";
+}
+
+def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> {
+ let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive.nocomplete` as non-blocking.";
+ let description = [{
+ The Op performs arrive-on operation on the `mbarrier` object and returns a
+ `nvgpu.mbarrier.token`.
+
+ The Op does not cause the `nvgpu.mbarrier` to complete its current phase.
+
+ Example:
+ ```mlir
+ %token = nvgpu.mbarrier.arrive.noComplete %barrier, %count : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId,
+ Index:$count);
+ let results = (outs NVGPU_MBarrierToken:$token);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers) `->` type($token)";
+}
+
+def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> {
+ let summary = "Performs expect_tx operation on the `nvgpu.mbarrier.arrive`";
+ let description = [{
+ A thread executing the Op performs an expect-tx operation on the mbarrier
+ object at the location specified by the address operand $barrier. The
+ expect-tx operation, with an $txcount argument, increases the tx-count of
+ an mbarrier object by the value specified by $txcount. This makes the
+ current phase of the mbarrier object to expect and track the completion of
+ additional asynchronous transactions.
+
+ The `$txCount` specifies the number of element to the expect-tx operation.
+
+ Example:
+ ```mlir
+ nvgpu.mbarrier.arrive.expect_tx %barrier, %ic0 : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$txcount, Index:$mbarId, Optional<I1>:$predicate);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
+}
+
+def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
+ let summary = "Waits for the `nvgpu.mbarrier` to complete its current phase.";
+ let description = [{
+ Checks whether the mbarrier object has completed the phase. It is is a
+ potentially blocking instruction which tests for the completion of the
+ phase. Suspended thread resumes execution when the specified phase completes
+ OR before the phase completes following a system-dependent time limit.
+
+ The `$phaseParity` specifies either even phase (0) or odd phase (1) to
+ wait.
+
+ Example:
+ ```mlir
+ nvgpu.mbarrier.try_wait.parity %barrier, %phaseParity, %ticks : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, I1:$phaseParity, Index:$ticks, Index:$mbarId);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phaseParity `,` $ticks attr-dict `:` type($barriers)";
+}
+
+def NVGPU_TmaPrefetchOp : NVGPU_Op<"tma.prefetch.descriptor", []> {
+ let summary = "Prefetch given `nvgpu.tensormap.descriptor` ";
+ let description = [{
+ The Op brings the cache line containing the given `$tmaDescriptor` for
+ subsequent use by the `tma.async.load` instruction.
+ }];
+ let arguments = (ins NVGPU_TensorMapDescriptor:$tensorMapDescriptor, Optional<I1>:$predicate);
+ let assemblyFormat = [{
+ $tensorMapDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type($tensorMapDescriptor)
+ }];
+}
+
+def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", [AttrSizedOperandSegments]> {
+ let summary = "TMA asynchronous load";
+ let description = [{
+ The Op loads a tile memory region from global memory to shared memory by
+ Tensor Memory Access (TMA).
+
+ `$tensorMapDescriptor` is tensor map descriptor which has information about
+ tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
+
+ The Op uses `$barrier` mbarrier based completion mechanism.
+ }];
+ let arguments = (ins Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
+ NVGPU_MBarrierGroup:$barriers,
+ NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
+ Variadic<Index>:$coordinates,
+ Index:$mbarId,
+ Optional<I16>:$multicastMask,
+ Optional<I1>:$predicate);
+ let assemblyFormat = [{
+ $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]`
+ `to` $dst
+ (`multicast_mask` `=` $multicastMask^ )?
+ (`,` `predicate` `=` $predicate^)?
+ attr-dict `:` type($tensorMapDescriptor) `,` type($barriers)
+ `->` type($dst)
+ }];
+ let hasVerifier = 1;
+
+}
+
+def NVGPU_TmaAsyncStoreOp : NVGPU_Op<"tma.async.store", [AttrSizedOperandSegments]> {
+ let summary = "TMA asynchronous store";
+ let description = [{
+ The Op store a tile memory region from global memory to shared memory by
+ Tensor Memory Access (TMA).
+
+ `$tensorMapDescriptor` is tensor map descriptor which has information about
+ tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
+ }];
+ let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
+ Arg<NVGPU_TensorMapDescriptor, "", [MemWriteAt<0, FullEffect>]>:$tensorMapDescriptor,
+ Variadic<Index>:$coordinates,
+ Optional<I1>:$predicate);
+ let assemblyFormat = [{
+ $src `to` $tensorMapDescriptor `[` $coordinates `]`
+ (`,` `predicate` `=` $predicate^)?
+ attr-dict `:` type($src)
+ `->` type($tensorMapDescriptor)
+ }];
+ let hasVerifier = 1;
+}
+
+def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
+ let summary = "TMA create descriptor";
+ let description = [{
+ The Op creates a tensor map descriptor object representing tiled memory
+ region. To do that it calls CUDA Driver's `cuTensorMapEncodeTiled`. The
+ descriptor is used by Tensor Memory Access (TMA).
+
+ The `tensor` is the source tensor to be tiled.
+
+ The `boxDimensions` is the size of the tiled memory region in each dimension.
+
+ For more information see below:
+ https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
+ }];
+
+ let arguments = (ins AnyUnrankedMemRef:$tensor,
+ Variadic<Index>:$boxDimensions);
+ let results = (outs NVGPU_TensorMapDescriptor:$tensorMap);
+ let assemblyFormat = [{
+ $tensor `box` `[` $boxDimensions `]` attr-dict `:` type($tensor) `->` type($tensorMap)
+ }];
+ let hasVerifier = 1;
+}
+
+def NVGPU_WarpgroupGenerateDescriptorOp : NVGPU_Op<"warpgroup.generate.descriptor", []> {
+ let summary = "Generate a warpgroup matrix descriptor";
+ let description = [{
+ This Op builds a `nvgpu.warpgroup.descriptor` that is used by
+ `nvgpu.warpgroup.mma` to perform warpgroup-level matrix multiply and
+ accumulate.
+
+ The descriptor specifies the properties of the matrix in shared memory that
+ is a multiplicand in the matrix multiply and accumulate operation.
+ }];
+ let results = (outs NVGPU_WarpgroupMatrixDescriptor:$descriptor);
+ let arguments = (ins Arg<AnyMemRef, "", [MemRead]>:$tensor,
+ NVGPU_TensorMapDescriptor:$tensorMap);
+ let assemblyFormat = [{$tensor `,` $tensorMap attr-dict `:` type($tensor) `,` type($tensorMap) `->` type($descriptor)}];
+ let hasVerifier = 1;
+}
+
+def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> {
+ let description = [{
+ The `nvgpu.warpgroup.mma` op performs the warpgroup-level (4 warps)
+ matrix-multiply-and-accumulate (mma) operation that results in
+ `nvvm.wgmma.mma_async`.
+
+ The operands are `descriptorA` and `descriptorB` that are wgmma matrix
+ descriptors that shows the properties of the matrix in shared memory. The
+ results are thread-level ownership to the warpgroup-level mma operation
+ shape. The shape is deduced from the descriptor types and output vector.
+
+ The Op encapsulates multiple `nvvm.wgmma.mma_async` operations to complete
+ the given shape. As `nvvm.wgmma.async` Op, or its corresponding PTX
+ instruction, is asynchronous, this Op groups the `nvvm.wgmma.async` and
+ surrounds them between `wgmma.fence.aligned` and
+ `wgmma.commit.group.sync.aligned`, `wgmma.wait.group.sync.aligned` Ops.
+
+ Example:
+ ```mlir
+ %r1,%r2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc2:
+ !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>,
+ !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>,
+ !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
+ !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
+ ->
+ !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
+ !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
+ ```
+ }];
+
+ let arguments = (ins NVGPU_WarpgroupMatrixDescriptor:$descriptorA,
+ NVGPU_WarpgroupMatrixDescriptor:$descriptorB,
+ DefaultValuedOptionalAttr<I64Attr, "1">:$waitGroup,
+ OptionalAttr<UnitAttr>:$transposeA,
+ OptionalAttr<UnitAttr>:$transposeB,
+ NVGPU_WarpgroupAccumulator:$matrixC);
+ let results = (outs NVGPU_WarpgroupAccumulator:$matrixD);
+ let assemblyFormat = [{
+ $descriptorA`,` $descriptorB`,` $matrixC attr-dict
+ `:` type($descriptorA) `,` type($descriptorB) `,` type($matrixC) `->` type($matrixD)
+ }];
+ let hasVerifier = 1;
+}
+
+def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> {
+ let description = [{
+ The `nvgpu.warpgroup.mma.store` op performs the store of fragmented result
+ in $matrixD to given memref.
+
+ [See the details of register fragment layout for accumulator matrix D]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
+
+ Note that, the op must be run with warp group.
+ }];
+
+ let arguments = (ins NVGPU_WarpgroupAccumulator:$matrixD,
+ Arg<AnyMemRef, "", [MemWrite]>:$dstMemref);
+
+ let assemblyFormat = [{
+ $matrixD `,` $dstMemref attr-dict `:` type($matrixD) `to` type($dstMemref)
+ }];
+ let hasVerifier = 1;
+}
+
+def NVGPU_WarpgroupMmaInitAccumulatorOp : NVGPU_Op<"warpgroup.mma.init.accumulator"> {
+ let summary = "Initializes the accumulator matrix";
+
+ let description = [{
+ This Op generates and initializes the accumulator matrix for
+ `nvgpu.warpgroup.mma` op to perform matrix-multiply-and-accumulate.
+ }];
+ let results = (outs NVGPU_WarpgroupAccumulator:$matrixC);
+ let assemblyFormat = "attr-dict `->` type($matrixC)";
+ let hasVerifier = 1;
+}
+
+def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
+ SameOperandsAndResultType]> {
+ let summary = "The reciprocal calculation for vector types";
+ let description = [{
+ Reciprocal calculation for `vector` types using `nvvm.rcp` OPs.
+
+ Currently, only the `approx` rounding mode and `ftz` are supported, and only for the `f32` type.
+
+ The input and output must be of the same vector type and shape.
+ }];
+ let arguments = (ins VectorOfNonZeroRankOf<[F32]>:$in,
+ DefaultValuedAttr<RcpRoundingModeAttr, "RcpRoundingMode::APPROX">:$rounding,
+ UnitAttr:$ftz);
+ let results = (outs VectorOfNonZeroRankOf<[F32]>:$out);
+ let assemblyFormat = [{
+ $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}`
+ attr-dict `:` type($out)
+ }];
+ let hasVerifier = 1;
+}
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
new file mode 100644
index 0000000000000..8966ec359843b
--- /dev/null
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
@@ -0,0 +1,112 @@
+//===- NVGPUTypes.td - NVGPU types -------------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVGPU dialect types.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// NVGPU Type Definitions
+//===----------------------------------------------------------------------===//
+
+include "mlir/IR/AttrTypeBase.td"
+include "mlir/Dialect/NVGPU/IR/NVGPU.td"
+
+class NVGPU_Type<string name, string typeMnemonic,
+ list<Trait> traits = []> : TypeDef<NVGPU_Dialect, name, traits> {
+ let mnemonic = typeMnemonic;
+}
+
+def NVGPU_DeviceAsyncToken : NVGPU_Type<"DeviceAsyncToken",
+ "device.async.token", []> {
+ let summary = "device async token type";
+ let description = [{
+ `nvgpu.device.async.token` is a type returned by an asynchronous operation
+ that runs on the GPU (device). It is used to establish an SSA-based link
+ between the async operation (e.g. DeviceAsyncCopy) and operations that
+ group or synchronize the async operations (e.g. DeviceAsyncCreateGroupOp,
+ DeviceAsyncWaitOp).
+ }];
+}
+
+def NVGPU_MBarrierGroup : NVGPU_Type<"MBarrierGroup", "mbarrier.group", []> {
+ let summary = "mbarrier barrier type";
+ let description = [{
+ This is the type for one or more mbarrier object in shared memory that is
+ used to synchronize a variable number of threads.
+
+ If `num_barriers` is not set, the number of mbarrier objects is 1.
+
+ A mbarrier object is 64 bit with 8 byte alignment. The mbarrier object
+ can be initiated and invalidated.
+
+ [See for more details in PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object)
+ }];
+ let parameters = (ins "Attribute":$memorySpace, DefaultValuedParameter<"unsigned", "1">:$num_barriers);
+ let assemblyFormat = "`<` struct(params) `>`";
+ let builders = [
+ TypeBuilder<(ins "Attribute":$memorySpace), [{
+ return $_get($_ctxt, memorySpace, 1);
+ }]>
+ ];
+}
+
+def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map
+def NVGPU_TensorMapDescriptor : NVGPU_Type<"TensorMapDescriptor", "tensormap.descriptor", []> {
+ let summary = "TensorMap descriptor";
+ let parameters = (ins "MemRefType":$tensor,
+ EnumParameter<TensorMapSwizzleKind>:$swizzle,
+ EnumParameter<TensorMapL2PromoKind>:$l2promo,
+ EnumParameter<TensorMapOOBKind>:$oob,
+ EnumParameter<TensorMapInterleaveKind>:$interleave);
+ let description = [{
+ `nvgpu.tma.descriptor` is a type that represents a TMA descriptor. It is
+ 128-byte object either in constant space or kernel paramater.
+ }];
+ let assemblyFormat = "`<` struct(params) `>`";
+}
+
+def NVGPU_WarpgroupMatrixDescriptor : NVGPU_Type<"WarpgroupMatrixDescriptor", "warpgroup.descriptor", []> {
+ let summary = "Warpgroup matrix descriptor type";
+ let description = [{
+ The descriptor specifies the properties of the matrix in shared memory that
+ is a multiplicand in the matrix multiply and accumulate operation.
+
+ The descriptor is a 64-bit value contained in a register with the following:
+ ```
+ +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
+ | 0-13 |14-15| 16-29 |30-31| 32-45 |46-48|49-51| 52-61 |62-63|
+ +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
+ | 14bits |2bits| 14bits |2bits| 14bits |2bits|3bits| 10bits |2bits|
+ +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
+ | BaseAddr| 0 | LeadingDim| 0 | Stride | 0 |Offst| 0 |Swzle|
+ +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
+ ```
+
+ [See for more details in PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-shared-memory-layout-matrix-descriptor)
+
+ }];
+ let parameters = (ins "MemRefType":$tensor);
+ let assemblyFormat = "`<` struct(params) `>`";
+}
+
+def NVGPU_WarpgroupAccumulator : NVGPU_Type<"WarpgroupAccumulator", "warpgroup.accumulator", []> {
+ let parameters = (ins "VectorType":$fragmented);
+ let assemblyFormat = "`<` struct(params) `>`";
+ let description = [{
+ This type represents the result matrix obtained from `nvgpu.warpgroup.mma`.
+ The `$fragmented` type signifies the distributed or fragmented result
+ vector that is collectively owned by all the threads in the warp-group
+ that executed `nvgpu.warpgroup.mma`.
+ [See the details of register fragment layout for accumulator matrix D]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
+ }];
+}
diff --git a/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
index 4d47ce4746dbb..10aa502ee67f8 100644
--- a/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
@@ -5,7 +5,7 @@ add_mlir_dialect_library(MLIRNVGPUDialect
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/NVGPU
DEPENDS
- MLIRNVGPUIncGen
+ MLIRNVGPUOpsIncGen
MLIRNVGPUEnumsIncGen
MLIRNVGPUAttributesIncGen
MLIRNVGPUTypesIncGen
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index ba86e8d6ceaf9..abbdb6a0f53ec 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -35,7 +35,7 @@ using namespace mlir::nvgpu;
void nvgpu::NVGPUDialect::initialize() {
addTypes<
#define GET_TYPEDEF_LIST
-#include "mlir/Dialect/NVGPU/IR/NVGPUTypes.cpp.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.cpp.inc"
>();
addAttributes<
#define GET_ATTRDEF_LIST
@@ -43,7 +43,7 @@ void nvgpu::NVGPUDialect::initialize() {
>();
addOperations<
#define GET_OP_LIST
-#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUOps.cpp.inc"
>();
}
@@ -681,7 +681,7 @@ LogicalResult RcpOp::verify() {
#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.cpp.inc"
#define GET_OP_CLASSES
-#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUOps.cpp.inc"
#define GET_TYPEDEF_CLASSES
-#include "mlir/Dialect/NVGPU/IR/NVGPUTypes.cpp.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.cpp.inc"
>From 861c6b157a019a3967f493fec431a5ef05cef3a1 Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038 at qq.com>
Date: Wed, 5 Mar 2025 17:02:40 +0800
Subject: [PATCH 2/2] fix python test and nit.
---
mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td | 6 +++---
mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td | 5 +++++
mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td | 11 ++++++++---
mlir/python/mlir/dialects/NVGPUOps.td | 2 +-
4 files changed, 17 insertions(+), 7 deletions(-)
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index 6b5470310e4a1..7f7a54cb0c57e 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef NVGPU
-#define NVGPU
+#ifndef MLIR_DIALECT_NVGPU_IR_NVGPU_TD
+#define MLIR_DIALECT_NVGPU_IR_NVGPU_TD
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
@@ -116,4 +116,4 @@ def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
def RcpRoundingModeAttr : EnumAttr<NVGPU_Dialect, RcpRoundingMode, "rcp_rounding_mode">;
-#endif // NVGPU
+#endif // MLIR_DIALECT_NVGPU_IR_NVGPU_TD
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
index ab2ddf278b4a4..eb0fb90d271ed 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -17,6 +17,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef MLIR_DIALECT_NVGPU_IR_NVGPUOPS_TD
+#define MLIR_DIALECT_NVGPU_IR_NVGPUOPS_TD
+
include "mlir/Dialect/NVGPU/IR/NVGPU.td"
include "mlir/Dialect/NVGPU/IR/NVGPUTypes.td"
@@ -631,3 +634,5 @@ def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
}];
let hasVerifier = 1;
}
+
+#endif // MLIR_DIALECT_NVGPU_IR_NVGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
index 8966ec359843b..8836a1a9dfcd8 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
@@ -11,13 +11,16 @@
//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// NVGPU Type Definitions
-//===----------------------------------------------------------------------===//
+#ifndef MLIR_DIALECT_NVGPU_IR_NVGPUTYPES_TD
+#define MLIR_DIALECT_NVGPU_IR_NVGPUTYPES_TD
include "mlir/IR/AttrTypeBase.td"
include "mlir/Dialect/NVGPU/IR/NVGPU.td"
+//===----------------------------------------------------------------------===//
+// NVGPU Type Definitions
+//===----------------------------------------------------------------------===//
+
class NVGPU_Type<string name, string typeMnemonic,
list<Trait> traits = []> : TypeDef<NVGPU_Dialect, name, traits> {
let mnemonic = typeMnemonic;
@@ -110,3 +113,5 @@ def NVGPU_WarpgroupAccumulator : NVGPU_Type<"WarpgroupAccumulator", "warpgroup.a
(https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
}];
}
+
+#endif //MLIR_DIALECT_NVGPU_IR_NVGPUTYPES_TD
diff --git a/mlir/python/mlir/dialects/NVGPUOps.td b/mlir/python/mlir/dialects/NVGPUOps.td
index ae54822cd9070..cdf651901e074 100644
--- a/mlir/python/mlir/dialects/NVGPUOps.td
+++ b/mlir/python/mlir/dialects/NVGPUOps.td
@@ -9,6 +9,6 @@
#ifndef PYTHON_BINDINGS_NVGPU_OPS
#define PYTHON_BINDINGS_NVGPU_OPS
-include "mlir/Dialect/NVGPU/IR/NVGPU.td"
+include "mlir/Dialect/NVGPU/IR/NVGPUOps.td"
#endif
More information about the Mlir-commits
mailing list