[Mlir-commits] [mlir] 9c68f8e - [mlir][NVGPU] Documentation only update to nvgpu dialect (NFC).
Manish Gupta
llvmlistbot at llvm.org
Mon Oct 24 12:14:57 PDT 2022
Author: Manish Gupta
Date: 2022-10-24T12:13:34-07:00
New Revision: 9c68f8ea666dffd3424d672d24ecb6dd2bb03e47
URL: https://github.com/llvm/llvm-project/commit/9c68f8ea666dffd3424d672d24ecb6dd2bb03e47
DIFF: https://github.com/llvm/llvm-project/commit/9c68f8ea666dffd3424d672d24ecb6dd2bb03e47.diff
LOG: [mlir][NVGPU] Documentation only update to nvgpu dialect (NFC).
Differential Revision: https://reviews.llvm.org/D136313
Added:
Modified:
mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index 26b7dd7bb3fc2..138ffc896cb2a 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -28,10 +28,11 @@ def NVGPU_Dialect : Dialect {
let name = "nvgpu";
let cppNamespace = "::mlir::nvgpu";
let description = [{
- This `NVGPU` dialect provides a bridge between the target agnostic GPU and
- Vector dialects and the lower level LLVM IR based NVVM dialect. This allow
- representing PTX specific operations while using MLIR high level concepts
- like memref and 2-D vector.
+ The `NVGPU` dialect provides a bridge between the higher-level target-agnostic
+ dialects (GPU and Vector) and the lower-level target-specific dialect
+ (the LLVM-IR-based NVVM dialect) for NVIDIA GPUs. This allows representing
+ PTX-specific operations while using MLIR high-level dialects such as MemRef
+ and Vector for memory and target-specific register operands, respectively.
}];
let useDefaultTypePrinterParser = 1;
@@ -70,20 +71,20 @@ def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
PredOpTrait<"srcMemref and res have same element type",
TCresVTEtIsSameAsOp<0, 0>>]> {
let description = [{
- The `nvgpu.ldmatrix` op represents loading a matrix fragment from
- memory. The load source and result type must be compatible with lowering
- to the `nvvm.ldmatrix` instruction. This op is meant to represent
- the distributed version of a `vector.transfer_read` as an intermediate
- step between lowering from `vector.transfer_read` to `nvvm.ldmatrix`.
-
- This operation is meant to follow the semantic of described here:
- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
-
- Example:
- ```mlir
- %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false} :
- memref<?x?xf16, 3> -> vector<4x2xf16>
- ```
+ The `nvgpu.ldmatrix` op represents loading a matrix fragment from
+ memory to registers. The source and result types must be compatible
+ with lowering to the `nvvm.ldmatrix` instruction. This op represents
+ the distributed version of a `vector.transfer_read`, serving as an
+ intermediate step in lowering from `vector.transfer_read` to `nvvm.ldmatrix`.
+
+ This operation is meant to follow the semantics described here:
+ https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
+
+ Example:
+ ```mlir
+ %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false} :
+ memref<?x?xf16, 3> -> vector<4x2xf16>
+ ```
}];
let arguments = (ins Arg<AnyMemRef, "", [MemRead]>:$srcMemref,
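As a rough sketch of the distribution relationship described above, assuming a 16x16 tile in shared memory and hypothetical names (`%sm`, `%tile`, `%frag`):

```mlir
%f0 = arith.constant 0.0 : f16
// Warp-level view: one transfer_read covers the whole 16x16 tile.
%tile = vector.transfer_read %sm[%c0, %c0], %f0
    : memref<16x16xf16, 3>, vector<16x16xf16>
// Distributed (thread-level) view: each thread of the warp holds a
// vector<4x2xf16> fragment of the same tile.
%frag = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false}
    : memref<16x16xf16, 3> -> vector<4x2xf16>
```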
@@ -102,25 +103,24 @@ def NVGPU_MmaSyncOp : NVGPU_Op<"mma.sync", [
PredOpTrait<"matrixA and matrixB have same element type",
TCopVTEtIsSameAs<0, 1>>]> {
let description = [{
- The `nvgpu.mma.sync` op represents the distributed form of a collective
- matrix-multiply-and-accumulate (mma) operation that is compatible with
- `nvvm.mma.sync`. The operands and results are fragments of the full matrix
- operands. The full shape of the distributed mma operation is given by the
- `mmaShape` attribute in the form of a list of dimensions `[m, n, k]`.
-
- This operation is meant to be lowered to the `nvvm.mma.sync` instruction, and
- is an intermediate point between lowering from `vector.contract` to
- `nvvm.mma.sync`.
+ The `nvgpu.mma.sync` op represents the warp-level matrix-multiply-and-
+ accumulate (mma) operation that is compatible with `nvvm.mma.sync`.
+ The operand and result vector sizes describe each thread's ownership
+ of the warp-level mma operation shape. The `mmaShape` attribute holds
+ the warp-level matrix-multiply shape as a list of dimensions `[m, n, k]`.
+
+ The `nvgpu.mma.sync` op serves as an intermediate step in lowering from
+ `vector.contract` to `nvvm.mma.sync`.
- This operation is meant to follow the semantic of described here:
- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
+ This operation is meant to follow the semantics described here:
+ https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
- Example:
+ Example:
- ```mlir
- nvgpu.mma.sync (%a, %b, %c) :
- (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
- ```
+ ```mlir
+ %res = nvgpu.mma.sync (%matrixA, %matrixB, %matrixC) {mmaShape = [16, 8, 16]} :
+ (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>
+ ```
}];
let arguments = (ins AnyVector:$matrixA,
AnyVector:$matrixB,
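As a hedged sketch of the intended use, pairing `nvgpu.ldmatrix` loads with the op above for an m16n8k16 f16 mma (the tile shapes, `numTiles` values, and names such as `%lhs` and `%acc` are assumptions):

```mlir
// Each thread loads its fragments of the 16x16 A tile and the 16x8 B tile
// from shared memory, then issues the warp-level mma.
%a = nvgpu.ldmatrix %lhs[%c0, %c0] {numTiles = 4 : i32, transpose = false}
    : memref<16x16xf16, 3> -> vector<4x2xf16>
%b = nvgpu.ldmatrix %rhs[%c0, %c0] {numTiles = 2 : i32, transpose = true}
    : memref<16x8xf16, 3> -> vector<2x2xf16>
%d = nvgpu.mma.sync (%a, %b, %acc) {mmaShape = [16, 8, 16]}
    : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>
```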
@@ -152,25 +152,25 @@ def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
let summary = "device-side asynchronous copy";
let description = [{
The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of
- `$size` elements from source to the destination without blocking the thread.
- The destination has to be in shared memory.
+ elements from the source (global memory) to the destination (shared memory)
+ without blocking the thread. The async copy is added to a group.
- This is memory access will be pending to be added to a group.
-
- This op is meant to be used with `gpu.device_async_create_group` and
- `gpu.device_async_wait` to synchronize copies as explained in those ops
+ This op is meant to be used with `nvgpu.device_async_create_group` and
+ `nvgpu.device_async_wait` to synchronize copies as explained in those ops'
descriptions.
- `bypassL1` attribute is hint to the backend and hardware that
- the copy should by pass the L1 cache, this may be dropped by the backend or
- hardware.
+
+ The `bypassL1` attribute is a hint to the hardware to bypass the L1 cache
+ during the async copy; this hint may be ignored by the hardware.
+
`dstElements` attribute is the total number of elements written to
destination (shared memory).
+
`srcElements` argument is the total number of elements read from
source (global memory).
- srcElements` is an optional argument and when present it only reads
- srcElements number of elements from the source global memory and zero fills
- the rest of the elements in the destination shared memory.
+ `srcElements` is an optional argument; when present, the op reads only
+ `srcElements` elements from the source (global memory) and zero-fills
+ the rest of the elements in the destination (shared memory).
In order to do a copy and wait for the result we need the following
combination:
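A minimal sketch of that combination, assuming 1-D memrefs and hypothetical names (`%gmem0`, `%smem0`, ...):

```mlir
// Issue two async copies of 4 f32 elements each, from global memory to
// shared memory (address space 3), without blocking the thread.
%cp0 = nvgpu.device_async_copy %gmem0[%c0], %smem0[%c0], 4
    : memref<16xf32> to memref<16xf32, 3>
%cp1 = nvgpu.device_async_copy %gmem1[%c0], %smem1[%c0], 4
    : memref<16xf32> to memref<16xf32, 3>
// Gather the pending copies into one group and get a token for it.
%token = nvgpu.device_async_create_group %cp0, %cp1
// Block this thread until every copy in the group has completed.
nvgpu.device_async_wait %token
```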
@@ -216,21 +216,21 @@ def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
let summary = "device side asynchronous create group operation";
let description = [{
- The `nvgpu.device_async_create_group` op creates a group of memory accesses
- containing all the pending `device_async_copy` operations associated with
- argument tokens. Each token can only be part of one group.
+ The `nvgpu.device_async_create_group` op creates a group of memory accesses
+ containing all the pending `device_async_copy` operations associated with
+ argument tokens. Each token can only be part of one group.
- It returns a token that can be use to wait until the group fully completes.
+ It returns a token that can be used to wait until the group fully completes.
- This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
- as explained in those ops descriptions.
+ This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
+ as explained in those ops' descriptions.
- Groups are executed in the order they are created.
+ Groups are executed in the order they are created.
- Example:
+ Example:
- ```mlir
- %0 = nvgpu.device_async_create_group
+ ```mlir
+ %0 = nvgpu.device_async_create_group
```
}];
let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
@@ -243,16 +243,17 @@ def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
let summary = "Wait for async gpu ops to complete.";
let description = [{
- The `nvgpu.device_async_wait` op will block the execution thread until the group
- associated with the source token is fully completed.
+ The `nvgpu.device_async_wait` op will block the execution thread until the group
+ associated with the source token is fully completed.
The optional `$numGroups` attribute gives an upper bound on the number of
groups still uncompleted when the wait can unblock the thread.
- Example:
- ```mlir
- nvgpu.device_async_wait %0
- ```
+ Example:
+
+ ```mlir
+ nvgpu.device_async_wait %0
+ ```
}];
let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
OptionalAttr<I32Attr>:$numGroups);
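As a sketch of how `$numGroups` can be used for pipelining, assuming `%token` came from an earlier `nvgpu.device_async_create_group` (the attribute value is illustrative):

```mlir
// Unblock as soon as at most one group is still in flight, so the next
// stage's copies can overlap with computation on the completed data.
nvgpu.device_async_wait %token {numGroups = 1 : i32}
```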