[Mlir-commits] [mlir] [mlir][nvgpu] separate ops, types, attributes definitions in NVGPU dialect. (PR #129846)

lonely eagle llvmlistbot at llvm.org
Thu Mar 6 17:25:12 PST 2025


https://github.com/linuxlonelyeagle updated https://github.com/llvm/llvm-project/pull/129846

>From 797a4f1ba85e45b65a1b66083416e6abf21c80af Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038 at qq.com>
Date: Wed, 5 Mar 2025 15:47:07 +0800
Subject: [PATCH 1/2] separate ops, types, attributes definitions in NVGPU
 dialect.

---
 .../mlir/Dialect/NVGPU/IR/CMakeLists.txt      |  13 +-
 mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td   | 721 +-----------------
 .../mlir/Dialect/NVGPU/IR/NVGPUDialect.h      |   4 +-
 .../include/mlir/Dialect/NVGPU/IR/NVGPUOps.td | 633 +++++++++++++++
 .../mlir/Dialect/NVGPU/IR/NVGPUTypes.td       | 112 +++
 mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt      |   2 +-
 mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp    |   8 +-
 7 files changed, 762 insertions(+), 731 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
 create mode 100644 mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td

diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
index 13d754ca06316..ecdaae7f24d93 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
@@ -1,5 +1,10 @@
 add_mlir_dialect(NVGPU nvgpu)
-add_mlir_doc(NVGPU NVGPU Dialects/ -gen-dialect-doc)
+add_mlir_doc(NVGPUOps NVGPU Dialects/ -gen-dialect-doc)
+
+set(LLVM_TARGET_DEFINITIONS NVGPUOps.td)
+mlir_tablegen(NVGPUOps.h.inc -gen-op-decls)
+mlir_tablegen(NVGPUOps.cpp.inc -gen-op-defs)
+add_public_tablegen_target(MLIRNVGPUOpsIncGen)
 
 set(LLVM_TARGET_DEFINITIONS NVGPU.td)
 mlir_tablegen(NVGPUEnums.h.inc -gen-enum-decls)
@@ -11,7 +16,7 @@ mlir_tablegen(NVGPUAttrDefs.h.inc -gen-attrdef-decls)
 mlir_tablegen(NVGPUAttrDefs.cpp.inc -gen-attrdef-defs)
 add_public_tablegen_target(MLIRNVGPUAttributesIncGen)
 
-set(LLVM_TARGET_DEFINITIONS NVGPU.td)
-mlir_tablegen(NVGPUAttrTypes.h.inc -gen-typedef-decls)
-mlir_tablegen(NVGPUAttrTypes.cpp.inc -gen-typedef-decls)
+set(LLVM_TARGET_DEFINITIONS NVGPUTypes.td)
+mlir_tablegen(NVGPUTypeDefs.h.inc -gen-typedef-decls)
+mlir_tablegen(NVGPUTypeDefs.cpp.inc -gen-typedef-defs)
 add_public_tablegen_target(MLIRNVGPUTypesIncGen)
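
As a rough sketch for context (not part of this patch), a library consuming the generated headers would typically list the new per-file tablegen targets as dependencies so that `NVGPUOps.h.inc` and `NVGPUTypeDefs.h.inc` are generated before compilation. The library name and source file below are assumptions for illustration:

```cmake
# Hypothetical consumer library; DEPENDS names are the targets added above.
add_mlir_dialect_library(MLIRNVGPUDialect
  NVGPUDialect.cpp

  DEPENDS
  MLIRNVGPUOpsIncGen
  MLIRNVGPUTypesIncGen
  MLIRNVGPUAttributesIncGen
  )
```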
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index f48fa9976da12..6b5470310e4a1 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -1,21 +1,10 @@
-//===-- NVGPU.td - NVGPU dialect operation definitions *- tablegen -*------===//
+//===-- NVGPU.td - Attribute defs for NVGPU dialect *- tablegen -*---------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// This file defines the basic operations for the NVGPU dialect.
-//
-// This NVGPU provides a bridge between the target agnostic GPU and Vector
-// dialects and lower level NVVM dialect. This allow representing PTX specific
-// operations while using MLIR high level concepts like memref and 2-D vector.
-//
-// Ops semantic are going to be based on vendor specific PTX defintion:
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html
-//
-//===----------------------------------------------------------------------===//
 
 #ifndef NVGPU
 #define NVGPU
@@ -127,712 +116,4 @@ def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
 def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
 def RcpRoundingModeAttr : EnumAttr<NVGPU_Dialect, RcpRoundingMode, "rcp_rounding_mode">;
 
-//===----------------------------------------------------------------------===//
-// NVGPU Type Definitions
-//===----------------------------------------------------------------------===//
-
-class NVGPU_Type<string name, string typeMnemonic,
-        list<Trait> traits = []> : TypeDef<NVGPU_Dialect, name, traits> {
-  let mnemonic = typeMnemonic;
-}
-
-def NVGPU_DeviceAsyncToken : NVGPU_Type<"DeviceAsyncToken",
-                                        "device.async.token", []> {
-  let summary = "device async token type";
-  let description = [{
-    `nvgpu.device.async.token` is a type returned by an asynchronous operation
-    that runs on the GPU (device). It is used to establish an SSA-based link
-    between the async operation (e.g. DeviceAsyncCopy) and operations that
-    group or synchronize the async operations (e.g. DeviceAsyncCreateGroupOp,
-    DeviceAsyncWaitOp).
-  }];
-}
-
-def NVGPU_MBarrierGroup : NVGPU_Type<"MBarrierGroup", "mbarrier.group", []> {
-  let summary = "mbarrier barrier type";
-  let description = [{
-    This is the type for one or more mbarrier object in shared memory that is 
-    used to synchronize a variable number of threads.
-
-    If `num_barriers` is not set, the number of mbarrier objects is 1.
-
-    A mbarrier object is 64 bit with 8 byte alignment. The mbarrier object 
-    can be initiated and invalidated.
-
-    [See for more details in PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object)
-  }];    
-  let parameters = (ins "Attribute":$memorySpace, DefaultValuedParameter<"unsigned", "1">:$num_barriers);
-  let assemblyFormat = "`<` struct(params) `>`";
-  let builders = [
-    TypeBuilder<(ins "Attribute":$memorySpace), [{
-      return $_get($_ctxt, memorySpace, 1);
-    }]>
-  ];
-}
-
-def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map
-def NVGPU_TensorMapDescriptor : NVGPU_Type<"TensorMapDescriptor", "tensormap.descriptor", []> {
-  let summary = "TensorMap descriptor";
-  let parameters = (ins "MemRefType":$tensor,
-                        EnumParameter<TensorMapSwizzleKind>:$swizzle,
-                        EnumParameter<TensorMapL2PromoKind>:$l2promo,
-                        EnumParameter<TensorMapOOBKind>:$oob,
-                        EnumParameter<TensorMapInterleaveKind>:$interleave);
-  let description = [{
-    `nvgpu.tma.descriptor` is a type that represents a TMA descriptor. It is 
-    128-byte object either in constant space or kernel paramater.    
-  }];
-  let assemblyFormat = "`<` struct(params) `>`";
-}
-
-def NVGPU_WarpgroupMatrixDescriptor : NVGPU_Type<"WarpgroupMatrixDescriptor", "warpgroup.descriptor", []> {
-  let summary = "Warpgroup matrix descriptor type";
-  let description = [{
-  The descriptor specifies the properties of the matrix in shared memory that 
-  is a multiplicand in the matrix multiply and accumulate operation. 
-  
-  The descriptor is a 64-bit value contained in a register with the following:
-  ```
-  +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
-  |   0-13  |14-15|   16-29   |30-31|   32-45   |46-48|49-51|   52-61   |62-63|
-  +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
-  |  14bits |2bits|   14bits  |2bits|   14bits  |2bits|3bits|   10bits  |2bits|
-  +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
-  | BaseAddr|  0  | LeadingDim|  0  |   Stride  |  0  |Offst|     0     |Swzle|
-  +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
-  ```
-   
-  [See for more details in PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-shared-memory-layout-matrix-descriptor) 
-  
-  }];  
-  let parameters = (ins "MemRefType":$tensor);
-  let assemblyFormat = "`<` struct(params) `>`";
-}
-
-def NVGPU_WarpgroupAccumulator : NVGPU_Type<"WarpgroupAccumulator", "warpgroup.accumulator", []> {
-  let parameters = (ins "VectorType":$fragmented);
-  let assemblyFormat = "`<` struct(params) `>`";
-  let description = [{
-    This type represents the result matrix obtained from `nvgpu.warpgroup.mma`. 
-    The `$fragmented` type signifies the distributed or fragmented result 
-    vector that is collectively owned by all the threads in the warp-group 
-    that executed `nvgpu.warpgroup.mma`.
-    [See the details of register fragment layout for accumulator matrix D]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d) 
-  }];
-}
-
-//===----------------------------------------------------------------------===//
-// NVGPU Op Definitions
-//===----------------------------------------------------------------------===//
-
-class NVGPU_Op<string mnemonic, list<Trait> traits = []> :
-  Op<NVGPU_Dialect, mnemonic, traits> {}
-
-def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
-                                MemoryEffects<[MemRead]>,
-                                PredOpTrait<"srcMemref and res have same element type",
-                                            TCresVTEtIsSameAsOp<0, 0>>]> {
-  let description = [{
-    The `nvgpu.ldmatrix` op represents loading a matrix fragment from
-    memory to registers. The source and result type must be compatible
-    with lowering to the `nvvm.ldmatrix` instruction. This op represents
-    the distributed version of a `vector.transfer_read` as an intermediate
-    step between lowering from `vector.transfer_read` to `nvvm.ldmatrix`.
-
-    This operation is meant to follow the semantic of described here:
-    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
-
-    Example:
-    ```mlir
-    %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false} :
-      memref<?x?xf16, 3> -> vector<4x2xf16>
-    ```
-  }];
-
-  let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$srcMemref,
-                           Variadic<Index>:$indices, BoolAttr:$transpose,
-                           I32Attr:$numTiles);
-  let results = (outs AnyVectorOfNonZeroRank:$res);
-  let assemblyFormat = [{
-    $srcMemref`[` $indices `]` attr-dict `:` type($srcMemref) `->` type($res)
-  }];
-
-  let hasVerifier = 1;
-}
-
-class NVGPU_MmaSyncOp<string mnemonic> :
-        NVGPU_Op<mnemonic,  [Pure,
-                             PredOpTrait<"matrixA and matrixB have same element type",
-                                         TCopVTEtIsSameAs<0, 1>>]> {
-  code extraBaseClassDeclaration = [{
-    std::array<int64_t, 3> getMmaShapeAsArray() {
-      ArrayAttr mmaShape = this->getMmaShape();
-      assert(mmaShape.size() == 3 && "mmaShape should be three integers");
-      return {::llvm::cast<IntegerAttr>(mmaShape[0]).getInt(),
-              ::llvm::cast<IntegerAttr>(mmaShape[1]).getInt(),
-              ::llvm::cast<IntegerAttr>(mmaShape[2]).getInt()};
-    }
-  }];
-
-  let hasVerifier = 1;
-}
-
-def NVGPU_MmaSyncOp : NVGPU_MmaSyncOp<"mma.sync"> {
-  let description = [{
-    The `nvgpu.mma.sync` op represents the warp-level matrix-multiply-and-
-    accumulate (mma) operation that is compatible with `nvvm.mma.sync`.
-    The operands and results vector sizes are thread-level onwership to
-    the warp-level mma operation shape. `mmaShape` attribute holds the
-    warp-level matrix-multiply shape.
-
-    The `nvgpu.mma.sync` op serves as an intermediate point between lowering from
-    `vector.contract` to `nvvm.mma.sync`.
-
-    This operation is meant to follow the semantic of described here:
-      https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
-
-    Example:
-
-    ```mlir
-    %res = nvgpu.mma.sync (%matrixA, %matrixB, %matrixC) {mmaShape = [16, 8, 16]} :
-        (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>
-    ```
-  }];
-  let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
-                       AnyVectorOfNonZeroRank:$matrixB,
-                       AnyVectorOfNonZeroRank:$matrixC,
-                       I64ArrayAttr:$mmaShape,
-                       OptionalAttr<UnitAttr>:$tf32Enabled);
-
-  let results = (outs AnyVectorOfNonZeroRank:$res);
-
-  let builders = [
-    OpBuilder<(ins "Value":$matrixA,
-                   "Value":$matrixB,
-                   "Value":$matrixC,
-                   "ArrayAttr":$mmaShape)>,
-    OpBuilder<(ins "Value":$matrixA,
-                   "Value":$matrixB,
-                   "Value":$matrixC,
-                   "ArrayRef<int64_t>":$mmaShape,
-                   CArg<"bool", "false">:$tf32Enabled)>
-  ];
-
-  let assemblyFormat = [{
-    `(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict
-    `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
-  }];
-
-  let extraClassDeclaration = extraBaseClassDeclaration;
-}
-
-def NVGPU_MmaSparseSyncMetadataType : FixedVectorOfLengthAndType<[2], [I16]>,
-                        BuildableType<"::mlir::VectorType::get("
-                          "{2},$_builder.getI16Type())">;
-
-def NVGPU_MmaSparseSyncOp : NVGPU_MmaSyncOp<"mma.sp.sync"> {
-  let description = [{
-  The `nvgu.mma.sp.sync` operation performs a warp-distributed MMA operation
-  where operand A is "structured sparse". In this case, the `matrixA` operand
-  represents the (warp-distributed) non-zero values of operand A, and the
-  `sparse_metadata` operand provides the indices.
-
-  The full description of the sparsity storage format and distribution scheme is
-  described in the PTX docs. This operation is meant to follow the semantic
-  described in the PTX documentation here:
-  https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-sparse-mma
-
-  The way the indices are distributed among the threads in a warp is controlled
-  by the optional `sparsity_selector` operand, which is `0` by default. For
-  more information, please consult the PTX documentation linked above.
-
-  Example (targetingthe f16 16x8x32 `mma.sp` PTX instruction):
-
-  ```mlir
-  nvgpu.mma.sp.sync (%a, %b, %c) metadata (%meta) {mmaShape = [16, 8, 32]} :
-    (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
-  ```
-  }];
-
-  let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
-                       AnyVectorOfNonZeroRank:$matrixB,
-                       AnyVectorOfNonZeroRank:$matrixC,
-                       NVGPU_MmaSparseSyncMetadataType:$sparseMetadata,
-                       I64ArrayAttr:$mmaShape,
-                       DefaultValuedAttr<I32Attr, "0">:$sparsitySelector,
-                       OptionalAttr<UnitAttr>:$tf32Enabled
-                       );
-
-  let results = (outs AnyVectorOfNonZeroRank:$res);
-
-  let builders = [
-    OpBuilder<(ins "Value":$matrixA,
-                   "Value":$matrixB,
-                   "Value":$matrixC,
-                   "Value":$sparseMetadata,
-                   "ArrayRef<int64_t>":$mmaShape)>
-  ];
-
-  let assemblyFormat = [{
-    `(` $matrixA`,` $matrixB`,` $matrixC `)` `metadata` `(` $sparseMetadata `)` attr-dict
-    `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
-  }];
-
-  let extraClassDeclaration = extraBaseClassDeclaration;
-}
-
-def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
-                                       AttrSizedOperandSegments]> {
-  let summary = "device-side asynchronous copy";
-  let description = [{
-    The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of
-    elements from source (global memory) to the destination (shared memory)
-    without blocking the thread. The async copy is added to a group.
-
-    This op is meant to be used with `nvgpu.device_async_create_group` and
-    `nvgpu.device_async_wait` to synchronize copies as explained in those ops
-    descriptions.
-
-    `bypassL1` attribute is hint to the hardware to bypass the L1 cache during
-    async copy, this hint may be ignored by the hardware.
-
-    `dstElements` attribute is the total number of elements written to
-    destination (shared memory).
-
-    `srcElements` argument is the total number of elements read from
-    source (global memory).
-
-    `srcElements` is an optional argument and when present the op only reads
-    `srcElements` number of elements from the source (global memory) and zero fills
-    the rest of the elements in the destination (shared memory).
-
-    In order to do a copy and wait for the result we need the following
-    combination:
-    ```
-    // copy 1.
-    %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 :memref<16xf32> to memref<16xf32, 3>
-    // copy 2.
-    %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
-    // group 1 contains copy 1 and copy 2.
-    %token1 = nvgpu.device_async_create_group %cp1, %cp2
-    // copy 3.
-    %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
-    // group 2 contains copy 3.
-    %token2 = nvgpu.device_async_create_group %cp3
-    // after the wait copy 1 and copy 2 are complete.
-    nvgpu.device_async_wait %token1
-    // after the wait copy 3 is complete.
-    nvgpu.device_async_wait %token2
-    ```
-
-    Example:
-
-    ```mlir
-    %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
-      memref<4x5xf32> to memref<2x7x5xf32, 3>
-    ```
-  }];
-  let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
-  let arguments = (ins Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
-                       Variadic<Index>:$dstIndices,
-                       Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
-                       Variadic<Index>:$srcIndices,
-                       IndexAttr:$dstElements,
-                       Optional<Index>:$srcElements,
-                       OptionalAttr<UnitAttr>:$bypassL1);
-  let assemblyFormat = [{
-    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $dstElements (`,` $srcElements^)?
-      attr-dict `:` type($src) `to` type($dst)
-  }];
-  let hasVerifier = 1;
-}
-
-def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
-  let summary = "device side asynchronous create group operation";
-  let description = [{
-    The `nvgpu.device_async_create_group` op creates a group of memory accesses
-    containing all the pending `device_async_copy` operations associated with
-    argument tokens. Each token can only be part of one group.
-
-    It returns a token that can be use to wait until the group fully completes.
-
-    This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
-    as explained in those ops descriptions.
-
-    Groups are executed in the order they are created.
-
-    Example:
-
-    ```mlir
-    %0 = nvgpu.device_async_create_group
-  ```
-  }];
-  let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
-  let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
-  let assemblyFormat = [{
-    $inputTokens attr-dict
-  }];
-}
-
-def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
-  let summary = "Wait for async gpu ops to complete.";
-  let description = [{
-    The `nvgpu.device_async_wait` op will block the execution thread until the group
-    associated with the source token is fully completed.
-
-    The optional `$numGroups` attribute gives an upper bound of the number of
-    groups uncompleted when the wait can unblock the thread. For example,  if
-    16 async groups are pushe and `$numGroups` is set to 12, then the thread
-    will unblock when 12 groups or fewer are in flight (4 groups have
-    completed).
-
-    Example:
-
-    ```mlir
-    nvgpu.device_async_wait %0
-    ```
-  }];
-  let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
-                       OptionalAttr<I32Attr>:$numGroups);
-  let assemblyFormat = [{
-    $asyncDependencies attr-dict
-  }];
-}
-
-def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
-  let summary = "Creates a `nvgpu.mbarrier` object.";
-  let description = [{
-    The Op generates one or more `mbarrier` object, which is a barrier created in 
-    shared memory and supports various synchronization behaviors for threads.
-
-    The `mbarrier` object has the following type and alignment requirements:
-      Type: .b64, Alignment: 8, Memory space: .shared
-    
-    Example:
-    ```mlir
-      %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
-    ```
-    }];
-  let arguments = (ins);
-  let results = (outs NVGPU_MBarrierGroup:$barriers);
-  let assemblyFormat = [{
-     attr-dict `->` type($barriers)
-  }];
-}
-
-def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
-  let summary = "Initialize the `nvgpu.mbarrier`.";
-  let description = [{
-    The Op initializes the `mbarrier` object with the given number of threads.
-
-    Example:
-    ```mlir
-      %num_threads = gpu.block_dim x
-      %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
-      nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
-    ```
-  }];
-  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$count, Index:$mbarId, Optional<I1>:$predicate);
-  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
-}
-
-def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
-  let summary = "Checks if the `nvgpu.mbarrier` has completed its current phase.";
-  let description = [{
-    Checks whether the mbarrier object has completed the phase. It is is a 
-    non-blocking instruction which tests for the completion of the phase.
-
-    Example:
-    ```mlir
-      %isComplete = nvgpu.mbarrier.test.wait %barrier, %token : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token
-    ```
-  }];
-  let arguments = (ins NVGPU_MBarrierGroup:$barriers, NVGPU_MBarrierToken:$token, Index:$mbarId);
-  let results = (outs I1:$waitComplete);
-  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $token attr-dict `:` type($barriers) `,` type($token)";
-}
-
-def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
-  let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive`.";
-  let description = [{
-    The Op performs arrive-on operation on the `mbarrier` object and returns a 
-    `nvgpu.mbarrier.token`.
-
-    For more information, see
-    https://docs.nvidia.com/cuda/parallel-thread-execution/#arrive-on-operation-on-mbarrier-object
-
-    Example:
-    ```mlir
-      %token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
-    ```
-  }];
-  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId);
-  let results = (outs NVGPU_MBarrierToken:$token);
-let assemblyFormat = "$barriers `[` $mbarId `]` attr-dict `:` type($barriers) `->` type($token)";
-}
-
-def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> {
-  let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive.nocomplete` as non-blocking.";
-  let description = [{
-    The Op performs arrive-on operation on the `mbarrier` object and returns a 
-    `nvgpu.mbarrier.token`.
-
-    The Op does not cause the `nvgpu.mbarrier` to complete its current phase.
-
-    Example:
-    ```mlir
-      %token = nvgpu.mbarrier.arrive.noComplete %barrier, %count : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
-    ```
-  }];
-  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId,
-                       Index:$count);
-  let results = (outs NVGPU_MBarrierToken:$token);
-  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers) `->` type($token)";
-}
-
-def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> {
-  let summary = "Performs expect_tx operation on the `nvgpu.mbarrier.arrive`";
-  let description = [{
-    A thread executing the Op performs an expect-tx operation on the mbarrier 
-    object at the location specified by the address operand $barrier. The 
-    expect-tx operation, with an $txcount argument, increases the tx-count of 
-    an mbarrier object by the value specified by $txcount. This makes the 
-    current phase of the mbarrier object to expect and track the completion of 
-    additional asynchronous transactions.
-    
-    The `$txCount` specifies the number of element to the expect-tx operation.
-
-    Example:
-    ```mlir
-      nvgpu.mbarrier.arrive.expect_tx %barrier, %ic0 : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
-    ```
-  }];
-  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$txcount, Index:$mbarId, Optional<I1>:$predicate);
-  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $txcount  (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
-}
-
-def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
-  let summary = "Waits for the `nvgpu.mbarrier` to complete its current phase.";
-  let description = [{
-    Checks whether the mbarrier object has completed the phase. It is is a 
-    potentially blocking instruction which tests for the completion of the 
-    phase. Suspended thread resumes execution when the specified phase completes 
-    OR before the phase completes following a system-dependent time limit. 
-
-    The `$phaseParity` specifies either even phase (0) or odd phase (1) to 
-    wait.
-
-    Example:
-    ```mlir
-      nvgpu.mbarrier.try_wait.parity %barrier, %phaseParity, %ticks : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
-    ```
-  }];
-  let arguments = (ins NVGPU_MBarrierGroup:$barriers, I1:$phaseParity, Index:$ticks, Index:$mbarId);
-  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phaseParity `,` $ticks attr-dict `:` type($barriers)";  
-}
-
-def NVGPU_TmaPrefetchOp : NVGPU_Op<"tma.prefetch.descriptor", []> {
-  let summary = "Prefetch given `nvgpu.tensormap.descriptor` ";
-  let description = [{
-    The Op brings the cache line containing the given `$tmaDescriptor` for 
-    subsequent use by the `tma.async.load` instruction.
-  }];
-  let arguments = (ins NVGPU_TensorMapDescriptor:$tensorMapDescriptor, Optional<I1>:$predicate);
-  let assemblyFormat = [{
-    $tensorMapDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type($tensorMapDescriptor)
-  }];
-}
-
-def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", [AttrSizedOperandSegments]> {
-  let summary = "TMA asynchronous load";
-  let description = [{
-    The Op loads a tile memory region from global memory to shared memory by 
-    Tensor Memory Access (TMA).
-    
-    `$tensorMapDescriptor` is tensor map descriptor which has information about
-    tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
-
-    The Op uses `$barrier` mbarrier based completion mechanism. 
-  }];  
-  let arguments = (ins  Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
-                        NVGPU_MBarrierGroup:$barriers,
-                        NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
-                        Variadic<Index>:$coordinates, 
-                        Index:$mbarId,
-                        Optional<I16>:$multicastMask,
-                        Optional<I1>:$predicate);
-  let assemblyFormat = [{
-    $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]` 
-      `to` $dst
-      (`multicast_mask` `=` $multicastMask^ )?
-      (`,` `predicate` `=` $predicate^)?
-      attr-dict `:` type($tensorMapDescriptor) `,` type($barriers) 
-      `->` type($dst)
-  }];
-  let hasVerifier = 1;
-
-}
-
-def NVGPU_TmaAsyncStoreOp : NVGPU_Op<"tma.async.store", [AttrSizedOperandSegments]> {
-  let summary = "TMA asynchronous store";
-  let description = [{
-    The Op store a tile memory region from global memory to shared memory by 
-    Tensor Memory Access (TMA).
-    
-    `$tensorMapDescriptor` is tensor map descriptor which has information about
-    tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
-  }];  
-  let arguments = (ins  Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
-                        Arg<NVGPU_TensorMapDescriptor, "", [MemWriteAt<0, FullEffect>]>:$tensorMapDescriptor,
-                        Variadic<Index>:$coordinates, 
-                        Optional<I1>:$predicate);
-  let assemblyFormat = [{
-      $src `to` $tensorMapDescriptor `[` $coordinates `]`
-      (`,` `predicate` `=` $predicate^)?
-      attr-dict `:` type($src)
-      `->` type($tensorMapDescriptor)
-  }];
-  let hasVerifier = 1;
-}
-
-def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
-  let summary = "TMA create descriptor";
-  let description = [{
-    The Op creates a tensor map descriptor object representing tiled memory 
-    region. To do that it calls CUDA Driver's `cuTensorMapEncodeTiled`. The 
-    descriptor is used by Tensor Memory Access (TMA).
-
-    The `tensor` is the source tensor to be tiled. 
-
-    The `boxDimensions` is the size of the tiled memory region in each dimension.
-
-    For more information see below:
-    https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
-  }];
-
-  let arguments = (ins AnyUnrankedMemRef:$tensor,
-                       Variadic<Index>:$boxDimensions);
-  let results = (outs NVGPU_TensorMapDescriptor:$tensorMap);
-  let assemblyFormat = [{
-         $tensor `box` `[` $boxDimensions `]` attr-dict `:` type($tensor) `->` type($tensorMap)
-  }];
-  let hasVerifier = 1;
-}
-
-def NVGPU_WarpgroupGenerateDescriptorOp : NVGPU_Op<"warpgroup.generate.descriptor", []> {
-  let summary = "Generate a warpgroup matrix descriptor";
-  let description = [{
-  This Op builds a `nvgpu.warpgroup.descriptor` that is used by 
-  `nvgpu.warpgroup.mma` to perform warpgroup-level matrix multiply and 
-  accumulate.
-
-  The descriptor specifies the properties of the matrix in shared memory that 
-  is a multiplicand in the matrix multiply and accumulate operation. 
-  }];  
-  let results = (outs NVGPU_WarpgroupMatrixDescriptor:$descriptor);
-  let arguments = (ins Arg<AnyMemRef, "", [MemRead]>:$tensor, 
-                       NVGPU_TensorMapDescriptor:$tensorMap);
-  let assemblyFormat = [{$tensor `,` $tensorMap attr-dict `:` type($tensor) `,` type($tensorMap) `->` type($descriptor)}];
-  let hasVerifier = 1;
-}
-
-def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> {
-  let description = [{
-    The `nvgpu.warpgroup.mma` op performs the warpgroup-level (4 warps) 
-    matrix-multiply-and-accumulate (mma) operation that results in 
-    `nvvm.wgmma.mma_async`. 
-    
-    The operands are `descriptorA` and `descriptorB` that are wgmma matrix 
-    descriptors that shows the properties of the matrix in shared memory. The 
-    results are thread-level ownership to the warpgroup-level mma operation 
-    shape. The shape is deduced from the descriptor types and output vector.
-
-    The Op encapsulates multiple `nvvm.wgmma.mma_async` operations to complete 
-    the given shape. As `nvvm.wgmma.async` Op, or its corresponding PTX 
-    instruction, is asynchronous, this Op groups the `nvvm.wgmma.async` and 
-    surrounds them between `wgmma.fence.aligned` and 
-    `wgmma.commit.group.sync.aligned`, `wgmma.wait.group.sync.aligned` Ops.
-
-    Example:
-    ```mlir
-      %r1,%r2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc2: 
-                 !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>, 
-                 !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>, 
-                 !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
-                 !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
-                 -> 
-                 !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
-                 !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
-    ```
-  }];
-
-  let arguments = (ins NVGPU_WarpgroupMatrixDescriptor:$descriptorA, 
-                       NVGPU_WarpgroupMatrixDescriptor:$descriptorB,                                               
-                       DefaultValuedOptionalAttr<I64Attr, "1">:$waitGroup,
-                       OptionalAttr<UnitAttr>:$transposeA,
-                       OptionalAttr<UnitAttr>:$transposeB,
-                       NVGPU_WarpgroupAccumulator:$matrixC);
-  let results = (outs NVGPU_WarpgroupAccumulator:$matrixD);
-  let assemblyFormat = [{    
-    $descriptorA`,` $descriptorB`,` $matrixC attr-dict
-    `:` type($descriptorA) `,` type($descriptorB) `,` type($matrixC) `->` type($matrixD)
-  }];
-  let hasVerifier = 1;
-}
-
-def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> {
-  let description = [{
-    The `nvgpu.warpgroup.mma.store` op performs the store of fragmented result 
-    in $matrixD to given memref. 
-
-    [See the details of register fragment layout for accumulator matrix D]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d) 
-
-    Note that, the op must be run with warp group.
-  }];
-
-  let arguments = (ins NVGPU_WarpgroupAccumulator:$matrixD,
-                       Arg<AnyMemRef, "", [MemWrite]>:$dstMemref);
-  
-  let assemblyFormat = [{
-    $matrixD `,` $dstMemref attr-dict `:` type($matrixD) `to` type($dstMemref)
-  }];
-  let hasVerifier = 1;
-}
-
-def NVGPU_WarpgroupMmaInitAccumulatorOp : NVGPU_Op<"warpgroup.mma.init.accumulator"> {  
-  let summary = "Initializes the accumulator matrix";
-
-  let description = [{
-    This Op generates and initializes the accumulator matrix for 
-    `nvgpu.warpgroup.mma` op to perform matrix-multiply-and-accumulate.
-  }];
-  let results = (outs NVGPU_WarpgroupAccumulator:$matrixC);
-  let assemblyFormat = "attr-dict `->` type($matrixC)";
-  let hasVerifier = 1;
-}
-
-def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
-                                   SameOperandsAndResultType]> {
-  let summary = "The reciprocal calculation for vector types";
-  let description = [{
-    Reciprocal calculation for `vector` types using `nvvm.rcp` OPs.
-
-    Currently, only the `approx` rounding mode and `ftz` are supported, and only for the `f32` type.
-
-    The input and output must be of the same vector type and shape.
-  }];
-  let arguments = (ins VectorOfNonZeroRankOf<[F32]>:$in,
-                       DefaultValuedAttr<RcpRoundingModeAttr, "RcpRoundingMode::APPROX">:$rounding,
-                       UnitAttr:$ftz);
-  let results = (outs VectorOfNonZeroRankOf<[F32]>:$out);
-  let assemblyFormat = [{
-    $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}` 
-    attr-dict `:` type($out)
-  }];
-  let hasVerifier = 1;
-}
 #endif // NVGPU
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
index db4c63b3390eb..61a57fb60bda4 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
@@ -53,11 +53,11 @@ constexpr unsigned kMaxTMALastdimByte = 128;
 #include "mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.h.inc"
 
 #define GET_TYPEDEF_CLASSES
-#include "mlir/Dialect/NVGPU/IR/NVGPUTypes.h.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.h.inc"
 
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h.inc"
 
 #define GET_OP_CLASSES
-#include "mlir/Dialect/NVGPU/IR/NVGPU.h.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUOps.h.inc"
 
 #endif // MLIR_DIALECT_NVGPU_NVGPUDIALECT_H_
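
A small usage sketch (not part of the patch) of what the header change means for downstream C++: code keeps including the same umbrella header, only the generated `.inc` files it pulls in are renamed. The helper function below is a hypothetical example:

```cpp
// Sketch: the umbrella header is unchanged; op classes now come from
// NVGPUOps.h.inc and type classes from NVGPUTypeDefs.h.inc.
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/IR/Builders.h"

// Builds an nvgpu.mbarrier.create op producing the given mbarrier group type.
static mlir::Value createBarrier(mlir::OpBuilder &b, mlir::Location loc,
                                 mlir::nvgpu::MBarrierGroupType type) {
  return b.create<mlir::nvgpu::MBarrierCreateOp>(loc, type);
}
```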
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
new file mode 100644
index 0000000000000..ab2ddf278b4a4
--- /dev/null
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -0,0 +1,633 @@
+//===-- NVGPUOps.td - NVGPU dialect operation definitions *- tablegen -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the basic operations for the NVGPU dialect.
+//
+// The NVGPU dialect provides a bridge between the target-agnostic GPU and
+// Vector dialects and the lower-level NVVM dialect. It allows representing
+// PTX-specific operations while using MLIR concepts like memref and 2-D vector.
+//
+// Op semantics are based on the vendor-specific PTX definition:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html
+//
+//===----------------------------------------------------------------------===//
+
+include "mlir/Dialect/NVGPU/IR/NVGPU.td"
+include "mlir/Dialect/NVGPU/IR/NVGPUTypes.td"
+
+//===----------------------------------------------------------------------===//
+// NVGPU Op Definitions
+//===----------------------------------------------------------------------===//
+
+class NVGPU_Op<string mnemonic, list<Trait> traits = []> :
+  Op<NVGPU_Dialect, mnemonic, traits> {}
+
+def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
+                                MemoryEffects<[MemRead]>,
+                                PredOpTrait<"srcMemref and res have same element type",
+                                            TCresVTEtIsSameAsOp<0, 0>>]> {
+  let description = [{
+    The `nvgpu.ldmatrix` op represents loading a matrix fragment from
+    memory to registers. The source and result type must be compatible
+    with lowering to the `nvvm.ldmatrix` instruction. This op represents
+    the distributed version of a `vector.transfer_read` as an intermediate
+    step between lowering from `vector.transfer_read` to `nvvm.ldmatrix`.
+
+    This operation is meant to follow the semantics described here:
+    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
+
+    Example:
+    ```mlir
+    %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false} :
+      memref<?x?xf16, 3> -> vector<4x2xf16>
+    ```
+  }];
+
+  let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$srcMemref,
+                           Variadic<Index>:$indices, BoolAttr:$transpose,
+                           I32Attr:$numTiles);
+  let results = (outs AnyVectorOfNonZeroRank:$res);
+  let assemblyFormat = [{
+    $srcMemref`[` $indices `]` attr-dict `:` type($srcMemref) `->` type($res)
+  }];
+
+  let hasVerifier = 1;
+}
+
+class NVGPU_MmaSyncOp<string mnemonic> :
+        NVGPU_Op<mnemonic,  [Pure,
+                             PredOpTrait<"matrixA and matrixB have same element type",
+                                         TCopVTEtIsSameAs<0, 1>>]> {
+  code extraBaseClassDeclaration = [{
+    std::array<int64_t, 3> getMmaShapeAsArray() {
+      ArrayAttr mmaShape = this->getMmaShape();
+      assert(mmaShape.size() == 3 && "mmaShape should be three integers");
+      return {::llvm::cast<IntegerAttr>(mmaShape[0]).getInt(),
+              ::llvm::cast<IntegerAttr>(mmaShape[1]).getInt(),
+              ::llvm::cast<IntegerAttr>(mmaShape[2]).getInt()};
+    }
+  }];
+
+  let hasVerifier = 1;
+}
+
+def NVGPU_MmaSyncOp : NVGPU_MmaSyncOp<"mma.sync"> {
+  let description = [{
+    The `nvgpu.mma.sync` op represents the warp-level matrix-multiply-and-
+    accumulate (mma) operation that is compatible with `nvvm.mma.sync`.
+    The operand and result vector sizes represent the thread-level ownership
+    of the warp-level mma operation shape. The `mmaShape` attribute holds the
+    warp-level matrix-multiply shape.
+
+    The `nvgpu.mma.sync` op serves as an intermediate step when lowering from
+    `vector.contract` to `nvvm.mma.sync`.
+
+    This operation is meant to follow the semantics described here:
+      https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
+
+    Example:
+
+    ```mlir
+    %res = nvgpu.mma.sync (%matrixA, %matrixB, %matrixC) {mmaShape = [16, 8, 16]} :
+        (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>
+    ```
+  }];
+  let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
+                       AnyVectorOfNonZeroRank:$matrixB,
+                       AnyVectorOfNonZeroRank:$matrixC,
+                       I64ArrayAttr:$mmaShape,
+                       OptionalAttr<UnitAttr>:$tf32Enabled);
+
+  let results = (outs AnyVectorOfNonZeroRank:$res);
+
+  let builders = [
+    OpBuilder<(ins "Value":$matrixA,
+                   "Value":$matrixB,
+                   "Value":$matrixC,
+                   "ArrayAttr":$mmaShape)>,
+    OpBuilder<(ins "Value":$matrixA,
+                   "Value":$matrixB,
+                   "Value":$matrixC,
+                   "ArrayRef<int64_t>":$mmaShape,
+                   CArg<"bool", "false">:$tf32Enabled)>
+  ];
+
+  let assemblyFormat = [{
+    `(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict
+    `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
+  }];
+
+  let extraClassDeclaration = extraBaseClassDeclaration;
+}
+
+def NVGPU_MmaSparseSyncMetadataType : FixedVectorOfLengthAndType<[2], [I16]>,
+                        BuildableType<"::mlir::VectorType::get("
+                          "{2},$_builder.getI16Type())">;
+
+def NVGPU_MmaSparseSyncOp : NVGPU_MmaSyncOp<"mma.sp.sync"> {
+  let description = [{
+  The `nvgpu.mma.sp.sync` operation performs a warp-distributed MMA operation
+  where operand A is "structured sparse". In this case, the `matrixA` operand
+  represents the (warp-distributed) non-zero values of operand A, and the
+  `sparse_metadata` operand provides the indices.
+
+  The full description of the sparsity storage format and distribution scheme is
+  described in the PTX docs. This operation is meant to follow the semantic
+  described in the PTX documentation here:
+  https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-sparse-mma
+
+  The way the indices are distributed among the threads in a warp is controlled
+  by the optional `sparsity_selector` operand, which is `0` by default. For
+  more information, please consult the PTX documentation linked above.
+
+  Example (targeting the f16 16x8x32 `mma.sp` PTX instruction):
+
+  ```mlir
+  nvgpu.mma.sp.sync (%a, %b, %c) metadata (%meta) {mmaShape = [16, 8, 32]} :
+    (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
+  ```
+  }];
+
+  let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
+                       AnyVectorOfNonZeroRank:$matrixB,
+                       AnyVectorOfNonZeroRank:$matrixC,
+                       NVGPU_MmaSparseSyncMetadataType:$sparseMetadata,
+                       I64ArrayAttr:$mmaShape,
+                       DefaultValuedAttr<I32Attr, "0">:$sparsitySelector,
+                       OptionalAttr<UnitAttr>:$tf32Enabled
+                       );
+
+  let results = (outs AnyVectorOfNonZeroRank:$res);
+
+  let builders = [
+    OpBuilder<(ins "Value":$matrixA,
+                   "Value":$matrixB,
+                   "Value":$matrixC,
+                   "Value":$sparseMetadata,
+                   "ArrayRef<int64_t>":$mmaShape)>
+  ];
+
+  let assemblyFormat = [{
+    `(` $matrixA`,` $matrixB`,` $matrixC `)` `metadata` `(` $sparseMetadata `)` attr-dict
+    `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
+  }];
+
+  let extraClassDeclaration = extraBaseClassDeclaration;
+}
+
+def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
+                                       AttrSizedOperandSegments]> {
+  let summary = "device-side asynchronous copy";
+  let description = [{
+    The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of
+    elements from source (global memory) to the destination (shared memory)
+    without blocking the thread. The async copy is added to a group.
+
+    This op is meant to be used with `nvgpu.device_async_create_group` and
+    `nvgpu.device_async_wait` to synchronize copies as explained in those ops
+    descriptions.
+
+    The `bypassL1` attribute is a hint to the hardware to bypass the L1 cache
+    during the async copy; the hardware may ignore this hint.
+
+    The `dstElements` attribute is the total number of elements written to
+    the destination (shared memory).
+
+    The `srcElements` argument is the total number of elements read from
+    the source (global memory).
+
+    `srcElements` is optional; when present, the op reads only `srcElements`
+    elements from the source (global memory) and zero-fills the remaining
+    elements in the destination (shared memory).
+
+    To perform a copy and wait for the result, the following combination is
+    needed:
+    ```
+    // copy 1.
+    %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+    // copy 2.
+    %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+    // group 1 contains copy 1 and copy 2.
+    %token1 = nvgpu.device_async_create_group %cp1, %cp2
+    // copy 3.
+    %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+    // group 2 contains copy 3.
+    %token2 = nvgpu.device_async_create_group %cp3
+    // after the wait copy 1 and copy 2 are complete.
+    nvgpu.device_async_wait %token1
+    // after the wait copy 3 is complete.
+    nvgpu.device_async_wait %token2
+    ```
+
+    Example:
+
+    ```mlir
+    %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
+      memref<4x5xf32> to memref<2x7x5xf32, 3>
+    ```
+  }];
+  let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
+  let arguments = (ins Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
+                       Variadic<Index>:$dstIndices,
+                       Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
+                       Variadic<Index>:$srcIndices,
+                       IndexAttr:$dstElements,
+                       Optional<Index>:$srcElements,
+                       OptionalAttr<UnitAttr>:$bypassL1);
+  let assemblyFormat = [{
+    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $dstElements (`,` $srcElements^)?
+      attr-dict `:` type($src) `to` type($dst)
+  }];
+  let hasVerifier = 1;
+}
+
+def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
+  let summary = "device side asynchronous create group operation";
+  let description = [{
+    The `nvgpu.device_async_create_group` op creates a group of memory accesses
+    containing all the pending `device_async_copy` operations associated with
+    argument tokens. Each token can only be part of one group.
+
+    It returns a token that can be used to wait until the group fully completes.
+
+    This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
+    as explained in those ops descriptions.
+
+    Groups are executed in the order they are created.
+
+    Example:
+
+    ```mlir
+    %0 = nvgpu.device_async_create_group
+    ```
+  }];
+  let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
+  let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
+  let assemblyFormat = [{
+    $inputTokens attr-dict
+  }];
+}
+
+def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
+  let summary = "Wait for async gpu ops to complete.";
+  let description = [{
+    The `nvgpu.device_async_wait` op will block the execution thread until the group
+    associated with the source token is fully completed.
+
+    The optional `$numGroups` attribute gives an upper bound on the number of
+    uncompleted groups at which the wait can unblock the thread. For example,
+    if 16 async groups are pushed and `$numGroups` is set to 12, then the
+    thread will unblock when 12 groups or fewer are in flight (4 groups have
+    completed).
+
+    Example:
+
+    ```mlir
+    nvgpu.device_async_wait %0
+    ```
+  }];
+  let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
+                       OptionalAttr<I32Attr>:$numGroups);
+  let assemblyFormat = [{
+    $asyncDependencies attr-dict
+  }];
+}
+
+def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
+  let summary = "Creates a `nvgpu.mbarrier` object.";
+  let description = [{
+    The Op generates one or more `mbarrier` objects, which are barriers created
+    in shared memory that support various synchronization behaviors for threads.
+
+    The `mbarrier` object has the following type and alignment requirements:
+      Type: .b64, Alignment: 8, Memory space: .shared
+    
+    Example:
+    ```mlir
+      %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+    ```
+    }];
+  let arguments = (ins);
+  let results = (outs NVGPU_MBarrierGroup:$barriers);
+  let assemblyFormat = [{
+     attr-dict `->` type($barriers)
+  }];
+}
+
+def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
+  let summary = "Initialize the `nvgpu.mbarrier`.";
+  let description = [{
+    The Op initializes the `mbarrier` object with the given number of threads.
+
+    Example:
+    ```mlir
+      %num_threads = gpu.block_dim x
+      %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+      nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$count, Index:$mbarId, Optional<I1>:$predicate);
+  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
+}
+
+def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
+  let summary = "Checks if the `nvgpu.mbarrier` has completed its current phase.";
+  let description = [{
+    Checks whether the mbarrier object has completed the phase. It is a
+    non-blocking instruction that tests for the completion of the phase.
+
+    Example:
+    ```mlir
+      %isComplete = nvgpu.mbarrier.test.wait %barrier, %token : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrierGroup:$barriers, NVGPU_MBarrierToken:$token, Index:$mbarId);
+  let results = (outs I1:$waitComplete);
+  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $token attr-dict `:` type($barriers) `,` type($token)";
+}
+
+def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
+  let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive`.";
+  let description = [{
+    The Op performs the arrive-on operation on the `mbarrier` object and
+    returns a `nvgpu.mbarrier.token`.
+
+    For more information, see
+    https://docs.nvidia.com/cuda/parallel-thread-execution/#arrive-on-operation-on-mbarrier-object
+
+    Example:
+    ```mlir
+      %token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId);
+  let results = (outs NVGPU_MBarrierToken:$token);
+  let assemblyFormat = "$barriers `[` $mbarId `]` attr-dict `:` type($barriers) `->` type($token)";
+}
+
+def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> {
+  let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive.nocomplete` as non-blocking.";
+  let description = [{
+    The Op performs the arrive-on operation on the `mbarrier` object and
+    returns a `nvgpu.mbarrier.token`.
+
+    The Op does not cause the `nvgpu.mbarrier` to complete its current phase.
+
+    Example:
+    ```mlir
+      %token = nvgpu.mbarrier.arrive.noComplete %barrier, %count : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId,
+                       Index:$count);
+  let results = (outs NVGPU_MBarrierToken:$token);
+  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers) `->` type($token)";
+}
+
+def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> {
+  let summary = "Performs expect_tx operation on the `nvgpu.mbarrier.arrive`";
+  let description = [{
+    A thread executing the Op performs an expect-tx operation on the mbarrier
+    object at the location specified by the address operand `$barriers`. The
+    expect-tx operation increases the tx-count of the mbarrier object by the
+    value specified by `$txcount`. This makes the current phase of the
+    mbarrier object expect and track the completion of additional
+    asynchronous transactions.
+
+    `$txcount` specifies the number of elements for the expect-tx operation.
+
+    Example:
+    ```mlir
+      nvgpu.mbarrier.arrive.expect_tx %barrier, %ic0 : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$txcount, Index:$mbarId, Optional<I1>:$predicate);
+  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $txcount  (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
+}
+
+def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
+  let summary = "Waits for the `nvgpu.mbarrier` to complete its current phase.";
+  let description = [{
+    Checks whether the mbarrier object has completed the phase. It is a
+    potentially blocking instruction which tests for the completion of the
+    phase. A suspended thread resumes execution when the specified phase
+    completes, or before it completes after a system-dependent time limit.
+
+    `$phaseParity` specifies whether to wait on the even phase (0) or the
+    odd phase (1).
+
+    Example:
+    ```mlir
+      nvgpu.mbarrier.try_wait.parity %barrier, %phaseParity, %ticks : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrierGroup:$barriers, I1:$phaseParity, Index:$ticks, Index:$mbarId);
+  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phaseParity `,` $ticks attr-dict `:` type($barriers)";  
+}
+
+def NVGPU_TmaPrefetchOp : NVGPU_Op<"tma.prefetch.descriptor", []> {
+  let summary = "Prefetch given `nvgpu.tensormap.descriptor` ";
+  let description = [{
+    The Op prefetches the cache line containing the given `$tensorMapDescriptor`
+    for subsequent use by the `tma.async.load` instruction.
+  }];
+  let arguments = (ins NVGPU_TensorMapDescriptor:$tensorMapDescriptor, Optional<I1>:$predicate);
+  let assemblyFormat = [{
+    $tensorMapDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type($tensorMapDescriptor)
+  }];
+}
+
+def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", [AttrSizedOperandSegments]> {
+  let summary = "TMA asynchronous load";
+  let description = [{
+    The Op loads a tile of memory from global memory to shared memory using
+    Tensor Memory Access (TMA).
+
+    `$tensorMapDescriptor` is a tensor map descriptor that carries information
+    about the tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`.
+
+    The Op uses the `$barriers` mbarrier-based completion mechanism.
+  }];  
+  let arguments = (ins  Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
+                        NVGPU_MBarrierGroup:$barriers,
+                        NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
+                        Variadic<Index>:$coordinates, 
+                        Index:$mbarId,
+                        Optional<I16>:$multicastMask,
+                        Optional<I1>:$predicate);
+  let assemblyFormat = [{
+    $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]` 
+      `to` $dst
+      (`multicast_mask` `=` $multicastMask^ )?
+      (`,` `predicate` `=` $predicate^)?
+      attr-dict `:` type($tensorMapDescriptor) `,` type($barriers) 
+      `->` type($dst)
+  }];
+  let hasVerifier = 1;
+
+}
+
+def NVGPU_TmaAsyncStoreOp : NVGPU_Op<"tma.async.store", [AttrSizedOperandSegments]> {
+  let summary = "TMA asynchronous store";
+  let description = [{
+    The Op stores a tile of memory from shared memory to global memory using
+    Tensor Memory Access (TMA).
+
+    `$tensorMapDescriptor` is a tensor map descriptor that carries information
+    about the tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`.
+  }];  
+  let arguments = (ins  Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
+                        Arg<NVGPU_TensorMapDescriptor, "", [MemWriteAt<0, FullEffect>]>:$tensorMapDescriptor,
+                        Variadic<Index>:$coordinates, 
+                        Optional<I1>:$predicate);
+  let assemblyFormat = [{
+      $src `to` $tensorMapDescriptor `[` $coordinates `]`
+      (`,` `predicate` `=` $predicate^)?
+      attr-dict `:` type($src)
+      `->` type($tensorMapDescriptor)
+  }];
+  let hasVerifier = 1;
+}
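A sketch mirroring the load example, with assumed values `%srcShmem` (a shared-memory tile) and `%desc`:

```mlir
// Asynchronously copy the shared-memory tile back to the global region
// described by the tensor map, at coordinates (0, 0).
nvgpu.tma.async.store %srcShmem to %desc[%c0, %c0]
  : memref<128x64xf16, 3>
  -> !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = none,
                                 l2promo = none, oob = zero, interleave = none>
```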
+
+def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
+  let summary = "TMA create descriptor";
+  let description = [{
+    The Op creates a tensor map descriptor object representing a tiled memory
+    region. To do that it calls the CUDA Driver's `cuTensorMapEncodeTiled`. The
+    descriptor is used by Tensor Memory Access (TMA).
+
+    The `tensor` is the source tensor to be tiled.
+
+    The `boxDimensions` are the sizes of the tiled memory region in each dimension.
+
+    For more information, see:
+    https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
+  }];
+
+  let arguments = (ins AnyUnrankedMemRef:$tensor,
+                       Variadic<Index>:$boxDimensions);
+  let results = (outs NVGPU_TensorMapDescriptor:$tensorMap);
+  let assemblyFormat = [{
+         $tensor `box` `[` $boxDimensions `]` attr-dict `:` type($tensor) `->` type($tensorMap)
+  }];
+  let hasVerifier = 1;
+}
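A sketch under the assumption that `%gmem` is an unranked global memref; the 128x64 box size and the result descriptor parameters are illustrative:

```mlir
// Encode a TMA descriptor for 128x64 tiles of %gmem via cuTensorMapEncodeTiled.
%c128 = arith.constant 128 : index
%c64 = arith.constant 64 : index
%desc = nvgpu.tma.create.descriptor %gmem box[%c128, %c64]
  : memref<*xf16> -> !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>,
                                                 swizzle = none, l2promo = none,
                                                 oob = zero, interleave = none>
```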
+
+def NVGPU_WarpgroupGenerateDescriptorOp : NVGPU_Op<"warpgroup.generate.descriptor", []> {
+  let summary = "Generate a warpgroup matrix descriptor";
+  let description = [{
+  This Op builds a `nvgpu.warpgroup.descriptor` that is used by 
+  `nvgpu.warpgroup.mma` to perform warpgroup-level matrix multiply and 
+  accumulate.
+
+  The descriptor specifies the properties of the matrix in shared memory that 
+  is a multiplicand in the matrix multiply and accumulate operation. 
+  }];  
+  let results = (outs NVGPU_WarpgroupMatrixDescriptor:$descriptor);
+  let arguments = (ins Arg<AnyMemRef, "", [MemRead]>:$tensor, 
+                       NVGPU_TensorMapDescriptor:$tensorMap);
+  let assemblyFormat = [{$tensor `,` $tensorMap attr-dict `:` type($tensor) `,` type($tensorMap) `->` type($descriptor)}];
+  let hasVerifier = 1;
+}
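An illustrative use, assuming `%shmemA` is the shared-memory tile and `%tmaDesc` its tensor map descriptor; the types are placeholders:

```mlir
// Build the wgmma matrix descriptor for the A operand in shared memory.
%descA = nvgpu.warpgroup.generate.descriptor %shmemA, %tmaDesc
  : memref<128x64xf16, 3>,
    !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = none,
                                l2promo = none, oob = zero, interleave = none>
  -> !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
```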
+
+def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> {
+  let description = [{
+    The `nvgpu.warpgroup.mma` op performs the warpgroup-level (4 warps)
+    matrix-multiply-and-accumulate (mma) operation that lowers to
+    `nvvm.wgmma.mma_async`.
+
+    The operands `descriptorA` and `descriptorB` are wgmma matrix descriptors
+    that describe the properties of the matrices in shared memory. The results
+    represent the thread-level ownership of the warpgroup-level mma operation
+    shape. The shape is deduced from the descriptor types and the output vector.
+
+    The Op encapsulates multiple `nvvm.wgmma.mma_async` operations to complete
+    the given shape. Because the `nvvm.wgmma.mma_async` Op, or its corresponding
+    PTX instruction, is asynchronous, this Op groups the `nvvm.wgmma.mma_async`
+    Ops and surrounds them with `wgmma.fence.aligned`,
+    `wgmma.commit.group.sync.aligned`, and `wgmma.wait.group.sync.aligned` Ops.
+
+    Example:
+    ```mlir
+      %r1,%r2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc2: 
+                 !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>, 
+                 !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>, 
+                 !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
+                 !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
+                 -> 
+                 !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
+                 !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
+    ```
+  }];
+
+  let arguments = (ins NVGPU_WarpgroupMatrixDescriptor:$descriptorA, 
+                       NVGPU_WarpgroupMatrixDescriptor:$descriptorB,                                               
+                       DefaultValuedOptionalAttr<I64Attr, "1">:$waitGroup,
+                       OptionalAttr<UnitAttr>:$transposeA,
+                       OptionalAttr<UnitAttr>:$transposeB,
+                       NVGPU_WarpgroupAccumulator:$matrixC);
+  let results = (outs NVGPU_WarpgroupAccumulator:$matrixD);
+  let assemblyFormat = [{    
+    $descriptorA`,` $descriptorB`,` $matrixC attr-dict
+    `:` type($descriptorA) `,` type($descriptorB) `,` type($matrixC) `->` type($matrixD)
+  }];
+  let hasVerifier = 1;
+}
+
+def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> {
+  let description = [{
+    The `nvgpu.warpgroup.mma.store` op stores the fragmented result held in
+    `$matrixD` to the given memref.
+
+    [See the details of register fragment layout for accumulator matrix D](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
+
+    Note that the op must be run by a full warpgroup.
+  }];
+
+  let arguments = (ins NVGPU_WarpgroupAccumulator:$matrixD,
+                       Arg<AnyMemRef, "", [MemWrite]>:$dstMemref);
+  
+  let assemblyFormat = [{
+    $matrixD `,` $dstMemref attr-dict `:` type($matrixD) `to` type($dstMemref)
+  }];
+  let hasVerifier = 1;
+}
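A sketch assuming `%matD` is the accumulator produced by `nvgpu.warpgroup.mma` and `%dst` a matching memref; the 64x128 shape is illustrative:

```mlir
// Store the fragmented 64x128 accumulator back to memory.
nvgpu.warpgroup.mma.store %matD, %dst
  : !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
  to memref<64x128xf32, 3>
```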
+
+def NVGPU_WarpgroupMmaInitAccumulatorOp : NVGPU_Op<"warpgroup.mma.init.accumulator"> {  
+  let summary = "Initializes the accumulator matrix";
+
+  let description = [{
+    This Op generates and initializes the accumulator matrix for the
+    `nvgpu.warpgroup.mma` op to perform matrix-multiply-and-accumulate.
+  }];
+  let results = (outs NVGPU_WarpgroupAccumulator:$matrixC);
+  let assemblyFormat = "attr-dict `->` type($matrixC)";
+  let hasVerifier = 1;
+}
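A minimal sketch; the 64x128 fragment shape is a placeholder chosen to match the `nvgpu.warpgroup.mma` example above:

```mlir
// Create and initialize an accumulator to feed into nvgpu.warpgroup.mma.
%acc = nvgpu.warpgroup.mma.init.accumulator
  -> !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
```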
+
+def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
+                                   SameOperandsAndResultType]> {
+  let summary = "The reciprocal calculation for vector types";
+  let description = [{
+    Reciprocal calculation for `vector` types using `nvvm.rcp` Ops.
+
+    Currently, only the `approx` rounding mode and `ftz` are supported, and only for the `f32` type.
+
+    The input and output must be of the same vector type and shape.
+  }];
+  let arguments = (ins VectorOfNonZeroRankOf<[F32]>:$in,
+                       DefaultValuedAttr<RcpRoundingModeAttr, "RcpRoundingMode::APPROX">:$rounding,
+                       UnitAttr:$ftz);
+  let results = (outs VectorOfNonZeroRankOf<[F32]>:$out);
+  let assemblyFormat = [{
+    $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}` 
+    attr-dict `:` type($out)
+  }];
+  let hasVerifier = 1;
+}
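A sketch of the assembly format above; `%in` is assumed to be an `f32` vector and the vector shape is illustrative:

```mlir
// Elementwise approximate reciprocal with flush-to-zero on an f32 vector.
%out = nvgpu.rcp %in {rounding = approx, ftz} : vector<16xf32>
```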
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
new file mode 100644
index 0000000000000..8966ec359843b
--- /dev/null
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
@@ -0,0 +1,112 @@
+//===- NVGPUTypes.td - NVGPU types -------------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVGPU dialect types.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// NVGPU Type Definitions
+//===----------------------------------------------------------------------===//
+
+include "mlir/IR/AttrTypeBase.td"
+include "mlir/Dialect/NVGPU/IR/NVGPU.td"
+
+class NVGPU_Type<string name, string typeMnemonic,
+        list<Trait> traits = []> : TypeDef<NVGPU_Dialect, name, traits> {
+  let mnemonic = typeMnemonic;
+}
+
+def NVGPU_DeviceAsyncToken : NVGPU_Type<"DeviceAsyncToken",
+                                        "device.async.token", []> {
+  let summary = "device async token type";
+  let description = [{
+    `nvgpu.device.async.token` is a type returned by an asynchronous operation
+    that runs on the GPU (device). It is used to establish an SSA-based link
+    between the async operation (e.g. DeviceAsyncCopy) and operations that
+    group or synchronize the async operations (e.g. DeviceAsyncCreateGroupOp,
+    DeviceAsyncWaitOp).
+  }];
+}
+
+def NVGPU_MBarrierGroup : NVGPU_Type<"MBarrierGroup", "mbarrier.group", []> {
+  let summary = "mbarrier barrier type";
+  let description = [{
+    This is the type for one or more mbarrier objects in shared memory that are
+    used to synchronize a variable number of threads.
+
+    If `num_barriers` is not set, the number of mbarrier objects is 1.
+
+    An mbarrier object is 64 bits wide with 8-byte alignment. An mbarrier
+    object can be initialized and invalidated.
+
+    [See the PTX ISA for more details](https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object)
+  }];    
+  let parameters = (ins "Attribute":$memorySpace, DefaultValuedParameter<"unsigned", "1">:$num_barriers);
+  let assemblyFormat = "`<` struct(params) `>`";
+  let builders = [
+    TypeBuilder<(ins "Attribute":$memorySpace), [{
+      return $_get($_ctxt, memorySpace, 1);
+    }]>
+  ];
+}
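Illustrative spellings of the type, written here as type aliases; the workgroup memory space attribute and the barrier count of 4 are assumptions:

```mlir
// A single mbarrier object (num_barriers defaults to 1).
!single_mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
// A group of 4 mbarrier objects.
!mbarrier_group = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>,
                                        num_barriers = 4>
```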
+
+def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map
+def NVGPU_TensorMapDescriptor : NVGPU_Type<"TensorMapDescriptor", "tensormap.descriptor", []> {
+  let summary = "TensorMap descriptor";
+  let parameters = (ins "MemRefType":$tensor,
+                        EnumParameter<TensorMapSwizzleKind>:$swizzle,
+                        EnumParameter<TensorMapL2PromoKind>:$l2promo,
+                        EnumParameter<TensorMapOOBKind>:$oob,
+                        EnumParameter<TensorMapInterleaveKind>:$interleave);
+  let description = [{
+    `nvgpu.tensormap.descriptor` is a type that represents a TMA descriptor.
+    It is a 128-byte object residing either in constant space or in the kernel
+    parameter space.
+  }];
+  let assemblyFormat = "`<` struct(params) `>`";
+}
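An illustrative alias showing how the parameters compose; the enum value spellings (`none`, `zero`) are assumed and the tile type is a placeholder:

```mlir
// Descriptor for 128x64 f16 tiles: no swizzle, no L2 promotion, zero-fill
// out-of-bounds accesses, no interleave.
!tile_map = !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>,
                                        swizzle = none, l2promo = none,
                                        oob = zero, interleave = none>
```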
+
+def NVGPU_WarpgroupMatrixDescriptor : NVGPU_Type<"WarpgroupMatrixDescriptor", "warpgroup.descriptor", []> {
+  let summary = "Warpgroup matrix descriptor type";
+  let description = [{
+  The descriptor specifies the properties of the matrix in shared memory that 
+  is a multiplicand in the matrix multiply and accumulate operation. 
+  
+  The descriptor is a 64-bit value contained in a register, with the following layout:
+  ```
+  +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
+  |   0-13  |14-15|   16-29   |30-31|   32-45   |46-48|49-51|   52-61   |62-63|
+  +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
+  |  14bits |2bits|   14bits  |2bits|   14bits  |2bits|3bits|   10bits  |2bits|
+  +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
+  | BaseAddr|  0  | LeadingDim|  0  |   Stride  |  0  |Offst|     0     |Swzle|
+  +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
+  ```
+   
+  [See the PTX ISA for more details](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-shared-memory-layout-matrix-descriptor)
+  
+  }];  
+  let parameters = (ins "MemRefType":$tensor);
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+def NVGPU_WarpgroupAccumulator : NVGPU_Type<"WarpgroupAccumulator", "warpgroup.accumulator", []> {
+  let parameters = (ins "VectorType":$fragmented);
+  let assemblyFormat = "`<` struct(params) `>`";
+  let description = [{
+    This type represents the result matrix obtained from `nvgpu.warpgroup.mma`. 
+    The `$fragmented` type signifies the distributed or fragmented result 
+    vector that is collectively owned by all the threads in the warp-group 
+    that executed `nvgpu.warpgroup.mma`.
+    [See the details of register fragment layout for accumulator matrix D](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
+  }];
+}
diff --git a/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
index 4d47ce4746dbb..10aa502ee67f8 100644
--- a/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
@@ -5,7 +5,7 @@ add_mlir_dialect_library(MLIRNVGPUDialect
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/NVGPU
 
   DEPENDS
-  MLIRNVGPUIncGen
+  MLIRNVGPUOpsIncGen
   MLIRNVGPUEnumsIncGen
   MLIRNVGPUAttributesIncGen
   MLIRNVGPUTypesIncGen
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index ba86e8d6ceaf9..abbdb6a0f53ec 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -35,7 +35,7 @@ using namespace mlir::nvgpu;
 void nvgpu::NVGPUDialect::initialize() {
   addTypes<
 #define GET_TYPEDEF_LIST
-#include "mlir/Dialect/NVGPU/IR/NVGPUTypes.cpp.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.cpp.inc"
       >();
   addAttributes<
 #define GET_ATTRDEF_LIST
@@ -43,7 +43,7 @@ void nvgpu::NVGPUDialect::initialize() {
       >();
   addOperations<
 #define GET_OP_LIST
-#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUOps.cpp.inc"
       >();
 }
 
@@ -681,7 +681,7 @@ LogicalResult RcpOp::verify() {
 #include "mlir/Dialect/NVGPU/IR/NVGPUEnums.cpp.inc"
 
 #define GET_OP_CLASSES
-#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUOps.cpp.inc"
 
 #define GET_TYPEDEF_CLASSES
-#include "mlir/Dialect/NVGPU/IR/NVGPUTypes.cpp.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.cpp.inc"

>From 861c6b157a019a3967f493fec431a5ef05cef3a1 Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038 at qq.com>
Date: Wed, 5 Mar 2025 17:02:40 +0800
Subject: [PATCH 2/2] fix python test and nit.

---
 mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td      |  6 +++---
 mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td   |  5 +++++
 mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td | 11 ++++++++---
 mlir/python/mlir/dialects/NVGPUOps.td            |  2 +-
 4 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index 6b5470310e4a1..7f7a54cb0c57e 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVGPU
-#define NVGPU
+#ifndef MLIR_DIALECT_NVGPU_IR_NVGPU_TD
+#define MLIR_DIALECT_NVGPU_IR_NVGPU_TD
 
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
@@ -116,4 +116,4 @@ def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
 def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
 def RcpRoundingModeAttr : EnumAttr<NVGPU_Dialect, RcpRoundingMode, "rcp_rounding_mode">;
 
-#endif // NVGPU
+#endif // MLIR_DIALECT_NVGPU_IR_NVGPU_TD
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
index ab2ddf278b4a4..eb0fb90d271ed 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -17,6 +17,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef MLIR_DIALECT_NVGPU_IR_NVGPUOPS_TD
+#define MLIR_DIALECT_NVGPU_IR_NVGPUOPS_TD
+
 include "mlir/Dialect/NVGPU/IR/NVGPU.td"
 include "mlir/Dialect/NVGPU/IR/NVGPUTypes.td"
 
@@ -631,3 +634,5 @@ def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
   }];
   let hasVerifier = 1;
 }
+
+#endif // MLIR_DIALECT_NVGPU_IR_NVGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
index 8966ec359843b..8836a1a9dfcd8 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
@@ -11,13 +11,16 @@
 //===----------------------------------------------------------------------===//
 
 
-//===----------------------------------------------------------------------===//
-// NVGPU Type Definitions
-//===----------------------------------------------------------------------===//
+#ifndef MLIR_DIALECT_NVGPU_IR_NVGPUTYPES_TD
+#define MLIR_DIALECT_NVGPU_IR_NVGPUTYPES_TD
 
 include "mlir/IR/AttrTypeBase.td"
 include "mlir/Dialect/NVGPU/IR/NVGPU.td"
 
+//===----------------------------------------------------------------------===//
+// NVGPU Type Definitions
+//===----------------------------------------------------------------------===//
+
 class NVGPU_Type<string name, string typeMnemonic,
         list<Trait> traits = []> : TypeDef<NVGPU_Dialect, name, traits> {
   let mnemonic = typeMnemonic;
@@ -110,3 +113,5 @@ def NVGPU_WarpgroupAccumulator : NVGPU_Type<"WarpgroupAccumulator", "warpgroup.a
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d) 
   }];
 }
+
+#endif //MLIR_DIALECT_NVGPU_IR_NVGPUTYPES_TD
diff --git a/mlir/python/mlir/dialects/NVGPUOps.td b/mlir/python/mlir/dialects/NVGPUOps.td
index ae54822cd9070..cdf651901e074 100644
--- a/mlir/python/mlir/dialects/NVGPUOps.td
+++ b/mlir/python/mlir/dialects/NVGPUOps.td
@@ -9,6 +9,6 @@
 #ifndef PYTHON_BINDINGS_NVGPU_OPS
 #define PYTHON_BINDINGS_NVGPU_OPS
 
-include "mlir/Dialect/NVGPU/IR/NVGPU.td"
+include "mlir/Dialect/NVGPU/IR/NVGPUOps.td"
 
 #endif


