[Mlir-commits] [mlir] [MLIR][NVVM] Add tcgen05.mma MLIR Ops (PR #164356)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Mon Oct 20 21:26:04 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-mlir-llvm

Author: Pradeep Kumar (schwarzschild-radius)

<details>
<summary>Changes</summary>

This commit adds support for the tcgen05.mma family of instructions in the NVVM MLIR dialect and lowers them to LLVM intrinsics. Please refer to the [PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions) for more information.

---

Patch is 472.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/164356.diff


15 Files Affected:

- (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td (+639) 
- (modified) mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp (+546) 
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-shared.mlir (+229) 
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-tensor.mlir (+229) 
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-invalid.mlir (+119) 
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-shared.mlir (+466) 
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-shared.mlir (+229) 
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-tensor.mlir (+229) 
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-shared.mlir (+442) 
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-tensor.mlir (+634) 
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-tensor.mlir (+634) 
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-shared.mlir (+133) 
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-shared.mlir (+133) 
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-tensor.mlir (+133) 
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-tensor.mlir (+133) 


``````````diff
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index d959464836043..a580a7f42bccc 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -4537,6 +4537,645 @@ def NVVM_ClusterLaunchControlQueryCancelOp
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.mma attributes
+//===----------------------------------------------------------------------===//
+
+def Tcgen05MMAKindF16          : I32EnumAttrCase<"F16",    0, "f16">;
+def Tcgen05MMAKindTF32         : I32EnumAttrCase<"TF32",   1, "tf32">;
+def Tcgen05MMAKindF8F6F4       : I32EnumAttrCase<"F8F6F4", 2, "f8f6f4">;
+def Tcgen05MMAKindINT8         : I32EnumAttrCase<"I8",     3, "i8">;
+
+def Tcgen05MMAKind : I32EnumAttr<
+  "Tcgen05MMAKind",
+  "tcgen05 MMA Supported Types",
+  [Tcgen05MMAKindF8F6F4, Tcgen05MMAKindINT8, Tcgen05MMAKindF16,
+   Tcgen05MMAKindTF32]> {
+    let cppNamespace = "::mlir::NVVM";
+    let genSpecializedAttr = 0;
+}
+
+def Tcgen05MMAKindAttr : EnumAttr<NVVM_Dialect, Tcgen05MMAKind, "tcgen05_mma_kind"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+def Tcgen05MMACollectorOpDiscard  : I32EnumAttrCase<"DISCARD", 0, "discard">;
+def Tcgen05MMACollectorOpLastUse  : I32EnumAttrCase<"LASTUSE", 1, "lastuse">;
+def Tcgen05MMACollectorOpFill     : I32EnumAttrCase<"FILL",    2, "fill">;
+def Tcgen05MMACollectorOpUse      : I32EnumAttrCase<"USE",     3, "use">;
+
+def Tcgen05MMACollectorOp : I32EnumAttr<
+  "Tcgen05MMACollectorOp",
+  "tcgen05.mma Collector Buffer Operation",
+  [Tcgen05MMACollectorOpDiscard,
+   Tcgen05MMACollectorOpLastUse,
+   Tcgen05MMACollectorOpFill,
+   Tcgen05MMACollectorOpUse]> {
+    let cppNamespace = "::mlir::NVVM";
+    let genSpecializedAttr = 0;
+}
+
+def Tcgen05MMACollectorOpAttr : EnumAttr<NVVM_Dialect, Tcgen05MMACollectorOp, "tcgen05_mma_collectorop"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.mma Ops.
+//===----------------------------------------------------------------------===//
+
+def NVVM_Tcgen05MMAOp : NVVM_Op<"tcgen05.mma", [AttrSizedOperandSegments]> {
+
+  let summary = "Performs MMA operation on 5th-gen tensor cores";
+
+  let arguments = (ins
+      // Attributes
+      Tcgen05MMAKindAttr:$kind,
+      CTAGroupKindAttr:$ctaGroup,
+      DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
+                        "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
+      UnitAttr:$ashift,
+      // Arguments
+      LLVM_PointerTensor:$d,
+      AnyTypeOf<[LLVM_PointerTensor, I64]>:$a,
+      I64:$b,
+      I32:$idesc,
+      I1:$enableInputD,
+      // Optional arguments
+      Optional<I64>:$scaleInputD,
+      Optional<FixedVectorOfLengthAndType<[4, 8], [I32]>>:$disableOutputLane
+    );
+
+  let description = [{
+    The `tcgen05.mma` op is asynchronous and performs matrix multiplication
+    and accumulation using 5th-generation tensor cores:
+
+    ```
+    D = A * B + (D * 2^-scaleInputD)     // if `scaleInputD` is provided
+    D = A * B                            // if `enableInputD` is false
+    D = A * B + D                        // otherwise
+    ```
+
+    where:
+    - A is an `M x K` matrix in tensor memory or described using a shared memory descriptor
+    - B is a `K x N` matrix described using a shared memory descriptor
+    - D is an `M x N` accumulator matrix in tensor memory
+
+    A `shared memory descriptor` is a 64-bit value which describes the properties
+    of the multiplicand matrix in shared memory, including its location in the
+    shared memory of the current CTA. For more details, please refer to the
+    [PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-shared-memory-descriptor)
+
+    - `idesc` is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
+
+    Optional Operands:
+    - `scaleInputD` is an immediate value operand used for scaling the D matrix by 2^(-scaleInputD). The valid range is [0, 15]
+
+    - `disableOutputLane` is a vector mask for selective output
+      * vector<4 x i32> when ctaGroup is CTA_1
+      * vector<8 x i32> when ctaGroup is CTA_2
+
+    Required Attributes:
+    - `kind` specifies the computation data type and precision
+      * f16    : 16-bit floating point (half precision)
+      * tf32   : Tensor Float 32 (truncated 32-bit float)
+      * f8f6f4 : Mixed precision FP8/FP6/FP4
+      * i8     : 8-bit integer operations
+
+    - `ctaGroup` specifies CTA group configuration
+      * cta_1: MMA will be performed on the current thread's CTA
+      * cta_2: MMA will be performed on the current thread and its peer CTA
+
+    Default Attributes:
+    - `collectorOp` specifies the collector buffer operations for matrix A
+      * discard : Release buffer after use (default)
+      * lastuse : Mark buffer for last use
+      * fill    : Fill buffer
+      * use     : Use buffer without modification
+
+    - `ashift` shifts the rows of the A matrix down by one row
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma)
+  }];
+
+  let assemblyFormat = [{
+    $d `,` $a `,` $b `,` $idesc `,` $enableInputD (`scale` `=` $scaleInputD^)?
+    (`mask` `=` $disableOutputLane^)? attr-dict `:` `(` type(operands) `)`
+  }];
+
+  let extraClassDeclaration = [{
+    static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs(
+        Operation &op, LLVM::ModuleTranslation &mt,
+        llvm::IRBuilderBase &builder);
+  }];
+
+  let llvmBuilder = [{
+    auto [ID, args] = NVVM::Tcgen05MMAOp::getIntrinsicIDAndArgs(
+        *op, moduleTranslation, builder);
+    createIntrinsicCall(builder, ID, args);
+  }];
+
+  let hasVerifier = true;
+}
+
+def NVVM_Tcgen05MMASpOp : NVVM_Op<"tcgen05.mma.sp", [AttrSizedOperandSegments]> {
+
+  let summary = "Performs MMA operation with sparse A matrix on 5th-gen tensor cores";
+
+  let arguments = (ins
+    // Attributes
+    Tcgen05MMAKindAttr:$kind,
+    CTAGroupKindAttr:$ctaGroup,
+    DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
+                      "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
+    UnitAttr:$ashift,
+    // Arguments
+    LLVM_PointerTensor:$d,
+    AnyTypeOf<[LLVM_PointerTensor, I64]>:$a,
+    I64:$b,
+    I32:$idesc,
+    I1:$enableInputD,
+    LLVM_PointerTensor:$sparseMetadata,
+    Optional<I64>:$scaleInputD,
+    Optional<FixedVectorOfLengthAndType<[4, 8], [I32]>>:$disableOutputLane
+  );
+
+  let description = [{
+    The `tcgen05.mma.sp` op performs matrix multiplication and accumulation
+    with a sparse `A` matrix using 5th-generation tensor cores.
+
+    It executes a non-blocking `M x N x K` MMA operation:
+    ```
+    D = A * B + (D * 2^-scaleInputD)     // if `scaleInputD` is provided
+    D = A * B                            // if `enableInputD` is false
+    D = A * B + D                        // otherwise
+    ```
+
+    where:
+    - A is an `M x (K / 2)` matrix in tensor memory or described using a shared memory descriptor
+    - B is a `K x N` matrix described using a shared memory descriptor
+    - D is an `M x N` accumulator matrix in tensor memory
+    - `sparseMetadata` specifies the mapping of the `K / 2` non-zero elements to
+      the K elements before performing the MMA operation
+
+    A `shared memory descriptor` is a 64-bit value which describes the properties
+    of the multiplicand matrix in shared memory, including its location in the
+    shared memory of the current CTA. For more details, please refer to the
+    [PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-shared-memory-descriptor)
+
+    - `idesc` is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
+
+    Optional Operands:
+    - `scaleInputD` is an immediate value operand used for scaling the D matrix by 2^(-scaleInputD). The valid range is [0, 15]
+
+    - `disableOutputLane` is a vector mask for selective output
+      * vector<4 x i32> when ctaGroup is CTA_1
+      * vector<8 x i32> when ctaGroup is CTA_2
+
+    Required Attributes:
+    - `kind` specifies the computation data type and precision
+      * f16    : 16-bit floating point (half precision)
+      * tf32   : Tensor Float 32 (truncated 32-bit float)
+      * f8f6f4 : Mixed precision FP8/FP6/FP4
+      * i8     : 8-bit integer operations
+
+    - `ctaGroup` specifies CTA group configuration
+      * cta_1: MMA will be performed on the current thread's CTA
+      * cta_2: MMA will be performed on the current thread and its peer CTA
+
+    Default Attributes:
+    - `collectorOp` specifies the collector buffer operations for matrix A
+      * discard : Release buffer after use (default)
+      * lastuse : Mark buffer for last use
+      * fill    : Fill buffer
+      * use     : Use buffer without modification
+
+    - `ashift` shifts the rows of the A matrix down by one row
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp)
+  }];
+
+  let assemblyFormat = [{
+    $d `,` $a `,` $b `,` $idesc `,` $enableInputD `,` $sparseMetadata (`scale` `=` $scaleInputD^)? (`mask` `=` $disableOutputLane^)? attr-dict `:` `(` type(operands) `)`
+  }];
+
+  let extraClassDeclaration = [{
+    static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs(
+        Operation &op, LLVM::ModuleTranslation &mt,
+        llvm::IRBuilderBase &builder);
+  }];
+
+  let llvmBuilder = [{
+    auto [ID, args] = NVVM::Tcgen05MMASpOp::getIntrinsicIDAndArgs(
+        *op, moduleTranslation, builder);
+    createIntrinsicCall(builder, ID, args);
+  }];
+
+  let hasVerifier = true;
+}
+
+// tcgen05.mma.block_scale attribute
+def Tcgen05MMAKindMXF8F6F4 : I32EnumAttrCase<"MXF8F6F4", 0, "mxf8f6f4">;
+def Tcgen05MMAKindMXF4     : I32EnumAttrCase<"MXF4",     1, "mxf4">;
+def Tcgen05MMAKindMXF4NVF4 : I32EnumAttrCase<"MXF4NVF4", 2, "mxf4nvf4">;
+
+def Tcgen05MMABlockScaleKind : I32EnumAttr<
+  "Tcgen05MMABlockScaleKind",
+  "tcgen05.mma.block_scale supported types",
+  [Tcgen05MMAKindMXF8F6F4, Tcgen05MMAKindMXF4, Tcgen05MMAKindMXF4NVF4]> {
+    let cppNamespace = "::mlir::NVVM";
+    let genSpecializedAttr = 0;
+}
+
+def Tcgen05MMABlockScaleKindAttr : EnumAttr<NVVM_Dialect, Tcgen05MMABlockScaleKind,
+                                            "tcgen05_mma_block_scale_kind"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+def Tcgen05MMABlockScaleDefault : I32EnumAttrCase<"DEFAULT", 0, "default">;
+def Tcgen05MMABlockScaleBlock16 : I32EnumAttrCase<"BLOCK16", 1, "block16">;
+def Tcgen05MMABlockScaleBlock32 : I32EnumAttrCase<"BLOCK32", 2, "block32">;
+
+def Tcgen05MMABlockScale
+    : I32EnumAttr<"Tcgen05MMABlockScale",
+                  "tcgen05.mma block scale attribute",
+                  [Tcgen05MMABlockScaleDefault, Tcgen05MMABlockScaleBlock16,
+                   Tcgen05MMABlockScaleBlock32]> {
+  let cppNamespace = "::mlir::NVVM";
+  let genSpecializedAttr = 0;
+}
+
+def Tcgen05MMABlockScaleAttr : EnumAttr<NVVM_Dialect, Tcgen05MMABlockScale,
+                                          "tcgen05_mma_block_scale"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.mma.block_scale Op
+//===----------------------------------------------------------------------===//
+
+def NVVM_Tcgen05MMABlockScaleOp : NVVM_Op<"tcgen05.mma.block_scale"> {
+
+  let summary = "Performs block scaled MMA operation on 5th-gen tensor cores";
+
+  let arguments = (ins
+      // Attributes
+      Tcgen05MMABlockScaleKindAttr:$kind,
+      CTAGroupKindAttr:$ctaGroup,
+      DefaultValuedAttr<Tcgen05MMABlockScaleAttr,
+                      "Tcgen05MMABlockScale::DEFAULT">:$blockScale,
+      DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
+                        "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
+      // Arguments
+      LLVM_PointerTensor:$d,
+      AnyTypeOf<[LLVM_PointerTensor, I64]>:$a,
+      I64:$b,
+      I32:$idesc, I1:$enableInputD,
+      LLVM_PointerTensor:$scaleA,
+      LLVM_PointerTensor:$scaleB
+    );
+
+  let description = [{
+    `nvvm.tcgen05.mma.block_scale` performs matrix multiplication and
+    accumulation (MMA) using 5th-generation tensor cores. The matrices `A` and
+    `B` are scaled before performing the matrix multiply and accumulate
+    operation.
+
+    It executes a non-blocking `M x N x K` MMA operation:
+
+    ```
+    D = (A * scale_a) * (B * scale_b)        // if `enableInputD` is false
+    D = (A * scale_a) * (B * scale_b) + D    // otherwise
+    ```
+
+    where:
+    - A is an `M x K` matrix in tensor memory or described using a shared memory descriptor
+    - B is a `K x N` matrix described using a shared memory descriptor
+    - D is an `M x N` accumulator matrix in tensor memory
+    - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively
+
+    A `shared memory descriptor` is a 64-bit value which describes the properties
+    of the multiplicand matrix in shared memory, including its location in the
+    shared memory of the current CTA. For more details, please refer to the
+    [PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-shared-memory-descriptor)
+
+    - `idesc` is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
+
+    Required Attributes:
+    - `kind` specifies the computation data type and precision
+      * mxf8f6f4 - MX-floating point formats
+      * mxf4     - MX-floating point formats (FP4)
+      * mxf4nvf4 - MXF4 + custom NVIDIA 4-bit format (with common scaling factor)
+
+    - `ctaGroup` specifies CTA group configuration
+      * cta_1: MMA will be performed on the current thread's CTA
+      * cta_2: MMA will be performed on the current thread and its peer CTA
+
+    Default Attributes:
+    - `collectorOp` specifies the collector buffer operations for matrix A
+      * discard : Release buffer after use (default)
+      * lastuse : Mark buffer for last use
+      * fill    : Fill buffer
+      * use     : Use buffer without modification
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma)
+  }];
+
+  let assemblyFormat = [{
+    $d `,` $a `,` $b `,` $idesc `,` $enableInputD `,` $scaleA `,` $scaleB
+    attr-dict `:` `(` type(operands) `)`
+  }];
+
+  let extraClassDeclaration = [{
+    static mlir::NVVM::IDArgPair
+    getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                          llvm::IRBuilderBase &builder);
+  }];
+
+  let llvmBuilder = [{
+    auto [ID, args] = NVVM::Tcgen05MMABlockScaleOp::getIntrinsicIDAndArgs(
+        *op, moduleTranslation, builder);
+    createIntrinsicCall(builder, ID, args);
+  }];
+  let hasVerifier = true;
+}
+
+def NVVM_Tcgen05MMASpBlockScaleOp : NVVM_Op<"tcgen05.mma.sp.block_scale"> {
+
+  let summary = "Performs block scaled MMA operation with sparse A matrix on 5th-gen tensor cores";
+
+  let arguments = (ins
+    // Attributes
+    Tcgen05MMABlockScaleKindAttr:$kind,
+    CTAGroupKindAttr:$ctaGroup,
+    DefaultValuedAttr<Tcgen05MMABlockScaleAttr,
+                      "Tcgen05MMABlockScale::DEFAULT">:$blockScale,
+    DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
+                      "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
+    // Arguments
+    LLVM_PointerTensor:$d,
+    AnyTypeOf<[LLVM_PointerTensor, I64]>:$a,
+    I64:$b,
+    I32:$idesc,
+    I1:$enableInputD,
+    LLVM_PointerTensor:$sparseMetadata,
+    LLVM_PointerTensor:$scaleA,
+    LLVM_PointerTensor:$scaleB
+  );
+
+  let description = [{
+    `nvvm.tcgen05.mma.sp.block_scale` is an asynchronous op which performs
+    matrix multiplication and accumulation with a sparse `A` matrix using
+    5th-generation tensor cores:
+
+    ```
+    D = (A * scale_a) * (B * scale_b)        // if `enableInputD` is false
+    D = (A * scale_a) * (B * scale_b) + D    // otherwise
+    ```
+
+    where:
+    - A is an `M x (K / 2)` matrix in tensor memory or described using a shared memory descriptor
+    - B is a `K x N` matrix described using a shared memory descriptor
+    - D is an `M x N` accumulator matrix in tensor memory
+    - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively
+
+    A `shared memory descriptor` is a 64-bit value which describes the properties
+    of the multiplicand matrix in shared memory, including its location in the
+    shared memory of the current CTA. For more details, please refer to the
+    [PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-shared-memory-descriptor)
+
+    Operands:
+    - `idesc` is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
+
+    - `sparseMetadata` specifies the mapping of the `K / 2` non-zero elements to
+      the K elements before performing the MMA operation
+
+    Required Attributes:
+    - `kind` specifies the computation data type and precision
+      * mxf8f6f4 - MX-floating point formats
+      * mxf4     - MX-floating point formats (FP4)
+      * mxf4nvf4 - MXF4 + custom NVIDIA 4-bit format (with common scaling factor)
+
+    - `ctaGroup` specifies CTA group configuration
+      * cta_1: MMA will be performed on the current thread's CTA
+      * cta_2: MMA will be performed on the current thread and its peer CTA
+
+    Default Attributes:
+    - `collectorOp` specifies the collector buffer operations for matrix A
+      * discard : Release buffer after use (default)
+      * lastuse : Mark buffer for last use
+      * fill    : Fill buffer
+      * use     : Use buffer without modification
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp)
+  }];
+
+  let assemblyFormat = [{
+    $d `,` $a `,` $b `,` $idesc `,` $enableInputD `,` $sparseMetadata `,`  $scaleA `,`  $scaleB
+    attr-dict `:` `(` type(operands) `)`
+  }];
+
+  let extraClassDeclaration = [{
+    static mlir::NVVM::IDArgPair
+    getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                          llvm::IRBuilderBase &builder);
+  }];
+
+  let llvmBuilder = [{
+    auto [ID, args] = NVVM::Tcgen05MMASpBlockScaleOp::getIntrinsicIDAndArgs(
+        *op, moduleTranslation, builder);
+    createIntrinsicCall(builder, ID, args);
+  }];
+
+  let hasVerifier = true;
+}
+
+def Tcgen05MMACollectorBBuffer0  : I32EnumAttrCase<"B0", 0, "b0">;
+def Tcgen05MMACollectorBBuffer1  : I32EnumAttrCase<"B1", 1, "b1">;
+def Tcgen05MMACollectorBBuffer2  : I32EnumAttrCase<"B2", 2, "b2">;
+def Tcgen05MMACollectorBBuffer3  : I32EnumAttrCase<"B3", 3, "b3">;
+
+def Tcgen05MMACollectorBBuffer : I32EnumAttr<
+  "Tcgen05MMACollectorBBuffer",
+  "tcgen05 MMA Collector Buffer B Attribute",
+  [Tcgen05MMACollectorBBuffer0,
+   Tcgen05MMACollectorBBuffer1,
+   Tcgen05MMACollectorBBuffer2,
+   Tcgen05MMACollectorBBuffer3]> {
+    let cppNamespace = "::mlir::NVVM";
+    let genSpecializedAttr = 0;
+}
+
+def Tcgen05MMACollectorBBufferAttr : EnumAttr<NVVM_Dialect, Tcgen05MMACollectorBBuffer, "tcgen05_mma_collectorb"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.mma.ws Op
+//===----------------------------------------------------------------------===//
+
+def NVVM_Tcgen05MMAWsOp : NVVM_Op<"tcgen05.mma.ws"> {
+  let summary = "Performs weight stationary convolution MMA operation on 5th-gen tensor cores";
+
+  let arguments = (ins
+    // Attributes
+    Tcgen05MMAKindAttr:$kind,
+    DefaultValuedAttr<Tcgen05MMACollectorBBufferAttr,
+                      "Tcgen05MMACollectorBBuffer::B0">:$collectorBBuffer,
+    DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
+                      "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
+    // Arguments
+    LLVM_PointerTensor:$d,
+    AnyTypeOf<[LLVM_PointerTensor, I64]>:$a,
+    I64:$b,
+    I32:$idesc,
+    I1:$enableInputD,
+    Optional<I64>:$zeroColMask
+  );...
[truncated]

``````````
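
For readers skimming the patch, a minimal usage sketch of the dense op, written against the `assemblyFormat` declared above, may help. This is illustrative only: the SSA value names are invented, the `#nvvm.tcgen05_mma_kind<...>` mnemonic comes from the attribute definitions above, and the CTA-group literal and the tensor-memory address space are assumptions, not lines taken from the patch's tests:

```mlir
// Hypothetical sketch, not taken from the patch's tests.
// %d points to tensor memory (assumed addrspace 6), %a and %b are 64-bit
// shared memory descriptors, %idesc is the 32-bit instruction descriptor,
// and %enable (i1) selects between D = A*B and D = A*B + D.
nvvm.tcgen05.mma %d, %a, %b, %idesc, %enable
  {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>}
  : (!llvm.ptr<6>, i64, i64, i32, i1)

// With both optional operands: an immediate exponent scaling D by
// 2^(-%scaleD), and an output-lane mask (vector<4xi32> because the
// CTA group is cta_1).
nvvm.tcgen05.mma %d, %a, %b, %idesc, %enable scale = %scaleD mask = %mask
  {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>}
  : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4xi32>)
```

The `.sp`, `.block_scale`, and `.ws` variants read the same way, with their extra operands (`sparseMetadata`, `scaleA`/`scaleB`, `zeroColMask`) slotted in as declared by their respective `assemblyFormat`s.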

</details>


https://github.com/llvm/llvm-project/pull/164356


More information about the Mlir-commits mailing list