[Mlir-commits] [mlir] [mlir][nvvm] fix a bug when checking layout that cannot be transposed (PR #97538)

Wed Jul 3 07:33:40 PDT 2024

https://github.com/shubaoyu2 updated https://github.com/llvm/llvm-project/pull/97538

>From f2f072920b322857ed3ad623aac30cfbc0e27265 Mon Sep 17 00:00:00 2001
From: bangyu shen <94283495+shubaoyu2 at users.noreply.github.com>
Date: Wed, 3 Jul 2024 16:36:56 +0800
Subject: [PATCH 1/7] fix a bug when checking layout that cannot be transposed

WGMMA expects the layouts of A/B to be row/col; the transposed version is col/row. Transposed layouts are only available for f16/bf16, so when checking that other data types do not use a transposed layout, the verifier should reject col-major for A and row-major for B.
---
 mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 036a9a15af838..48f44165ccc58 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -880,7 +880,7 @@ LogicalResult NVVM::WgmmaMmaAsyncOp::verify() {
   // Check transpose (only available for f16/bf16)
   if ((typeA != WGMMATypes::f16 && typeA != WGMMATypes::bf16) &&
       (getLayoutA() == mlir::NVVM::MMALayout::col ||
-       getLayoutB() == mlir::NVVM::MMALayout::col)) {
+       getLayoutB() == mlir::NVVM::MMALayout::row)) {
     return emitOpError()
            << "given layouts layout_a = " << stringifyMMALayout(getLayoutA())
            << " and layout_b = " << stringifyMMALayout(getLayoutB())

>From aec1049febb1893a5cdb57ff7c2981001dbe355d Mon Sep 17 00:00:00 2001
From: bangyu shen <94283495+shubaoyu2 at users.noreply.github.com>
Date: Wed, 3 Jul 2024 17:01:35 +0800
Subject: [PATCH 2/7] ch

---
 mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 1834 ++++++++------------
 1 file changed, 693 insertions(+), 1141 deletions(-)

diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 48f44165ccc58..375e2951a037c 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1,1142 +1,694 @@
-//===- NVVMDialect.cpp - NVVM IR Ops and Dialect registration -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the types and operation details for the NVVM IR dialect in
-// MLIR, and the LLVM IR dialect.  It also registers the dialect.
-//
-// The NVVM dialect only contains GPU specific additions on top of the general
-// LLVM dialect.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
-
-#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
-#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
-#include "mlir/Dialect/Utils/StaticValueUtils.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/BuiltinAttributes.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/Diagnostics.h"
-#include "mlir/IR/DialectImplementation.h"
-#include "mlir/IR/MLIRContext.h"
-#include "mlir/IR/Operation.h"
-#include "mlir/IR/OperationSupport.h"
-#include "mlir/IR/Types.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/TypeSwitch.h"
-#include "llvm/AsmParser/Parser.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <optional>
-#include <string>
-
-using namespace mlir;
-using namespace NVVM;
-
-#include "mlir/Dialect/LLVMIR/NVVMOpsDialect.cpp.inc"
-#include "mlir/Dialect/LLVMIR/NVVMOpsEnums.cpp.inc"
-
-//===----------------------------------------------------------------------===//
-// Printing/parsing for NVVM ops
-//===----------------------------------------------------------------------===//
-
-static void printNVVMIntrinsicOp(OpAsmPrinter &p, Operation *op) {
-  p << " " << op->getOperands();
-  if (op->getNumResults() > 0)
-    p << " : " << op->getResultTypes();
+// RUN: mlir-opt --convert-nvvm-to-llvm --convert-arith-to-llvm --split-input-file %s | FileCheck %s
+
+// Same below, but using the `ConvertToLLVMPatternInterface` entry point
+// and the generic `convert-to-llvm` pass.
+// RUN: mlir-opt --convert-to-llvm --split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: @init_mbarrier
+llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %count : i32, %pred : i1) {
+  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.shared.b64 [$0], $1;", "r,r,b" 
+  nvvm.mbarrier.init.shared %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1 
+  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.b64 [$0], $1;", "l,r,b" 
+  nvvm.mbarrier.init %barrier_gen, %count, predicate = %pred : !llvm.ptr, i32, i1
+  llvm.return
+}
+
+// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
+llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
+  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r"
+  nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount : !llvm.ptr<3>, i32
+  //CHECK:  llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r,b"
+  nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1 
+  llvm.return
+}
+
+// CHECK-LABEL: @init_mbarrier_arrive_expect_tx_generic
+llvm.func @init_mbarrier_arrive_expect_tx_generic(%barrier : !llvm.ptr, %txcount : i32, %pred : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r" 
+  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr, i32
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r,b"
+  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr, i32, i1 
+  llvm.return
+}
+
+// CHECK-LABEL: @init_mbarrier_try_wait_shared
+llvm.func @init_mbarrier_try_wait_shared(%barrier : !llvm.ptr<3>, %ticks : i32, %phase : i32) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{
+  // CHECK-SAME: .reg .pred       P1;
+  // CHECK-SAME: LAB_WAIT: 
+  // CHECK-SAME: mbarrier.try_wait.parity.shared.b64 P1, [$0], $1, $2;
+  // CHECK-SAME: @P1 bra.uni DONE;
+  // CHECK-SAME: bra.uni     LAB_WAIT;
+  // CHECK-SAME: DONE:
+  // CHECK-SAME: }",
+  // CHECK-SAME: "r,r,r"
+   nvvm.mbarrier.try_wait.parity.shared %barrier, %phase, %ticks : !llvm.ptr<3>, i32, i32
+  llvm.return
+}
+
+// CHECK-LABEL: @init_mbarrier_try_wait
+llvm.func @init_mbarrier_try_wait(%barrier : !llvm.ptr, %ticks : i32, %phase : i32){
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att
+  // CHECK-SAME: "{
+  // CHECK-SAME: .reg .pred       P1;
+  // CHECK-SAME: LAB_WAIT: 
+  // CHECK-SAME: mbarrier.try_wait.parity.b64 P1, [$0], $1, $2;
+  // CHECK-SAME: @P1 bra.uni DONE;
+  // CHECK-SAME: bra.uni     LAB_WAIT;
+  // CHECK-SAME: DONE:
+  // CHECK-SAME: }",
+  // CHECK-SAME: "l,r,r"
+  nvvm.mbarrier.try_wait.parity %barrier, %phase, %ticks : !llvm.ptr, i32, i32
+  llvm.return
+}
+
+// CHECK-LABEL: @async_cp
+func.func @async_cp(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>) {
+  // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache =  ca : !llvm.ptr<3>, !llvm.ptr<1>
+  nvvm.cp.async.shared.global %dst, %src, 16, cache =  ca : !llvm.ptr<3>, !llvm.ptr<1>
+  // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache =  cg : !llvm.ptr<3>, !llvm.ptr<1>
+  nvvm.cp.async.shared.global %dst, %src, 16, cache =  cg : !llvm.ptr<3>, !llvm.ptr<1>
+  return
+}
+
+// CHECK-LABEL: @async_cp_zfill
+func.func @async_cp_zfill(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>, %cpSize: i32) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.cg.shared.global [$0], [$1], $2, $3;\0A", 
+  // CHECK-SAME: "r,l,n,r" %{{.*}}, %{{.*}}, %{{.*}} : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> ()
+  nvvm.cp.async.shared.global %dst, %src, 16, cache =  cg, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.ca.shared.global [$0], [$1], $2, $3;\0A", 
+  // CHECK-SAME: "r,l,n,r" %{{.*}}, %{{.*}}, %{{.*}} : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> ()
+  nvvm.cp.async.shared.global %dst, %src, 4, cache =  ca, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32
+  return
+}
+
+// CHECK-LABEL: @cp_async_mbarrier_arrive
+func.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.ptr) {
+  // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}}
+  nvvm.cp.async.mbarrier.arrive %bar_gen : !llvm.ptr
+  // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} {noinc = true}
+  nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr
+  // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}}
+  nvvm.cp.async.mbarrier.arrive.shared %bar_shared : !llvm.ptr<3>
+  // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} {noinc = true}
+  nvvm.cp.async.mbarrier.arrive.shared %bar_shared {noinc = true} : !llvm.ptr<3>
+  llvm.return
+}
+
+// CHECK-LABEL: @tma_load_3d_all
+func.func @tma_load_3d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4} ], [$5],{$6}, $7, $8;", "r,l,r,r,r,r,h,h,l"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr  
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4} ], [$5],{$6}, $7, $8;", "r,l,r,r,r,r,h,h,l,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr
+  return
+}
+
+// CHECK-LABEL: @tma_load_4d_all
+func.func @tma_load_4d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5} ], [$6],{$7,$8}, $9, $10;", "r,l,r,r,r,r,r,h,h,h,l"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0,%off1] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr  
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$11 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5} ], [$6],{$7,$8}, $9, $10;", "r,l,r,r,r,r,r,h,h,h,l,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0,%off1] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr
+  return
+}
+
+// CHECK-LABEL: @tma_load_5d_all
+func.func @tma_load_5d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %off0: i16, %off1: i16, %off2: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5,$6} ], [$7],{$8,$9,$10}, $11, $12;", "r,l,r,r,r,r,r,r,h,h,h,h,l"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] im2col[%off0,%off1,%off2] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr  
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$13 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5,$6} ], [$7],{$8,$9,$10}, $11, $12;", "r,l,r,r,r,r,r,r,h,h,h,h,l,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] im2col[%off0,%off1,%off2] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr
+  return
+}
+
+// CHECK-LABEL: @tma_load_1d
+func.func @tma_load_1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0] : !llvm.ptr<3>, !llvm.ptr
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0] predicate=%p : !llvm.ptr<3>, !llvm.ptr
+  return
+}
+
+// CHECK-LABEL: @tma_load_2d
+func.func @tma_load_2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] : !llvm.ptr<3>, !llvm.ptr
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] predicate=%p  : !llvm.ptr<3>, !llvm.ptr
+  return
+}
+
+// CHECK-LABEL: @tma_load_3d
+func.func @tma_load_3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2] : !llvm.ptr<3>, !llvm.ptr
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2] predicate=%p  : !llvm.ptr<3>, !llvm.ptr
+  return
+}
+
+// CHECK-LABEL: @tma_load_4d
+func.func @tma_load_4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr<3>, !llvm.ptr
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] predicate=%p  : !llvm.ptr<3>, !llvm.ptr
+  return
+}
+
+// CHECK-LABEL: @tma_load_5d
+func.func @tma_load_5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr<3>, !llvm.ptr
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] predicate=%p  : !llvm.ptr<3>, !llvm.ptr
+  return
+}
+
+// CHECK-LABEL: @tma_load_multicast1d
+func.func @tma_load_multicast1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2} ], [$3], $4;", "r,l,r,r,h"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2} ], [$3], $4;", "r,l,r,r,h,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<3>, !llvm.ptr
+  return
+}
+
+// CHECK-LABEL: @tma_load_multicast2d
+func.func @tma_load_multicast2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3} ], [$4], $5;", "r,l,r,r,r,h"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3} ], [$4], $5;", "r,l,r,r,r,h,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1] multicast_mask = %multicastMask  predicate=%p  : !llvm.ptr<3>, !llvm.ptr
+  return
+}
+
+// CHECK-LABEL: @tma_load_multicast3d
+func.func @tma_load_multicast3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4} ], [$5], $6;", "r,l,r,r,r,r,h"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4} ], [$5], $6;", "r,l,r,r,r,r,h,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2] multicast_mask = %multicastMask  predicate=%p  : !llvm.ptr<3>, !llvm.ptr
+  return
+}
+
+// CHECK-LABEL: @tma_load_multicast4d
+func.func @tma_load_multicast4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5} ], [$6], $7;", "r,l,r,r,r,r,r,h"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2,%crd3] multicast_mask = %multicastMask: !llvm.ptr<3>, !llvm.ptr
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5} ], [$6], $7;", "r,l,r,r,r,r,r,h,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2,%crd3] multicast_mask = %multicastMask predicate=%p  : !llvm.ptr<3>, !llvm.ptr
+  return
+}
+
+// CHECK-LABEL: @tma_load_multicast5d
+func.func @tma_load_multicast5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5,$6} ], [$7], $8;", "r,l,r,r,r,r,r,r,h"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2,%crd3,%crd4] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5,$6} ], [$7], $8;", "r,l,r,r,r,r,r,r,h,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2,%crd3,%crd4] multicast_mask = %multicastMask predicate=%p  : !llvm.ptr<3>, !llvm.ptr
+  return
+}
+
+// CHECK-LABEL: @tma_store_1d
+func.func @tma_store_1d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [$0, {$2} ], [$1];", "l,r,r"
+  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0] : !llvm.ptr, !llvm.ptr<3>, i32
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$3 cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [$0, {$2} ], [$1];", "l,r,r,b"
+  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i1
+  return
+}
+
+// CHECK-LABEL: @tma_store_2d
+func.func @tma_store_2d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [$0, {$2, $3} ], [$1];", "l,r,r,r"
+  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1] : !llvm.ptr, !llvm.ptr<3>, i32, i32
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [$0, {$2, $3} ], [$1];", "l,r,r,r,b"
+  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i1
+  return
+}
+
+// CHECK-LABEL: @tma_store_3d
+func.func @tma_store_3d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [$0, {$2, $3, $4} ], [$1];", "l,r,r,r,r"
+  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [$0, {$2, $3, $4} ], [$1];", "l,r,r,r,r,b"
+  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i1
+  return
+}
+
+// CHECK-LABEL: @tma_store_4d
+func.func @tma_store_4d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5} ], [$1];", "l,r,r,r,r,r"
+  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5} ], [$1];", "l,r,r,r,r,r,b"
+  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i1
+  return
+}
+
+// CHECK-LABEL: @tma_store_5d
+func.func @tma_store_5d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) {
+  // CHECK-NEXT: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5, $6} ], [$1];", "l,r,r,r,r,r,r"
+  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i32
+
+  // CHECK-NEXT: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5, $6} ], [$1];", "l,r,r,r,r,r,r,b"
+  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3,%crd4], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i32, i1
+  return
+}
+
+// CHECK-LABEL: @wgmma_execute
+func.func @wgmma_execute() {  
+  nvvm.wgmma.fence.aligned
+  nvvm.wgmma.commit.group.sync.aligned
+  nvvm.wgmma.wait.group.sync.aligned 0
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "wgmma.fence.sync.aligned;"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "wgmma.commit_group.sync.aligned;"
+  // CHECK: %[[S0:.+]] = llvm.mlir.constant(0 : i32) : i32
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "wgmma.wait_group.sync.aligned $0;", "n" %[[S0]] : (i32)
+  
+
+  nvvm.wgmma.fence.aligned
+  nvvm.wgmma.commit.group.sync.aligned
+  nvvm.wgmma.wait.group.sync.aligned 5
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "wgmma.fence.sync.aligned;"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "wgmma.commit_group.sync.aligned;"
+  // CHECK: %[[S1:.+]] = llvm.mlir.constant(5 : i32) : i32
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "wgmma.wait_group.sync.aligned $0;", "n" %[[S1]] : (i32)
+  return
+}
+
+
+// -----
+
+!mat64f32 = !llvm.struct<(
+  f32, f32, f32, f32, f32, f32, f32, f32, 
+  f32, f32, f32, f32, f32, f32, f32, f32)>
+
+// CHECK-LABEL: @wgmma_f32_f16_f16(
+// CHECK-SAME: %[[ARG0:.+]]: i64, %[[ARG1:.+]]: i64
+func.func @wgmma_f32_f16_f16(%descA : i64, %descB : i64) -> !mat64f32{  
+  // CHECK: %[[RES:.*]] = llvm.mlir.undef : !llvm.struct
+  // CHECK: %[[A1:.*]] = llvm.mlir.constant(0 : i32) : i32
+  // CHECK: %[[A2:.*]] = llvm.mlir.constant(-1 : i32) : i32
+  // CHECK: %[[A3:.*]] = llvm.mlir.constant(-1 : i32) : i32
+  // CHECK: %[[A4:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[A5:.*]] = llvm.mlir.constant(0 : i32) : i32
+  // CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
+  // CHECK: %[[V4:.*]] = llvm.extractvalue %[[RES]][4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
+  // CHECK: %[[V11:.*]] = llvm.extractvalue %[[RES]][11] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>  
+  // CHECK: %[[V13:.*]] = llvm.extractvalue %[[RES]][13] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
+  // CHECK: %[[RES1:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{
+  // CHECK-SAME: .reg .pred p;
+  // CHECK-SAME: setp.ne.b32 p, $34, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 
+  // CHECK-SAME: {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}, $32, $33, p, $35,  $36, $37,  $38;\0A}\0A", 
+  // CHECK-SAME: "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,l,l,n,n,n,n,n" 
+  // CHECK-SAME: %[[V0]], %{{.*}}, %{{.*}}, %{{.*}}, %[[V4]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[V11]], %{{.*}}, %[[V13]], %{{.*}}, %{{.*}}, %[[ARG0]], %[[ARG1]], %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A5]] 
+  // CHECK-SAME: : (f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, i64, i64, i32, i32, i32, i32, i32) 
+  // CHECK-SAME: -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+  // CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
+  // CHECK: %[[DESCa:.+]] = llvm.add %[[ARG0]], %[[C2]] : i64
+  // CHECK: %[[DESCb:.+]] = llvm.add %[[ARG1]], %[[C2]] : i64
+  // CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES1]][0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
+  // CHECK: %[[V4_2:.*]] = llvm.extractvalue %[[RES1]][4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
+  // CHECK: %[[V11_2:.*]] = llvm.extractvalue %[[RES1]][11] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>  
+  // CHECK: %[[V13_2:.*]] = llvm.extractvalue %[[RES1]][13] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
+  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{
+    // CHECK-SAME: .reg .pred p;
+    // CHECK-SAME: setp.ne.b32 p, $34, 0;
+    // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 
+    // CHECK-SAME: {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}, $32, $33, p, $35,  $36, $37,  $38;\0A}\0A", 
+    // CHECK-SAME: "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,l,l,n,n,n,n,n" 
+    // CHECK-SAME: %[[V0_2]], %{{.*}}, %{{.*}}, %{{.*}}, %[[V4_2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[V11_2]], %{{.*}}, %[[V13_2]], %{{.*}}, %{{.*}}, %[[DESCa]], %[[DESCb]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} 
+  %result = llvm.mlir.undef : !mat64f32
+  %result1 = nvvm.wgmma.mma_async 
+      %descA, %descB, %result,
+      #nvvm.shape<m = 64, n = 32, k = 16>, 
+      D [<f32>, #nvvm.wgmma_scale_out<zero>],
+      A [<f16>, #nvvm.wgmma_scale_in<neg>, <col>], 
+      B [<f16>, #nvvm.wgmma_scale_in<neg>, <col>]
+      :!mat64f32 -> !mat64f32
+  %c2 = arith.constant 2 : i64
+  %descAnext = arith.addi %descA, %c2 : i64
+  %descBnext = arith.addi %descB, %c2 : i64
+  %result2 = nvvm.wgmma.mma_async 
+      %descAnext, %descBnext, %result1,
+      #nvvm.shape<m = 64, n = 32, k = 16>, 
+      D [<f32>, #nvvm.wgmma_scale_out<zero>],
+      A [<f16>, #nvvm.wgmma_scale_in<neg>, <col>], 
+      B [<f16>, #nvvm.wgmma_scale_in<neg>, <col>]
+      : !mat64f32 -> !mat64f32
+  return %result2 : !mat64f32
+}
+
+// -----
+
+!mat16i32 = !llvm.struct<(i32, i32, i32, i32)>
+
+// CHECK-LABEL: @wgmma_s32_s8_s8_satfinite(
+// CHECK-SAME: %[[ARG0:.+]]: i64, %[[ARG1:.+]]: i64
+func.func @wgmma_s32_s8_s8_satfinite(%descA : i64, %descB : i64) -> !mat16i32{  
+  %result = llvm.mlir.undef : !mat16i32
+// CHECK: %[[RES:.*]] = llvm.mlir.undef : !llvm.struct
+// CHECK: %[[A1:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0]
+// CHECK: %[[V1:.*]] = llvm.extractvalue %[[RES]][1]
+// CHECK: %[[V2:.*]] = llvm.extractvalue %[[RES]][2]
+// CHECK: %[[V3:.*]] = llvm.extractvalue %[[RES]][3]
+// CHECK: %[[RES_2:.*]] =  llvm.inline_asm has_side_effects asm_dialect = att 
+// CHECK-SAME: "{
+// CHECK-SAME: .reg .pred p;
+// CHECK-SAME: setp.ne.b32 p, $10, 0;
+// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite 
+// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" 
+// CHECK-SAME: %[[V0]], %[[V1]], %[[V2]], %[[V3]], %[[ARG0]], %[[ARG1]], %[[A1]] : 
+// CHECK-SAME: (i32, i32, i32, i32, i64, i64, i32) -> !llvm.struct<(i32, i32, i32, i32)>
+// CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES_2]][0]
+// CHECK: %[[V1_2:.*]] = llvm.extractvalue %[[RES_2]][1]
+// CHECK: %[[V2_2:.*]] = llvm.extractvalue %[[RES_2]][2]
+// CHECK: %[[V3_2:.*]] = llvm.extractvalue %[[RES_2]][3]
+// CHECK: %[[RES_3:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 
+// CHECK-SAME: "{
+// CHECK-SAME: .reg .pred p;
+// CHECK-SAME: setp.ne.b32 p, $10, 0;
+// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite 
+// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", 
+// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" 
+// CHECK-SAME: %[[V0_2]], %[[V1_2]], %[[V2_2]], %[[V3_2]], %[[ARG0]], %[[ARG1]], %{{.*}}
+// CHECK: %[[V0_3:.*]] = llvm.extractvalue %[[RES_3]][0]
+// CHECK: %[[V1_3:.*]] = llvm.extractvalue %[[RES_3]][1]
+// CHECK: %[[V2_3:.*]] = llvm.extractvalue %[[RES_3]][2]
+// CHECK: %[[V3_3:.*]] = llvm.extractvalue %[[RES_3]][3]
+// CHECK: %[[RES1:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 
+// CHECK-SAME:"{
+// CHECK-SAME:.reg .pred p;
+// CHECK-SAME: setp.ne.b32 p, $10, 0;
+// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite
+// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" 
+// CHECK-SAME: %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}} 
+  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result, 
+      #nvvm.shape<m = 64, n = 8, k = 32>, 
+      D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>],
+      A [<s8>, #nvvm.wgmma_scale_in<one>, <row>], 
+      B [<s8>, #nvvm.wgmma_scale_in<one>, <col>]
+      : !mat16i32 -> !mat16i32
+  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1, 
+      #nvvm.shape<m = 64, n = 8, k = 32>, 
+      D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>],
+      A [<s8>, #nvvm.wgmma_scale_in<one>, <row>], 
+      B [<s8>, #nvvm.wgmma_scale_in<one>, <col>]
+      : !mat16i32 -> !mat16i32
+  %result3 = nvvm.wgmma.mma_async %descA, %descB, %result2, 
+      #nvvm.shape<m = 64, n = 8, k = 32>, 
+      D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>],
+      A [<s8>, #nvvm.wgmma_scale_in<one>, <row>], 
+      B [<s8>, #nvvm.wgmma_scale_in<one>, <col>]
+      : !mat16i32 -> !mat16i32
+  return %result3 : !mat16i32
+}
+
+// CHECK-LABEL: @wgmma_s32_u8_u8(
+  // CHECK-SAME: %[[ARG0:.+]]: i64, %[[ARG1:.+]]: i64
+func.func @wgmma_s32_u8_u8(%descA : i64, %descB : i64) -> !mat16i32 {  
+// CHECK: %[[RES:.*]] = llvm.mlir.undef : !llvm.struct
+// CHECK: %[[A1:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0]
+// CHECK: %[[V1:.*]] = llvm.extractvalue %[[RES]][1]
+// CHECK: %[[V2:.*]] = llvm.extractvalue %[[RES]][2]
+// CHECK: %[[V3:.*]] = llvm.extractvalue %[[RES]][3]
+// CHECK: %[[RES_2:.*]] =  llvm.inline_asm has_side_effects asm_dialect = att 
+// CHECK-SAME: "{
+// CHECK-SAME: .reg .pred p;
+// CHECK-SAME: setp.ne.b32 p, $10, 0;
+// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;
+// CHECK-SAME: }\0A",
+// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0]], %[[V1]], %[[V2]], %[[V3]], %[[ARG0]], %[[ARG1]], %[[A1]] : 
+// CHECK-SAME:(i32, i32, i32, i32, i64, i64, i32) -> !llvm.struct<(i32, i32, i32, i32)>
+// CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES_2]][0]
+// CHECK: %[[V1_2:.*]] = llvm.extractvalue %[[RES_2]][1]
+// CHECK: %[[V2_2:.*]] = llvm.extractvalue %[[RES_2]][2]
+// CHECK: %[[V3_2:.*]] = llvm.extractvalue %[[RES_2]][3]
+// CHECK: %[[RES_3:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 
+// CHECK-SAME:"{
+// CHECK-SAME: .reg .pred p;
+// CHECK-SAME: setp.ne.b32 p, $10, 0;
+// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;
+// CHECK-SAME: }\0A",
+// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_2]], %[[V1_2]], %[[V2_2]], %[[V3_2]], %[[ARG0]], %[[ARG1]], %{{.*}}
+// CHECK: %[[V0_3:.*]] = llvm.extractvalue %[[RES_3]][0]
+// CHECK: %[[V1_3:.*]] = llvm.extractvalue %[[RES_3]][1]
+// CHECK: %[[V2_3:.*]] = llvm.extractvalue %[[RES_3]][2]
+// CHECK: %[[V3_3:.*]] = llvm.extractvalue %[[RES_3]][3]
+// CHECK: %[[RES1:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 
+// CHECK-SAME:"{
+// CHECK-SAME: .reg .pred p;
+// CHECK-SAME: setp.ne.b32 p, $10, 0;
+// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;
+// CHECK-SAME:}\0A", 
+// CHECK-SAME:"=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}} 
+  %result = llvm.mlir.undef : !mat16i32
+  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result,
+      #nvvm.shape<m = 64, n = 8, k = 32>, 
+      D [<s32>, #nvvm.wgmma_scale_out<one>],
+      A [<u8>, #nvvm.wgmma_scale_in<one>, <row>], 
+      B [<u8>, #nvvm.wgmma_scale_in<one>, <col>]
+      : !mat16i32 -> !mat16i32
+  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
+      #nvvm.shape<m = 64, n = 8, k = 32>, 
+      D [<s32>, #nvvm.wgmma_scale_out<one>],
+      A [<u8>, #nvvm.wgmma_scale_in<one>, <row>], 
+      B [<u8>, #nvvm.wgmma_scale_in<one>, <col>]
+      : !mat16i32 -> !mat16i32
+  %result3 = nvvm.wgmma.mma_async %descA, %descB, %result2,
+      #nvvm.shape<m = 64, n = 8, k = 32>, 
+      D [<s32>, #nvvm.wgmma_scale_out<one>],
+      A [<u8>, #nvvm.wgmma_scale_in<one>, <row>], 
+      B [<u8>, #nvvm.wgmma_scale_in<one>, <col>]
+      : !mat16i32 -> !mat16i32
+  return %result3 : !mat16i32
+}
+
+// -----
+
+!mat32f32 = !llvm.struct<(
+  f32, f32, f32, f32, f32, f32, f32, f32, 
+  f32, f32, f32, f32, f32, f32, f32, f32, 
+  f32, f32, f32, f32, f32, f32, f32, f32, 
+  f32, f32, f32, f32, f32, f32, f32, f32)>
+
+// CHECK-LABEL: @wgmma_f32_tf32_tf32
+func.func @wgmma_f32_tf32_tf32(%descA : i64, %descB : i64) -> !mat32f32 {  
+  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME:"{
+  // CHECK-SAME: .reg .pred p;
+  // CHECK-SAME: setp.ne.b32 p, $66, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
+  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{
+  // CHECK-SAME: .reg .pred p;
+  // CHECK-SAME: setp.ne.b32 p, $66, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
+  %result = llvm.mlir.undef : !mat32f32
+  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result,
+      #nvvm.shape<m = 64, n = 64, k = 8>, 
+      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
+      A [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
+      B [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
+       : !mat32f32 -> !mat32f32
+  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
+      #nvvm.shape<m = 64, n = 64, k = 8>, 
+      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
+      A [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
+      B [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
+      : !mat32f32 -> !mat32f32
+  return %result2 : !mat32f32
+}
+
+
+// -----
+
+!mat32f32 = !llvm.struct<(
+  f32, f32, f32, f32, f32, f32, f32, f32, 
+  f32, f32, f32, f32, f32, f32, f32, f32, 
+  f32, f32, f32, f32, f32, f32, f32, f32, 
+  f32, f32, f32, f32, f32, f32, f32, f32)>
+
+// CHECK-LABEL: @wgmma_f32_e4m3_e4m3
+func.func @wgmma_f32_e4m3_e4m3(%descA : i64, %descB : i64) -> !mat32f32 {  
+  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
+  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
+  %result = llvm.mlir.undef : !mat32f32
+  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result,
+      #nvvm.shape<m = 64, n = 64, k = 32>, 
+      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
+      A [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
+      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
+       : !mat32f32 -> !mat32f32
+  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
+      #nvvm.shape<m = 64, n = 64, k = 32>, 
+      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
+      A [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
+      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
+      : !mat32f32 -> !mat32f32
+  return %result2 : !mat32f32
+}
+
+// -----
+
+!mat32f32 = !llvm.struct<(
+  f32, f32, f32, f32, f32, f32, f32, f32, 
+  f32, f32, f32, f32, f32, f32, f32, f32, 
+  f32, f32, f32, f32, f32, f32, f32, f32, 
+  f32, f32, f32, f32, f32, f32, f32, f32)>
+
+// CHECK-LABEL: @wgmma_f32_e5m2_e4m3
+func.func @wgmma_f32_e5m2_e4m3(%descA : i64, %descB : i64) -> !mat32f32 {  
+  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
+  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
+  %result = llvm.mlir.undef : !mat32f32
+  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result,
+      #nvvm.shape<m = 64, n = 64, k = 32>, 
+      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
+      A [#nvvm.wgmma_type<e5m2>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
+      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
+       : !mat32f32 -> !mat32f32
+  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
+      #nvvm.shape<m = 64, n = 64, k = 32>, 
+      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
+      A [#nvvm.wgmma_type<e5m2>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
+      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
+      : !mat32f32 -> !mat32f32
+  return %result2 : !mat32f32
+}
+
+// -----
+
+func.func @elect_one_leader_sync() {  
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "{
+  // CHECK-SAME: .reg .u32 rx;
+  // CHECK-SAME: .reg .pred px;
+  // CHECK-SAME: mov.pred $0, 0;
+  // CHECK-SAME: elect.sync rx | px, 0xFFFFFFFF;
+  // CHECK-SAME: @px mov.pred $0, 1;
+  // CHECK-SAME: "=b"  : () -> i1
+  %cnd = nvvm.elect.sync -> i1 
+  return 
+}
+
+// -----
+
+// CHECK-LABEL: @stmatrix(
+// CHECK-SAME: %[[arg0:[a-zA-Z0-9_]+]]: !llvm.ptr<3>, 
+// CHECK-SAME: %[[arg1:[a-zA-Z0-9_]+]]: i32,
+// CHECK-SAME: %[[arg2:[a-zA-Z0-9_]+]]: i32,
+// CHECK-SAME: %[[arg3:[a-zA-Z0-9_]+]]: i32,
+// CHECK-SAME: %[[arg4:[a-zA-Z0-9_]+]]: i32)
+llvm.func @stmatrix(%arg0 : !llvm.ptr<3>, %m1 : i32, %m2 : i32, %m3 : i32, %m4 : i32) {
+// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x1.m8n8.shared.b16 [$0], {$1};", "r,r" %[[arg0]], %[[arg1]] : (!llvm.ptr<3>, i32) -> ()
+// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x2.m8n8.shared.b16 [$0], {$1, $2};", "r,r,r" %[[arg0]], %[[arg1]], %[[arg2]] : (!llvm.ptr<3>, i32, i32) -> ()
+// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x4.m8n8.shared.b16 [$0], {$1, $2, $3, $4};", "r,r,r,r,r" %[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]], %[[arg4]] : (!llvm.ptr<3>, i32, i32, i32, i32) -> ()
+// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x1.trans.m8n8.shared.b16 [$0], {$1};", "r,r" %[[arg0]], %[[arg1]] : (!llvm.ptr<3>, i32) -> ()
+// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x2.trans.m8n8.shared.b16 [$0], {$1, $2};", "r,r,r" %[[arg0]], %[[arg1]], %[[arg2]] : (!llvm.ptr<3>, i32, i32) -> ()
+// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x4.trans.m8n8.shared.b16 [$0], {$1, $2, $3, $4};", "r,r,r,r,r" %[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]], %[[arg4]] : (!llvm.ptr<3>, i32, i32, i32, i32) -> ()
+  nvvm.stmatrix %arg0, %m1 {layout = #nvvm.mma_layout<row>} : !llvm.ptr<3>, i32
+  nvvm.stmatrix %arg0, %m1, %m2 {layout = #nvvm.mma_layout<row>} : !llvm.ptr<3>, i32, i32
+  nvvm.stmatrix %arg0, %m1, %m2, %m3, %m4 {layout = #nvvm.mma_layout<row>} : !llvm.ptr<3>, i32, i32, i32, i32
+  nvvm.stmatrix %arg0, %m1 {layout = #nvvm.mma_layout<col>} : !llvm.ptr<3>, i32
+  nvvm.stmatrix %arg0, %m1, %m2 {layout = #nvvm.mma_layout<col>} : !llvm.ptr<3>, i32, i32
+  nvvm.stmatrix %arg0, %m1, %m2, %m3, %m4 {layout = #nvvm.mma_layout<col>} : !llvm.ptr<3>, i32, i32, i32, i32
+  llvm.return 
+}
+
+// -----
+
+// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
+llvm.func @init_mbarrier_arrive_expect_tx(%desc : !llvm.ptr, %pred : i1) {
+  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "prefetch.tensormap [$0];", "l"
+  nvvm.prefetch.tensormap %desc : !llvm.ptr
+  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$1 prefetch.tensormap [$0];", "l,b"
+  nvvm.prefetch.tensormap %desc, predicate = %pred : !llvm.ptr, i1
+  llvm.return
+}
+
+// -----
+
+func.func @set_max_register() {
+  // CHECK: nvvm.setmaxregister increase 232
+  nvvm.setmaxregister increase 232
+
+  // CHECK: nvvm.setmaxregister decrease 40
+  nvvm.setmaxregister decrease 40
+  func.return
+}
+
+// -----
+
+func.func @cp_async_bulk_commit() {
+  // CHECK: nvvm.cp.async.bulk.commit.group
+  nvvm.cp.async.bulk.commit.group
+  func.return
+}
+
+// -----
+
+func.func @cp_async_bulk_wait_group() {
+  // CHECK: nvvm.cp.async.bulk.wait_group 1
+  // CHECK: nvvm.cp.async.bulk.wait_group 0
+  // CHECK: nvvm.cp.async.bulk.wait_group 5 {read}
+  // CHECK: nvvm.cp.async.bulk.wait_group 0 {read}
+  nvvm.cp.async.bulk.wait_group 1
+  nvvm.cp.async.bulk.wait_group 0
+  nvvm.cp.async.bulk.wait_group 5 {read}
+  nvvm.cp.async.bulk.wait_group 0 {read}
+  func.return
+}
+
+// -----
+
+func.func @fence_mbarrier_init() {
+  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.mbarrier_init.release.cluster;"
+  nvvm.fence.mbarrier.init
+  func.return 
+}
+// -----
+
+func.func @fence_proxy() {
+  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.alias;", ""  : () -> ()
+  nvvm.fence.proxy { kind = #nvvm.proxy_kind<alias>}
+  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async;", ""  : () -> ()
+  nvvm.fence.proxy { kind = #nvvm.proxy_kind<async>}
+  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.global;", ""  : () -> ()
+  nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.global>}
+  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.shared::cta;", ""  : () -> ()
+  nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>}
+  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.shared::cluster;", ""  : () -> ()
+  nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cluster>}
+  func.return
+}
+
+// -----
+
+// CHECK-LABEL: @llvm_nvvm_barrier_arrive
+// CHECK-SAME: (%[[barId:.*]]: i32, %[[numberOfThreads:.*]]: i32)
+llvm.func @llvm_nvvm_barrier_arrive(%barID : i32, %numberOfThreads : i32) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "bar.arrive 0, $0;", "r" %[[numberOfThreads]] : (i32) -> ()
+  nvvm.barrier.arrive number_of_threads = %numberOfThreads
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "bar.arrive $0, $1;", "r,r" %[[barId]], %[[numberOfThreads]] : (i32, i32) -> ()
+  nvvm.barrier.arrive id = %barID number_of_threads = %numberOfThreads
+  llvm.return
 }
-
-// <operation> ::= `llvm.nvvm.vote.ballot.sync %mask, %pred` : result_type
-ParseResult VoteBallotOp::parse(OpAsmParser &parser, OperationState &result) {
-  MLIRContext *context = parser.getContext();
-  auto int32Ty = IntegerType::get(context, 32);
-  auto int1Ty = IntegerType::get(context, 1);
-
-  SmallVector<OpAsmParser::UnresolvedOperand, 8> ops;
-  Type type;
-  return failure(parser.parseOperandList(ops) ||
-                 parser.parseOptionalAttrDict(result.attributes) ||
-                 parser.parseColonType(type) ||
-                 parser.addTypeToList(type, result.types) ||
-                 parser.resolveOperands(ops, {int32Ty, int1Ty},
-                                        parser.getNameLoc(), result.operands));
-}
-
-void VoteBallotOp::print(OpAsmPrinter &p) { printNVVMIntrinsicOp(p, *this); }
-
-LogicalResult CpAsyncBulkTensorGlobalToSharedClusterOp::verify() {
-  if (getCoordinates().empty() || getCoordinates().size() > 5)
-    return emitError("expects coordinates between 1 to 5 dimension");
-
-  // Check for im2col mode
-  if (!getIm2colOffsets().empty()) {
-    if (getCoordinates().size() < 3)
-      return emitError(
-          "to use im2col mode, the tensor has to be at least 3-dimensional");
-    if (getCoordinates().size() != (getIm2colOffsets().size() + 2))
-      return emitError(
-          "im2col offsets must be 2 less than number of coordinates");
-  }
-  return success();
-}
-
-LogicalResult CpAsyncBulkTensorSharedCTAToGlobalOp::verify() {
-  if (getCoordinates().size() > 5)
-    return emitError("Maximum 5 coordinates and dimension is supported.");
-  return success();
-}
-
-LogicalResult CpAsyncOp::verify() {
-  if (getModifier() != LoadCacheModifierKind::CG &&
-      getModifier() != LoadCacheModifierKind::CA)
-    return emitError("Only CG and CA cache modifiers are supported.");
-  if (getSize() != 4 && getSize() != 8 && getSize() != 16)
-    return emitError("expected byte size to be either 4, 8 or 16.");
-  if (getModifier() == LoadCacheModifierKind::CG && getSize() != 16)
-    return emitError("CG cache modifier is only support for 16 bytes copy.");
-  return success();
-}
-
-// Given the element type of an operand and whether or not it is an accumulator,
-// this function returns the PTX type (`NVVM::MMATypes`) that corresponds to the
-// operand's element type.
-std::optional<mlir::NVVM::MMATypes>
-MmaOp::inferOperandMMAType(Type operandElType, bool isAccumulator) {
-  auto half2Type =
-      LLVM::getFixedVectorType(Float16Type::get(operandElType.getContext()), 2);
-  if (operandElType.isF64())
-    return NVVM::MMATypes::f64;
-  if (operandElType.isF16() || operandElType == half2Type)
-    return NVVM::MMATypes::f16;
-  if (operandElType.isF32() && isAccumulator)
-    return NVVM::MMATypes::f32;
-  if (operandElType.isF32() && !isAccumulator)
-    return NVVM::MMATypes::tf32;
-  if (llvm::isa<IntegerType>(operandElType)) {
-    if (isAccumulator)
-      return NVVM::MMATypes::s32;
-    return std::nullopt;
-  }
-
-  if (auto structType = llvm::dyn_cast<LLVM::LLVMStructType>(operandElType)) {
-    if (structType.getBody().empty())
-      return std::nullopt;
-    return inferOperandMMAType(structType.getBody()[0], isAccumulator);
-  }
-
-  return std::nullopt;
-}
-
-static bool isInt4PtxType(MMATypes type) {
-  return (type == MMATypes::u4 || type == MMATypes::s4);
-}
-
-static bool isInt8PtxType(MMATypes type) {
-  return (type == MMATypes::u8 || type == MMATypes::s8);
-}
-
-static bool isIntegerPtxType(MMATypes type) {
-  return isInt4PtxType(type) || isInt8PtxType(type) || type == MMATypes::b1 ||
-         type == MMATypes::s32;
-}
-
-MMATypes MmaOp::accumPtxType() {
-  std::optional<mlir::NVVM::MMATypes> val = inferOperandMMAType(
-      getODSOperands(2).getTypes().front(), /*isAccum=*/true);
-  assert(val.has_value() && "accumulator PTX type should always be inferrable");
-  return val.value();
-}
-
-MMATypes MmaOp::resultPtxType() {
-  std::optional<mlir::NVVM::MMATypes> val =
-      inferOperandMMAType(getResult().getType(), /*isAccum=*/true);
-  assert(val.has_value() && "result PTX type should always be inferrable");
-  return val.value();
-}
-
-void MmaOp::print(OpAsmPrinter &p) {
-  SmallVector<Type, 4> regTypes;
-  struct OperandFragment {
-    StringRef operandName;
-    StringRef ptxTypeAttr;
-    SmallVector<Value, 4> regs;
-    explicit OperandFragment(StringRef name, StringRef ptxTypeName)
-        : operandName(name), ptxTypeAttr(ptxTypeName) {}
-  };
-
-  std::array<OperandFragment, 3> frags{
-      OperandFragment("A", getMultiplicandAPtxTypeAttrName()),
-      OperandFragment("B", getMultiplicandBPtxTypeAttrName()),
-      OperandFragment("C", "")};
-  SmallVector<StringRef, 4> ignoreAttrNames{
-      mlir::NVVM::MmaOp::getOperandSegmentSizeAttr()};
-
-  for (unsigned fragIdx = 0; fragIdx < frags.size(); fragIdx++) {
-    auto &frag = frags[fragIdx];
-    auto varOperandSpec = getODSOperandIndexAndLength(fragIdx);
-    for (auto operandIdx = varOperandSpec.first;
-         operandIdx < varOperandSpec.first + varOperandSpec.second;
-         operandIdx++) {
-      frag.regs.push_back(this->getOperand(operandIdx));
-      if (operandIdx == 0) {
-        regTypes.push_back(this->getOperand(operandIdx).getType());
-      }
-    }
-    std::optional<MMATypes> inferredType =
-        inferOperandMMAType(regTypes.back(), /*isAccum=*/fragIdx >= 2);
-    if (inferredType)
-      ignoreAttrNames.push_back(frag.ptxTypeAttr);
-  }
-
-  auto printMmaOperand = [&](const OperandFragment &frag) -> void {
-    p << " " << frag.operandName;
-    p << "[";
-    p.printOperands(frag.regs);
-    p << "] ";
-  };
-
-  for (const auto &frag : frags) {
-    printMmaOperand(frag);
-  }
-
-  p.printOptionalAttrDict(this->getOperation()->getAttrs(), ignoreAttrNames);
-
-  // Print the types of the operands and result.
-  p << " : " << "(";
-  llvm::interleaveComma(SmallVector<Type, 3>{frags[0].regs[0].getType(),
-                                             frags[1].regs[0].getType(),
-                                             frags[2].regs[0].getType()},
-                        p);
-  p << ")";
-  p.printArrowTypeList(TypeRange{this->getRes().getType()});
-}
-
-void MmaOp::build(OpBuilder &builder, OperationState &result, Type resultType,
-                  ValueRange operandA, ValueRange operandB, ValueRange operandC,
-                  ArrayRef<int64_t> shape, std::optional<MMAB1Op> b1Op,
-                  std::optional<MMAIntOverflow> intOverflow,
-                  std::optional<std::array<MMATypes, 2>> multiplicandPtxTypes,
-                  std::optional<std::array<MMALayout, 2>> multiplicandLayouts) {
-
-  assert(shape.size() == 3 && "expected shape to have size 3 (m, n, k)");
-  MLIRContext *ctx = builder.getContext();
-  result.addAttribute(
-      "shape", builder.getAttr<MMAShapeAttr>(shape[0], shape[1], shape[2]));
-
-  result.addOperands(operandA);
-  result.addOperands(operandB);
-  result.addOperands(operandC);
-
-  if (multiplicandPtxTypes) {
-    result.addAttribute("multiplicandAPtxType",
-                        MMATypesAttr::get(ctx, (*multiplicandPtxTypes)[0]));
-    result.addAttribute("multiplicandBPtxType",
-                        MMATypesAttr::get(ctx, (*multiplicandPtxTypes)[1]));
-  } else {
-    if (auto res = inferOperandMMAType(operandA[0].getType(), false))
-      result.addAttribute("multiplicandAPtxType", MMATypesAttr::get(ctx, *res));
-    if (auto res = inferOperandMMAType(operandB[0].getType(), false))
-      result.addAttribute("multiplicandBPtxType", MMATypesAttr::get(ctx, *res));
-  }
-
-  if (multiplicandLayouts) {
-    result.addAttribute("layoutA",
-                        MMALayoutAttr::get(ctx, (*multiplicandLayouts)[0]));
-    result.addAttribute("layoutB",
-                        MMALayoutAttr::get(ctx, (*multiplicandLayouts)[1]));
-  } else {
-    result.addAttribute("layoutA", MMALayoutAttr::get(ctx, MMALayout::row));
-    result.addAttribute("layoutB", MMALayoutAttr::get(ctx, MMALayout::col));
-  }
-
-  if (intOverflow.has_value())
-    result.addAttribute("intOverflowBehavior",
-                        MMAIntOverflowAttr::get(ctx, *intOverflow));
-  if (b1Op.has_value())
-    result.addAttribute("b1Op", MMAB1OpAttr::get(ctx, *b1Op));
-
-  result.addTypes(resultType);
-  result.addAttribute(
-      MmaOp::getOperandSegmentSizeAttr(),
-      builder.getDenseI32ArrayAttr({static_cast<int32_t>(operandA.size()),
-                                    static_cast<int32_t>(operandB.size()),
-                                    static_cast<int32_t>(operandC.size())}));
-}
-
-// <operation> :=
-//   A `[` $operandA `]` B `[` $operandB `]` C `[` $operandC `]`
-//   attr-dict : (type($operandA[0]), type($operandB[0]), type($operandC[0]))
-//     `->` type($res)
-ParseResult MmaOp::parse(OpAsmParser &parser, OperationState &result) {
-  struct OperandFragment {
-    std::optional<MMATypes> elemtype;
-    SmallVector<OpAsmParser::UnresolvedOperand, 4> regs;
-    SmallVector<Type> regTypes;
-  };
-
-  Builder &builder = parser.getBuilder();
-  std::array<OperandFragment, 4> frags;
-
-  NamedAttrList namedAttributes;
-
-  // A helper to parse the operand segments.
-  auto parseMmaOperand = [&](StringRef operandName,
-                             OperandFragment &frag) -> LogicalResult {
-    if (parser.parseKeyword(operandName).failed())
-      return failure();
-    if (parser
-            .parseOperandList(frag.regs, OpAsmParser::Delimiter::OptionalSquare)
-            .failed())
-      return failure();
-    return success();
-  };
-
-  // Parse the operand segments.
-  if (parseMmaOperand("A", frags[0]).failed())
-    return failure();
-  if (parseMmaOperand("B", frags[1]).failed())
-    return failure();
-  if (parseMmaOperand("C", frags[2]).failed())
-    return failure();
-
-  if (parser.parseOptionalAttrDict(namedAttributes).failed())
-    return failure();
-
-  // Parse the type specification and resolve operands.
-  SmallVector<Type, 3> operandTypes;
-  if (failed(parser.parseColon()))
-    return failure();
-  if (failed(parser.parseLParen()))
-    return failure();
-  if (failed(parser.parseTypeList(operandTypes)))
-    return failure();
-  if (failed(parser.parseRParen()))
-    if (operandTypes.size() != 3)
-      return parser.emitError(
-          parser.getNameLoc(),
-          "expected one type for each operand segment but got " +
-              Twine(operandTypes.size()) + " types");
-  for (const auto &iter : llvm::enumerate(operandTypes)) {
-    auto &frag = frags[iter.index()];
-    frag.regTypes.resize(frag.regs.size(), iter.value());
-    if (failed(parser.resolveOperands(frag.regs, frag.regTypes,
-                                      parser.getNameLoc(), result.operands)))
-      return failure();
-    frag.elemtype =
-        inferOperandMMAType(frag.regTypes[0], /*isAccum=*/iter.index() < 2);
-  }
-
-  Type resultType;
-  if (parser.parseArrow() || parser.parseType(resultType))
-    return failure();
-  frags[3].elemtype = inferOperandMMAType(resultType, /*isAccum=*/true);
-
-  std::array<StringRef, 2> names{"multiplicandAPtxType",
-                                 "multiplicandBPtxType"};
-  for (unsigned idx = 0; idx < names.size(); idx++) {
-    const auto &frag = frags[idx];
-    std::optional<NamedAttribute> attr = namedAttributes.getNamed(names[idx]);
-    if (!frag.elemtype.has_value() && !attr.has_value()) {
-      return parser.emitError(
-          parser.getNameLoc(),
-          "attribute " + names[idx] +
-              " is not provided explicitly and cannot be inferred");
-    }
-    if (!attr.has_value())
-      result.addAttribute(
-          names[idx], MMATypesAttr::get(parser.getContext(), *frag.elemtype));
-  }
-
-  result.addTypes(resultType);
-  if (!namedAttributes.empty())
-    result.addAttributes(namedAttributes);
-  result.addAttribute(MmaOp::getOperandSegmentSizeAttr(),
-                      builder.getDenseI32ArrayAttr({
-                          static_cast<int32_t>(frags[0].regs.size()),
-                          static_cast<int32_t>(frags[1].regs.size()),
-                          static_cast<int32_t>(frags[2].regs.size()),
-                      }));
-  return success();
-}
-
-LogicalResult MmaOp::verify() {
-  MLIRContext *context = getContext();
-  auto f16Ty = Float16Type::get(context);
-  auto i32Ty = IntegerType::get(context, 32);
-  auto f16x2Ty = LLVM::getFixedVectorType(f16Ty, 2);
-  auto f32Ty = Float32Type::get(context);
-  auto f16x2x4StructTy = LLVM::LLVMStructType::getLiteral(
-      context, {f16x2Ty, f16x2Ty, f16x2Ty, f16x2Ty});
-
-  auto s32x4StructTy =
-      LLVM::LLVMStructType::getLiteral(context, {i32Ty, i32Ty, i32Ty, i32Ty});
-  auto f32x8StructTy =
-      LLVM::LLVMStructType::getLiteral(context, SmallVector<Type>(8, f32Ty));
-  auto f16x2x2StructTy =
-      LLVM::LLVMStructType::getLiteral(context, {f16x2Ty, f16x2Ty});
-  auto f32x4StructTy =
-      LLVM::LLVMStructType::getLiteral(context, {f32Ty, f32Ty, f32Ty, f32Ty});
-  auto s32x2StructTy =
-      LLVM::LLVMStructType::getLiteral(context, {i32Ty, i32Ty});
-
-  std::array<int64_t, 3> mmaShape{getShapeAttr().getM(), getShapeAttr().getN(),
-                                  getShapeAttr().getK()};
-
-  // These variables define the set of allowed data types for matrices A, B, C,
-  // and result.
-  using AllowedShapes = SmallVector<std::array<int64_t, 3>, 2>;
-  using AllowedTypes = SmallVector<SmallVector<Type, 4>, 2>;
-  AllowedShapes allowedShapes;
-  AllowedTypes expectedA;
-  AllowedTypes expectedB;
-  AllowedTypes expectedC;
-  SmallVector<Type> expectedResult;
-
-  // When M = 16, we just need to calculate the number of 8xk tiles, where
-  // k is a factor that depends on the data type.
-  if (mmaShape[0] == 16) {
-    int64_t kFactor;
-    Type multiplicandFragType;
-    switch (*getMultiplicandAPtxType()) {
-    case MMATypes::tf32:
-      kFactor = 4;
-      multiplicandFragType = i32Ty;
-      expectedResult.push_back(LLVM::LLVMStructType::getLiteral(
-          context, {f32Ty, f32Ty, f32Ty, f32Ty}));
-      break;
-    case MMATypes::f16:
-    case MMATypes::bf16:
-      kFactor = 8;
-      multiplicandFragType = f16x2Ty;
-      expectedResult.push_back(f16x2x2StructTy);
-      expectedResult.push_back(f32x4StructTy);
-      break;
-    case MMATypes::s4:
-    case MMATypes::u4:
-      kFactor = 32;
-      break;
-    case MMATypes::b1:
-      kFactor = 128;
-      break;
-    case MMATypes::s8:
-    case MMATypes::u8:
-      kFactor = 16;
-      break;
-    default:
-      return emitError("invalid shape or multiplicand type: " +
-                       stringifyEnum(getMultiplicandAPtxType().value()));
-    }
-
-    if (isIntegerPtxType(getMultiplicandAPtxType().value())) {
-      expectedResult.push_back(s32x4StructTy);
-      expectedC.emplace_back(4, i32Ty);
-      multiplicandFragType = i32Ty;
-    } else {
-      expectedC.emplace_back(2, f16x2Ty);
-      expectedC.emplace_back(4, f32Ty);
-    }
-
-    int64_t unitA = (mmaShape[0] / 8) * (mmaShape[2] / kFactor);
-    int64_t unitB = (mmaShape[1] / 8) * (mmaShape[2] / kFactor);
-    expectedA.emplace_back(unitA, multiplicandFragType);
-    expectedB.emplace_back(unitB, multiplicandFragType);
-    allowedShapes.push_back({16, 8, kFactor});
-    allowedShapes.push_back({16, 8, kFactor * 2});
-  }
-
-  // In the M=8 case, there is only 1 possible case per data type.
-  if (mmaShape[0] == 8) {
-    if (*getMultiplicandAPtxType() == MMATypes::f16) {
-      expectedA.emplace_back(2, f16x2Ty);
-      expectedB.emplace_back(2, f16x2Ty);
-      expectedResult.push_back(f16x2x4StructTy);
-      expectedResult.push_back(f32x8StructTy);
-      expectedC.emplace_back(4, f16x2Ty);
-      expectedC.emplace_back(8, f32Ty);
-      allowedShapes.push_back({8, 8, 4});
-    }
-    if (*getMultiplicandAPtxType() == MMATypes::f64) {
-      Type f64Ty = Float64Type::get(context);
-      expectedA.emplace_back(1, f64Ty);
-      expectedB.emplace_back(1, f64Ty);
-      expectedC.emplace_back(2, f64Ty);
-      // expectedC.emplace_back(1, LLVM::getFixedVectorType(f64Ty, 2));
-      expectedResult.emplace_back(LLVM::LLVMStructType::getLiteral(
-          context, SmallVector<Type>(2, f64Ty)));
-      allowedShapes.push_back({8, 8, 4});
-    }
-    if (isIntegerPtxType(getMultiplicandAPtxType().value())) {
-      expectedA.push_back({i32Ty});
-      expectedB.push_back({i32Ty});
-      expectedC.push_back({i32Ty, i32Ty});
-      expectedResult.push_back(s32x2StructTy);
-      if (isInt4PtxType(getMultiplicandAPtxType().value()))
-        allowedShapes.push_back({8, 8, 32});
-      if (isInt8PtxType(getMultiplicandAPtxType().value()))
-        allowedShapes.push_back({8, 8, 16});
-      if (getMultiplicandAPtxType().value() == MMATypes::b1)
-        allowedShapes.push_back({8, 8, 128});
-    }
-  }
-
-  std::string errorMessage;
-  llvm::raw_string_ostream errorStream(errorMessage);
-
-  // Check that we matched an existing shape/dtype combination.
-  if (expectedA.empty() || expectedB.empty() || expectedC.empty() ||
-      !llvm::is_contained(allowedShapes, mmaShape)) {
-    errorStream << "unimplemented variant for MMA shape <";
-    llvm::interleaveComma(mmaShape, errorStream);
-    errorStream << ">";
-    return emitOpError(errorMessage);
-  }
-
-  // Verify the operand types for segments of A, B, and C operands.
-  std::array<StringRef, 3> operandNames{"A", "B", "C"};
-  for (const auto &iter : llvm::enumerate(
-           SmallVector<AllowedTypes, 3>{expectedA, expectedB, expectedC})) {
-    auto spec = this->getODSOperandIndexAndLength(iter.index());
-    SmallVector<Type, 4> operandTySeg(operand_type_begin() + spec.first,
-                                      operand_type_begin() + spec.first +
-                                          spec.second);
-    bool match = llvm::is_contained(iter.value(), operandTySeg);
-
-    if (!match) {
-      errorStream << "Could not match types for the "
-                  << operandNames[iter.index()]
-                  << " operands; expected one of ";
-      for (const auto &x : iter.value()) {
-        errorStream << x.size() << "x" << x[0] << " ";
-      }
-      errorStream << "but got ";
-      llvm::interleaveComma(operandTySeg, errorStream);
-      return emitOpError(errorStream.str());
-    }
-  }
-
-  // Check the result type
-  if (!llvm::any_of(expectedResult, [&](Type expectedResultType) {
-        return expectedResultType == getResult().getType();
-      })) {
-    errorStream
-        << "Could not match allowed types for the result; expected one of ";
-    llvm::interleaveComma(expectedResult, errorStream);
-    errorStream << " but got " << getResult().getType();
-    return emitOpError(errorStream.str());
-  }
-
-  // Ensure that binary MMA variants have a b1 MMA operation defined.
-  if (getMultiplicandAPtxType() == MMATypes::b1 && !getB1Op()) {
-    return emitOpError("op requires " + getB1OpAttrName().strref() +
-                       " attribute");
-  }
-
-  // Ensure int4/int8 MMA variants specify the accum overflow behavior
-  // attribute.
-  if (isInt4PtxType(*getMultiplicandAPtxType()) ||
-      isInt8PtxType(*getMultiplicandAPtxType())) {
-    if (!getIntOverflowBehavior())
-      return emitOpError("op requires " +
-                         getIntOverflowBehaviorAttrName().strref() +
-                         " attribute");
-  }
-
-  return success();
-}
-
-LogicalResult ShflOp::verify() {
-  if (!(*this)->getAttrOfType<UnitAttr>("return_value_and_is_valid"))
-    return success();
-  auto type = llvm::dyn_cast<LLVM::LLVMStructType>(getType());
-  auto elementType = (type && type.getBody().size() == 2)
-                         ? llvm::dyn_cast<IntegerType>(type.getBody()[1])
-                         : nullptr;
-  if (!elementType || elementType.getWidth() != 1)
-    return emitError("expected return type to be a two-element struct with "
-                     "i1 as the second element");
-  return success();
-}
-
-std::pair<mlir::Type, unsigned> NVVM::inferMMAType(NVVM::MMATypes type,
-                                                   NVVM::MMAFrag frag, int nRow,
-                                                   int nCol,
-                                                   MLIRContext *context) {
-  unsigned numberElements = 0;
-  Type elementType;
-  OpBuilder builder(context);
-  Type f16x2 = VectorType::get(2, builder.getF16Type());
-  if (type == NVVM::MMATypes::f16) {
-    elementType = f16x2;
-    if (frag == NVVM::MMAFrag::a || frag == NVVM::MMAFrag::b)
-      numberElements = 8;
-    else
-      numberElements = 4;
-  } else if (type == NVVM::MMATypes::f32) {
-    elementType = builder.getF32Type();
-    numberElements = 8;
-  } else if (type == NVVM::MMATypes::tf32) {
-    elementType = builder.getI32Type();
-    numberElements = 4;
-  } else if (type == NVVM::MMATypes::s8 || type == NVVM::MMATypes::u8) {
-    elementType = builder.getI32Type();
-    int parallelSize = 0;
-    if (frag == NVVM::MMAFrag::a)
-      parallelSize = nRow;
-    if (frag == NVVM::MMAFrag::b)
-      parallelSize = nCol;
-
-    // m == 16 && n == 16 && k == 16
-    if (parallelSize == 16)
-      numberElements = 2;
-    // m == 8 && n == 32 && k == 16 or m == 32 && n == 8 && k == 16
-    else if (parallelSize == 8)
-      numberElements = 1;
-    else if (parallelSize == 32)
-      numberElements = 4;
-  } else if (type == NVVM::MMATypes::s32) {
-    elementType = builder.getI32Type();
-    numberElements = 8;
-  }
-  assert(numberElements != 0 && elementType != nullptr);
-  return std::make_pair(elementType, numberElements);
-}
-
-static std::pair<mlir::Type, unsigned>
-inferMMATypeFromMNK(NVVM::MMATypes type, NVVM::MMAFrag frag, int m, int n,
-                    int k, MLIRContext *context) {
-  int nRow, nCol;
-  if (frag == NVVM::MMAFrag::a) {
-    nRow = m;
-    nCol = k;
-  } else if (frag == NVVM::MMAFrag::b) {
-    nRow = k;
-    nCol = n;
-  } else {
-    nRow = m;
-    nCol = n;
-  }
-  assert(nRow && nCol);
-  return inferMMAType(type, frag, nRow, nCol, context);
-}
-
-LogicalResult NVVM::WMMALoadOp::verify() {
-  unsigned addressSpace =
-      llvm::cast<LLVM::LLVMPointerType>(getPtr().getType()).getAddressSpace();
-  if (addressSpace != 0 && addressSpace != NVVM::kGlobalMemorySpace &&
-      addressSpace != NVVM::kSharedMemorySpace)
-    return emitOpError("expected source pointer in memory "
-                       "space 0, 1, 3");
-
-  if (NVVM::WMMALoadOp::getIntrinsicID(getM(), getN(), getK(), getLayout(),
-                                       getEltype(), getFrag()) == 0)
-    return emitOpError() << "invalid attribute combination";
-  std::pair<Type, unsigned> typeInfo = inferMMATypeFromMNK(
-      getEltype(), getFrag(), getM(), getN(), getK(), getContext());
-  Type dstType = LLVM::LLVMStructType::getLiteral(
-      getContext(), SmallVector<Type, 8>(typeInfo.second, typeInfo.first));
-  if (getType() != dstType)
-    return emitOpError("expected destination type is a structure of ")
-           << typeInfo.second << " elements of type " << typeInfo.first;
-  return success();
-}
-
-LogicalResult NVVM::WMMAStoreOp::verify() {
-  unsigned addressSpace =
-      llvm::cast<LLVM::LLVMPointerType>(getPtr().getType()).getAddressSpace();
-  if (addressSpace != 0 && addressSpace != NVVM::kGlobalMemorySpace &&
-      addressSpace != NVVM::kSharedMemorySpace)
-    return emitOpError("expected operands to be a source pointer in memory "
-                       "space 0, 1, 3");
-
-  if (NVVM::WMMAStoreOp::getIntrinsicID(getM(), getN(), getK(), getLayout(),
-                                        getEltype()) == 0)
-    return emitOpError() << "invalid attribute combination";
-  std::pair<Type, unsigned> typeInfo = inferMMATypeFromMNK(
-      getEltype(), NVVM::MMAFrag::c, getM(), getN(), getK(), getContext());
-  if (getArgs().size() != typeInfo.second)
-    return emitOpError() << "expected " << typeInfo.second << " data operands";
-  if (llvm::any_of(getArgs(), [&typeInfo](Value operands) {
-        return operands.getType() != typeInfo.first;
-      }))
-    return emitOpError() << "expected data operands of type " << typeInfo.first;
-  return success();
-}
-
-LogicalResult NVVM::WMMAMmaOp::verify() {
-  if (NVVM::WMMAMmaOp::getIntrinsicID(getM(), getN(), getK(), getLayoutA(),
-                                      getLayoutB(), getEltypeA(),
-                                      getEltypeB()) == 0)
-    return emitOpError() << "invalid attribute combination";
-  std::pair<Type, unsigned> typeInfoA = inferMMATypeFromMNK(
-      getEltypeA(), NVVM::MMAFrag::a, getM(), getN(), getK(), getContext());
-  std::pair<Type, unsigned> typeInfoB = inferMMATypeFromMNK(
-      getEltypeA(), NVVM::MMAFrag::b, getM(), getN(), getK(), getContext());
-  std::pair<Type, unsigned> typeInfoC = inferMMATypeFromMNK(
-      getEltypeB(), NVVM::MMAFrag::c, getM(), getN(), getK(), getContext());
-  SmallVector<Type, 32> arguments;
-  arguments.append(typeInfoA.second, typeInfoA.first);
-  arguments.append(typeInfoB.second, typeInfoB.first);
-  arguments.append(typeInfoC.second, typeInfoC.first);
-  unsigned numArgs = arguments.size();
-  if (getArgs().size() != numArgs)
-    return emitOpError() << "expected " << numArgs << " arguments";
-  for (unsigned i = 0; i < numArgs; i++) {
-    if (getArgs()[i].getType() != arguments[i])
-      return emitOpError() << "expected argument " << i << " to be of type "
-                           << arguments[i];
-  }
-  Type dstType = LLVM::LLVMStructType::getLiteral(
-      getContext(), SmallVector<Type, 8>(typeInfoC.second, typeInfoC.first));
-  if (getType() != dstType)
-    return emitOpError("expected destination type is a structure of ")
-           << typeInfoC.second << " elements of type " << typeInfoC.first;
-  return success();
-}
-
-LogicalResult NVVM::LdMatrixOp::verify() {
-  unsigned addressSpace =
-      llvm::cast<LLVM::LLVMPointerType>(getPtr().getType()).getAddressSpace();
-  if (addressSpace != NVVM::kSharedMemorySpace)
-    return emitOpError("expected source pointer in memory space 3");
-
-  if (getNum() != 1 && getNum() != 2 && getNum() != 4)
-    return emitOpError("expected num attribute to be 1, 2 or 4");
-
-  Type i32 = IntegerType::get(getContext(), 32);
-  if (getNum() == 1 && getType() != i32)
-    return emitOpError("expected destination type is i32");
-  if (getNum() == 2 || getNum() == 4) {
-    Type dstType = LLVM::LLVMStructType::getLiteral(
-        getContext(), SmallVector<Type>(getNum(), i32));
-    if (getType() != dstType)
-      return emitOpError("expected destination type is a structure of ")
-             << getNum() << " elements of type i32";
-  }
-  return success();
-}
-
-LogicalResult NVVM::StMatrixOp::verify() {
-  unsigned addressSpace =
-      llvm::cast<LLVM::LLVMPointerType>(getPtr().getType()).getAddressSpace();
-  if (addressSpace != NVVM::kSharedMemorySpace)
-    return emitOpError("expected source pointer in memory space 3");
-
-  int numMatrix = getSources().size();
-  if (numMatrix != 1 && numMatrix != 2 && numMatrix != 4)
-    return emitOpError("expected num attribute to be 1, 2 or 4");
-
-  return success();
-}
-
-FailureOr<int> getAllowedSizeK(NVVM::WGMMATypes typeA) {
-  if (typeA == NVVM::WGMMATypes::tf32)
-    return 8;
-  if (typeA == NVVM::WGMMATypes::f16 || typeA == NVVM::WGMMATypes::bf16)
-    return 16;
-  if (typeA == NVVM::WGMMATypes::s8 || typeA == NVVM::WGMMATypes::u8)
-    return 32;
-  if (typeA == NVVM::WGMMATypes::e4m3 || typeA == NVVM::WGMMATypes::e5m2)
-    return 32;
-  if (typeA == NVVM::WGMMATypes::b1)
-    return 256;
-  return failure();
-}
-
-LogicalResult isAllowedWGMMADataType(NVVM::WGMMATypes typeD,
-                                     NVVM::WGMMATypes typeA,
-                                     NVVM::WGMMATypes typeB) {
-  switch (typeA) {
-  case NVVM::WGMMATypes::f16:
-    if ((typeD == NVVM::WGMMATypes::f32 || typeD == NVVM::WGMMATypes::f16) &&
-        typeB == NVVM::WGMMATypes::f16)
-      return success();
-    break;
-  case NVVM::WGMMATypes::tf32:
-    if (typeD == NVVM::WGMMATypes::f32 && typeB == NVVM::WGMMATypes::tf32)
-      return success();
-    break;
-  case NVVM::WGMMATypes::u8:
-  case NVVM::WGMMATypes::s8:
-    if (typeD == NVVM::WGMMATypes::s32 &&
-        (typeB == NVVM::WGMMATypes::u8 || typeB == NVVM::WGMMATypes::s8))
-      return success();
-    break;
-  case NVVM::WGMMATypes::b1:
-    if (typeD == NVVM::WGMMATypes::s32 && typeB == NVVM::WGMMATypes::b1)
-      return success();
-    break;
-  case NVVM::WGMMATypes::bf16:
-    if ((typeD == NVVM::WGMMATypes::f32 || typeD == NVVM::WGMMATypes::f16) &&
-        typeB == NVVM::WGMMATypes::bf16)
-      return success();
-    break;
-  case NVVM::WGMMATypes::e4m3:
-  case NVVM::WGMMATypes::e5m2:
-    if ((typeD == NVVM::WGMMATypes::f32 || typeD == NVVM::WGMMATypes::f16) &&
-        (typeB == NVVM::WGMMATypes::e5m2 || typeB == NVVM::WGMMATypes::e4m3))
-      return success();
-    break;
-  case WGMMATypes::f32:
-  case WGMMATypes::s32:
-    llvm_unreachable("unsupported input types");
-    break;
-  }
-  return failure();
-}
-
-LogicalResult isAllowedSizeN(int sizeN, NVVM::WGMMATypes typeA) {
-  SmallVector<int> allowedN = {8,   16,  24,  32,  40,  48,  56,  64,
-                               72,  80,  88,  96,  104, 112, 120, 128,
-                               136, 144, 152, 160, 168, 176, 184, 192,
-                               200, 208, 216, 224, 232, 240, 248, 256};
-  SmallVector<int> allowedNshort = {8,   16,  24,  32,  48,  64,
-                                    80,  96,  112, 128, 144, 160,
-                                    176, 192, 208, 224, 240, 256};
-  switch (typeA) {
-  case WGMMATypes::f16:
-  case WGMMATypes::tf32:
-  case WGMMATypes::bf16:
-  case WGMMATypes::e4m3:
-  case WGMMATypes::e5m2:
-    if (llvm::is_contained(allowedN, sizeN))
-      return success();
-    break;
-  case WGMMATypes::u8:
-  case WGMMATypes::s8:
-  case WGMMATypes::b1:
-    if (llvm::is_contained(allowedNshort, sizeN))
-      return success();
-    break;
-  case WGMMATypes::f32:
-  case WGMMATypes::s32:
-    llvm_unreachable("unsupported input types");
-    break;
-  }
-  return failure();
-}
-
-LogicalResult NVVM::WgmmaMmaAsyncOp::verify() {
-  Value outValue = getResults();
-  auto stype = dyn_cast<LLVM::LLVMStructType>(outValue.getType());
-  if (!stype)
-    return emitOpError() << "expected results to be struct";
-  int outputSize = stype.getBody().size();
-  WGMMATypes typeD = getTypeD();
-  WGMMATypes typeA = getTypeA();
-  WGMMATypes typeB = getTypeB();
-
-  for (Type t : stype.getBody()) {
-    if (t != stype.getBody().front())
-      return emitOpError()
-             << "all elements in struct must be same type but there is " << t;
-  }
-
-  if (typeD != WGMMATypes::f32 && typeD != WGMMATypes::f16 &&
-      typeD != WGMMATypes::s32) {
-    return emitOpError() << "does not support the given output type "
-                         << NVVM::stringifyWGMMATypes(typeD);
-  }
-  if (typeD == WGMMATypes::s32 &&
-      (getScaleA() == WGMMAScaleIn::neg || getScaleB() == WGMMAScaleIn::neg)) {
-    return emitOpError() << "has s32 output, scaleA and scaleB cannot be neg";
-  }
-
-  if (failed(isAllowedWGMMADataType(typeD, typeA, typeB))) {
-    return emitOpError() << NVVM::stringifyWGMMATypes(typeD)
-                         << " += " << NVVM::stringifyWGMMATypes(typeA) << " * "
-                         << NVVM::stringifyWGMMATypes(typeB)
-                         << ", it is not supported.";
-  }
-
-  // Check M
-  if (getShape().getM() != 64)
-    return emitOpError() << "shape 'm' must be 64";
-
-  // Check K
-  FailureOr<int> allowedK = getAllowedSizeK(typeA);
-  if (failed(allowedK) || allowedK.value() != getShape().getK())
-    return emitOpError() << "shape 'k' must be " << allowedK.value()
-                         << " for input type "
-                         << NVVM::stringifyWGMMATypes(typeA);
-
-  // Check N
-  if (failed(isAllowedSizeN(getShape().getN(), typeA))) {
-    return emitOpError() << "has input type "
-                         << NVVM::stringifyWGMMATypes(typeA) << " n is set to "
-                         << getShape().getN() << ", it is not supported.";
-  }
-
-  // Check transpose (only available for f16/bf16)
-  if ((typeA != WGMMATypes::f16 && typeA != WGMMATypes::bf16) &&
-      (getLayoutA() == mlir::NVVM::MMALayout::col ||
-       getLayoutB() == mlir::NVVM::MMALayout::row)) {
-    return emitOpError()
-           << "given layouts layout_a = " << stringifyMMALayout(getLayoutA())
-           << " and layout_b = " << stringifyMMALayout(getLayoutB())
-           << " for input types " << stringifyWGMMATypes(typeA) << " and "
-           << stringifyWGMMATypes(typeB)
-           << " requires transpose. However, this is only supported for: "
-           << stringifyMMATypes(MMATypes::f16) << " and "
-           << stringifyMMATypes(MMATypes::bf16);
-  }
-
-  // Check result registers
-  int expectedOutput = 0;
-  if (typeD == WGMMATypes::f32 || typeD == WGMMATypes::s32)
-    expectedOutput = getShape().getN() / 2;
-  if (typeD == WGMMATypes::f16)
-    expectedOutput = getShape().getN() / 4;
-  if (outputSize != expectedOutput) {
-    return emitOpError() << "results " << expectedOutput
-                         << ", however output struct has " << outputSize
-                         << " elements";
-  }
-  // Check satfinite (only available for s32 accumulator)
-  if (typeD != WGMMATypes::s32 &&
-      getSatfinite().value_or(NVVM::MMAIntOverflow::wrapped) ==
-          NVVM::MMAIntOverflow::satfinite) {
-    return emitOpError()
-           << " `satfinite` can be only used with s32 accumulator, however "
-              "the current accumulator is "
-           << NVVM::stringifyWGMMATypes(typeD);
-  }
-
-  return success();
-}
-
-std::string NVVM::WgmmaMmaAsyncOp::getPtx() {
-
-  int m = getShape().getM(), n = getShape().getN(), k = getShape().getK();
-  bool isF16 = getTypeA() == WGMMATypes::f16 || getTypeA() == WGMMATypes::bf16;
-
-  StringRef outputTypeName = stringifyWGMMATypes(getTypeD());
-
-  int expectedOutputRegisters = 0;
-  if (getTypeD() == WGMMATypes::f16)
-    expectedOutputRegisters = getShape().getN() / 4;
-  else
-    expectedOutputRegisters = getShape().getN() / 2;
-
-  std::string ptx;
-  llvm::raw_string_ostream ss(ptx);
-
-  ss << "{\n"
-        ".reg .pred p;\n"
-        "setp.ne.b32 p, $"
-     << ((expectedOutputRegisters * 2) + 2)
-     << ", 0;\n"
-        "wgmma.mma_async.sync.aligned.m"
-     << m << "n" << n << "k" << k << "." << outputTypeName << "."
-     << stringifyWGMMATypes(getTypeA()) << "."
-     << stringifyWGMMATypes(getTypeB());
-  if (getSatfinite().value_or(NVVM::MMAIntOverflow::wrapped) ==
-      NVVM::MMAIntOverflow::satfinite)
-    ss << ".satfinite";
-  ss << " {";
-  int regCnt = 0;
-  for (; regCnt < expectedOutputRegisters; ++regCnt) {
-    ss << "$" << regCnt;
-    if (regCnt != expectedOutputRegisters - 1)
-      ss << ", ";
-  }
-
-  ss << "},";
-  // Need to map read/write registers correctly.
-  regCnt = (regCnt * 2);
-  ss << " $" << (regCnt) << "," << " $" << (regCnt + 1) << "," << " p";
-  if (getTypeD() != WGMMATypes::s32) {
-    ss << ", $" << (regCnt + 3) << ",  $" << (regCnt + 4);
-  }
-  // Don't add transpose parameters unless needed.
-  if (isF16) {
-    ss << ", $" << (regCnt + 5) << ",  $" << (regCnt + 6);
-  }
-  ss << ";\n"
-     << "}\n";
-  ss.flush();
-  return ptx;
-}
-
-void NVVM::WgmmaMmaAsyncOp::getAsmValues(
-    RewriterBase &rewriter,
-    llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>>
-        &asmValues) {
-  bool isF16 = getTypeA() == WGMMATypes::f16 || getTypeA() == WGMMATypes::bf16;
-  if (getResults())
-    asmValues.push_back({getResults(), mlir::NVVM::PTXRegisterMod::Write});
-  if (getInouts())
-    asmValues.push_back({getInouts(), mlir::NVVM::PTXRegisterMod::ReadWrite});
-  asmValues.push_back({getDescriptorA(), mlir::NVVM::PTXRegisterMod::Read});
-  asmValues.push_back({getDescriptorB(), mlir::NVVM::PTXRegisterMod::Read});
-  asmValues.push_back({makeConstantI32(rewriter, static_cast<int>(getScaleD())),
-                       mlir::NVVM::PTXRegisterMod::Read});
-  if (getTypeD() != WGMMATypes::s32) {
-    asmValues.push_back(
-        {makeConstantI32(rewriter,
-                         getScaleA() == NVVM::WGMMAScaleIn::neg ? -1 : 1),
-         mlir::NVVM::PTXRegisterMod::Read});
-    asmValues.push_back(
-        {makeConstantI32(rewriter,
-                         getScaleB() == NVVM::WGMMAScaleIn::neg ? -1 : 1),
-         mlir::NVVM::PTXRegisterMod::Read});
-  }
-  if (isF16) {
-    asmValues.push_back(
-        {makeConstantI32(rewriter, static_cast<int>(getLayoutA())),
-         mlir::NVVM::PTXRegisterMod::Read});
-    asmValues.push_back(
-        {makeConstantI32(rewriter, 1 - static_cast<int>(getLayoutB())),
-         mlir::NVVM::PTXRegisterMod::Read});
-  }
-}
-LogicalResult NVVM::FenceProxyOp::verify() {
-  if (getKind() == NVVM::ProxyKind::async_shared && !getSpace().has_value()) {
-    return emitOpError() << "async_shared fence requires space attribute";
-  }
-  if (getKind() != NVVM::ProxyKind::async_shared && getSpace().has_value()) {
-    return emitOpError() << "only async_shared fence can have space attribute";
-  }
-  return success();
-}
-
-LogicalResult NVVM::SetMaxRegisterOp::verify() {
-  if (getRegCount() % 8)
-    return emitOpError("new register size must be multiple of 8");
-  if (getRegCount() < 24 || getRegCount() > 256)
-    return emitOpError("new register size must be in between 24 to 256");
-  return success();
-}
-
-LogicalResult NVVM::BarrierOp::verify() {
-  if (getNumberOfThreads() && !getBarrierId())
-    return emitOpError(
-        "barrier id is missing, it should be set between 0 to 15");
-  return success();
-}
-
-//===----------------------------------------------------------------------===//
-// NVVMDialect initialization, type parsing, and registration.
-//===----------------------------------------------------------------------===//
-
-// TODO: This should be the llvm.nvvm dialect once this is supported.
-void NVVMDialect::initialize() {
-  addOperations<
-#define GET_OP_LIST
-#include "mlir/Dialect/LLVMIR/NVVMOps.cpp.inc"
-      >();
-  addAttributes<
-#define GET_ATTRDEF_LIST
-#include "mlir/Dialect/LLVMIR/NVVMOpsAttributes.cpp.inc"
-      >();
-
-  // Support unknown operations because not all NVVM operations are
-  // registered.
-  allowUnknownOperations();
-  declarePromisedInterface<ConvertToLLVMPatternInterface, NVVMDialect>();
-  declarePromisedInterface<gpu::TargetAttrInterface, NVVMTargetAttr>();
-}
-
-LogicalResult NVVMDialect::verifyOperationAttribute(Operation *op,
-                                                    NamedAttribute attr) {
-  StringAttr attrName = attr.getName();
-  // Kernel function attribute should be attached to functions.
-  if (attrName == NVVMDialect::getKernelFuncAttrName()) {
-    if (!isa<LLVM::LLVMFuncOp>(op)) {
-      return op->emitError() << "'" << NVVMDialect::getKernelFuncAttrName()
-                             << "' attribute attached to unexpected op";
-    }
-  }
-  // If maxntid and reqntid exist, it must be an array with max 3 dim
-  if (attrName == NVVMDialect::getMaxntidAttrName() ||
-      attrName == NVVMDialect::getReqntidAttrName()) {
-    auto values = llvm::dyn_cast<DenseI32ArrayAttr>(attr.getValue());
-    if (!values || values.empty() || values.size() > 3)
-      return op->emitError()
-             << "'" << attrName
-             << "' attribute must be integer array with maximum 3 index";
-  }
-  // If minctasm and maxnreg exist, it must be an integer attribute
-  if (attrName == NVVMDialect::getMinctasmAttrName() ||
-      attrName == NVVMDialect::getMaxnregAttrName()) {
-    if (!llvm::dyn_cast<IntegerAttr>(attr.getValue()))
-      return op->emitError()
-             << "'" << attrName << "' attribute must be integer constant";
-  }
-
-  return success();
-}
-
-LogicalResult NVVMDialect::verifyRegionArgAttribute(Operation *op,
-                                                    unsigned regionIndex,
-                                                    unsigned argIndex,
-                                                    NamedAttribute argAttr) {
-  auto funcOp = dyn_cast<FunctionOpInterface>(op);
-  if (!funcOp)
-    return success();
-
-  bool isKernel = op->hasAttr(NVVMDialect::getKernelFuncAttrName());
-  StringAttr attrName = argAttr.getName();
-  if (attrName == NVVM::NVVMDialect::getGridConstantAttrName()) {
-    if (!isKernel) {
-      return op->emitError()
-             << "'" << attrName
-             << "' attribute must be present only on kernel arguments";
-    }
-    if (!isa<UnitAttr>(argAttr.getValue()))
-      return op->emitError() << "'" << attrName << "' must be a unit attribute";
-    if (!funcOp.getArgAttr(argIndex, LLVM::LLVMDialect::getByValAttrName())) {
-      return op->emitError()
-             << "'" << attrName
-             << "' attribute requires the argument to also have attribute '"
-             << LLVM::LLVMDialect::getByValAttrName() << "'";
-    }
-  }
-
-  return success();
-}
-
-//===----------------------------------------------------------------------===//
-// NVVM target attribute.
-//===----------------------------------------------------------------------===//
-LogicalResult
-NVVMTargetAttr::verify(function_ref<InFlightDiagnostic()> emitError,
-                       int optLevel, StringRef triple, StringRef chip,
-                       StringRef features, DictionaryAttr flags,
-                       ArrayAttr files) {
-  if (optLevel < 0 || optLevel > 3) {
-    emitError() << "The optimization level must be a number between 0 and 3.";
-    return failure();
-  }
-  if (triple.empty()) {
-    emitError() << "The target triple cannot be empty.";
-    return failure();
-  }
-  if (chip.empty()) {
-    emitError() << "The target chip cannot be empty.";
-    return failure();
-  }
-  if (files && !llvm::all_of(files, [](::mlir::Attribute attr) {
-        return attr && mlir::isa<StringAttr>(attr);
-      })) {
-    emitError() << "All the elements in the `link` array must be strings.";
-    return failure();
-  }
-  return success();
-}
-
-#define GET_OP_CLASSES
-#include "mlir/Dialect/LLVMIR/NVVMOps.cpp.inc"
-
-#define GET_ATTRDEF_CLASSES
-#include "mlir/Dialect/LLVMIR/NVVMOpsAttributes.cpp.inc"

>From 7c9053ca23bbf94e10c7174e8c8c4ae178e5c0e7 Mon Sep 17 00:00:00 2001
From: bangyu shen <94283495+shubaoyu2 at users.noreply.github.com>
Date: Wed, 3 Jul 2024 17:03:29 +0800
Subject: [PATCH 3/7] Update NVVMDialect.cpp

---
 mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 1834 ++++++++++++--------
 1 file changed, 1141 insertions(+), 693 deletions(-)

diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 375e2951a037c..036a9a15af838 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1,694 +1,1142 @@
-// RUN: mlir-opt --convert-nvvm-to-llvm --convert-arith-to-llvm --split-input-file %s | FileCheck %s
-
-// Same below, but using the `ConvertToLLVMPatternInterface` entry point
-// and the generic `convert-to-llvm` pass.
-// RUN: mlir-opt --convert-to-llvm --split-input-file %s | FileCheck %s
-
-// CHECK-LABEL: @init_mbarrier
-llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %count : i32, %pred : i1) {
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.shared.b64 [$0], $1;", "r,r,b" 
-  nvvm.mbarrier.init.shared %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1 
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.b64 [$0], $1;", "l,r,b" 
-  nvvm.mbarrier.init %barrier_gen, %count, predicate = %pred : !llvm.ptr, i32, i1
-  llvm.return
-}
-
-// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
-llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r"
-  nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount : !llvm.ptr<3>, i32
-  //CHECK:  llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r,b"
-  nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1 
-  llvm.return
-}
-
-// CHECK-LABEL: @init_mbarrier_arrive_expect_tx_generic
-llvm.func @init_mbarrier_arrive_expect_tx_generic(%barrier : !llvm.ptr, %txcount : i32, %pred : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r" 
-  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr, i32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r,b"
-  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr, i32, i1 
-  llvm.return
-}
-
-// CHECK-LABEL: @init_mbarrier_try_wait_shared
-llvm.func @init_mbarrier_try_wait_shared(%barrier : !llvm.ptr<3>, %ticks : i32, %phase : i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
-  // CHECK-SAME: "{
-  // CHECK-SAME: .reg .pred       P1;
-  // CHECK-SAME: LAB_WAIT: 
-  // CHECK-SAME: mbarrier.try_wait.parity.shared.b64 P1, [$0], $1, $2;
-  // CHECK-SAME: @P1 bra.uni DONE;
-  // CHECK-SAME: bra.uni     LAB_WAIT;
-  // CHECK-SAME: DONE:
-  // CHECK-SAME: }",
-  // CHECK-SAME: "r,r,r"
-   nvvm.mbarrier.try_wait.parity.shared %barrier, %phase, %ticks : !llvm.ptr<3>, i32, i32
-  llvm.return
-}
-
-// CHECK-LABEL: @init_mbarrier_try_wait
-llvm.func @init_mbarrier_try_wait(%barrier : !llvm.ptr, %ticks : i32, %phase : i32){
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att
-  // CHECK-SAME: "{
-  // CHECK-SAME: .reg .pred       P1;
-  // CHECK-SAME: LAB_WAIT: 
-  // CHECK-SAME: mbarrier.try_wait.parity.b64 P1, [$0], $1, $2;
-  // CHECK-SAME: @P1 bra.uni DONE;
-  // CHECK-SAME: bra.uni     LAB_WAIT;
-  // CHECK-SAME: DONE:
-  // CHECK-SAME: }",
-  // CHECK-SAME: "l,r,r"
-  nvvm.mbarrier.try_wait.parity %barrier, %phase, %ticks : !llvm.ptr, i32, i32
-  llvm.return
-}
-
-// CHECK-LABEL: @async_cp
-func.func @async_cp(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>) {
-  // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache =  ca : !llvm.ptr<3>, !llvm.ptr<1>
-  nvvm.cp.async.shared.global %dst, %src, 16, cache =  ca : !llvm.ptr<3>, !llvm.ptr<1>
-  // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache =  cg : !llvm.ptr<3>, !llvm.ptr<1>
-  nvvm.cp.async.shared.global %dst, %src, 16, cache =  cg : !llvm.ptr<3>, !llvm.ptr<1>
-  return
-}
-
-// CHECK-LABEL: @async_cp_zfill
-func.func @async_cp_zfill(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>, %cpSize: i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
-  // CHECK-SAME: "cp.async.cg.shared.global [$0], [$1], $2, $3;\0A", 
-  // CHECK-SAME: "r,l,n,r" %{{.*}}, %{{.*}}, %{{.*}} : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> ()
-  nvvm.cp.async.shared.global %dst, %src, 16, cache =  cg, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
-  // CHECK-SAME: "cp.async.ca.shared.global [$0], [$1], $2, $3;\0A", 
-  // CHECK-SAME: "r,l,n,r" %{{.*}}, %{{.*}}, %{{.*}} : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> ()
-  nvvm.cp.async.shared.global %dst, %src, 4, cache =  ca, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32
-  return
-}
-
-// CHECK-LABEL: @cp_async_mbarrier_arrive
-func.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.ptr) {
-  // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}}
-  nvvm.cp.async.mbarrier.arrive %bar_gen : !llvm.ptr
-  // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} {noinc = true}
-  nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr
-  // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}}
-  nvvm.cp.async.mbarrier.arrive.shared %bar_shared : !llvm.ptr<3>
-  // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} {noinc = true}
-  nvvm.cp.async.mbarrier.arrive.shared %bar_shared {noinc = true} : !llvm.ptr<3>
-  llvm.return
-}
-
-// CHECK-LABEL: @tma_load_3d_all
-func.func @tma_load_3d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4} ], [$5],{$6}, $7, $8;", "r,l,r,r,r,r,h,h,l"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr  
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4} ], [$5],{$6}, $7, $8;", "r,l,r,r,r,r,h,h,l,b"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr
-  return
-}
-
-// CHECK-LABEL: @tma_load_4d_all
-func.func @tma_load_4d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5} ], [$6],{$7,$8}, $9, $10;", "r,l,r,r,r,r,r,h,h,h,l"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0,%off1] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr  
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$11 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5} ], [$6],{$7,$8}, $9, $10;", "r,l,r,r,r,r,r,h,h,h,l,b"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0,%off1] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr
-  return
-}
-
-// CHECK-LABEL: @tma_load_5d_all
-func.func @tma_load_5d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %off0: i16, %off1: i16, %off2: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5,$6} ], [$7],{$8,$9,$10}, $11, $12;", "r,l,r,r,r,r,r,r,h,h,h,h,l"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] im2col[%off0,%off1,%off2] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr  
-  // CHECK: lvm.inline_asm has_side_effects asm_dialect = att "@$13 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5,$6} ], [$7],{$8,$9,$10}, $11, $12;", "r,l,r,r,r,r,r,r,h,h,h,h,l,b"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] im2col[%off0,%off1,%off2] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr
-  return
-}
-
-// CHECK-LABEL: @tma_load_1d
-func.func @tma_load_1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0] : !llvm.ptr<3>, !llvm.ptr
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r,b"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0] predicate=%p : !llvm.ptr<3>, !llvm.ptr
-  return
-}
-
-// CHECK-LABEL: @tma_load_2d
-func.func @tma_load_2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] : !llvm.ptr<3>, !llvm.ptr
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r,b"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] predicate=%p  : !llvm.ptr<3>, !llvm.ptr
-  return
-}
-
-// CHECK-LABEL: @tma_load_3d
-func.func @tma_load_3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2] : !llvm.ptr<3>, !llvm.ptr
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r,b"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2] predicate=%p  : !llvm.ptr<3>, !llvm.ptr
-  return
-}
-
-// CHECK-LABEL: @tma_load_4d
-func.func @tma_load_4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr<3>, !llvm.ptr
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r,b"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] predicate=%p  : !llvm.ptr<3>, !llvm.ptr
-  return
-}
-
-// CHECK-LABEL: @tma_load_5d
-func.func @tma_load_5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr<3>, !llvm.ptr
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r,b"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] predicate=%p  : !llvm.ptr<3>, !llvm.ptr
-  return
-}
-
-// CHECK-LABEL: @tma_load_multicast1d
-func.func @tma_load_multicast1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2} ], [$3], $4;", "r,l,r,r,h"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2} ], [$3], $4;", "r,l,r,r,h,b"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<3>, !llvm.ptr
-  return
-}
-
-// CHECK-LABEL: @tma_load_multicast2d
-func.func @tma_load_multicast2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3} ], [$4], $5;", "r,l,r,r,r,h"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3} ], [$4], $5;", "r,l,r,r,r,h,b"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1] multicast_mask = %multicastMask  predicate=%p  : !llvm.ptr<3>, !llvm.ptr
-  return
-}
-
-// CHECK-LABEL: @tma_load_multicast3d
-func.func @tma_load_multicast3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4} ], [$5], $6;", "r,l,r,r,r,r,h"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4} ], [$5], $6;", "r,l,r,r,r,r,h,b"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2] multicast_mask = %multicastMask  predicate=%p  : !llvm.ptr<3>, !llvm.ptr
-  return
-}
-
-// CHECK-LABEL: @tma_load_multicast4d
-func.func @tma_load_multicast4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5} ], [$6], $7;", "r,l,r,r,r,r,r,h"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2,%crd3] multicast_mask = %multicastMask: !llvm.ptr<3>, !llvm.ptr
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5} ], [$6], $7;", "r,l,r,r,r,r,r,h,b"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2,%crd3] multicast_mask = %multicastMask predicate=%p  : !llvm.ptr<3>, !llvm.ptr
-  return
-}
-
-// CHECK-LABEL: @tma_load_multicast5d
-func.func @tma_load_multicast5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5,$6} ], [$7], $8;", "r,l,r,r,r,r,r,r,h"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2,%crd3,%crd4] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5,$6} ], [$7], $8;", "r,l,r,r,r,r,r,r,h,b"
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2,%crd3,%crd4] multicast_mask = %multicastMask predicate=%p  : !llvm.ptr<3>, !llvm.ptr
-  return
-}
-
-// CHECK-LABEL: @tma_store_1d
-func.func @tma_store_1d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [$0, {$2} ], [$1];", "l,r,r"
-  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0] : !llvm.ptr, !llvm.ptr<3>, i32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$3 cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [$0, {$2} ], [$1];", "l,r,r,b"
-  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i1
-  return
-}
-
-// CHECK-LABEL: @tma_store_2d
-func.func @tma_store_2d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [$0, {$2, $3} ], [$1];", "l,r,r,r"
-  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1] : !llvm.ptr, !llvm.ptr<3>, i32, i32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [$0, {$2, $3} ], [$1];", "l,r,r,r,b"
-  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i1
-  return
-}
-
-// CHECK-LABEL: @tma_store_3d
-func.func @tma_store_3d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [$0, {$2, $3, $4} ], [$1];", "l,r,r,r,r"
-  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [$0, {$2, $3, $4} ], [$1];", "l,r,r,r,r,b"
-  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i1
-  return
-}
-
-// CHECK-LABEL: @tma_store_4d
-func.func @tma_store_4d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5} ], [$1];", "l,r,r,r,r,r"
-  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5} ], [$1];", "l,r,r,r,r,r,b"
-  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i1
-  return
-}
-
-// CHECK-LABEL: @tma_store_5d
-func.func @tma_store_5d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) {
-  // CHECK-NEXT: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5, $6} ], [$1];", "l,r,r,r,r,r,r"
-  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i32
-
-  // CHECK-NEXT: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5, $6} ], [$1];", "l,r,r,r,r,r,r,b"
-  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3,%crd4], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i32, i1
-  return
-}
-
-// CHECK-LABEL: @wgmma_execute
-func.func @wgmma_execute() {  
-  nvvm.wgmma.fence.aligned
-  nvvm.wgmma.commit.group.sync.aligned
-  nvvm.wgmma.wait.group.sync.aligned 0
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "wgmma.fence.sync.aligned;"
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "wgmma.commit_group.sync.aligned;"
-  // CHECK: %[[S0:.+]] = llvm.mlir.constant(0 : i32) : i32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "wgmma.wait_group.sync.aligned $0;", "n" %[[S0]] : (i32)
-  
-
-  nvvm.wgmma.fence.aligned
-  nvvm.wgmma.commit.group.sync.aligned
-  nvvm.wgmma.wait.group.sync.aligned 5
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "wgmma.fence.sync.aligned;"
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "wgmma.commit_group.sync.aligned;"
-  // CHECK: %[[S1:.+]] = llvm.mlir.constant(5 : i32) : i32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "wgmma.wait_group.sync.aligned $0;", "n" %[[S1]] : (i32)
-  return
-}
-
-
-// -----
-
-!mat64f32 = !llvm.struct<(
-  f32, f32, f32, f32, f32, f32, f32, f32, 
-  f32, f32, f32, f32, f32, f32, f32, f32)>
-
-// CHECK-LABEL: @wgmma_f32_f16_f16(
-// CHECK-SAME: %[[ARG0:.+]]: i64, %[[ARG1:.+]]: i64
-func.func @wgmma_f32_f16_f16(%descA : i64, %descB : i64) -> !mat64f32{  
-  // CHECK: %[[RES:.*]] = llvm.mlir.undef : !llvm.struct
-  // CHECK: %[[A1:.*]] = llvm.mlir.constant(0 : i32) : i32
-  // CHECK: %[[A2:.*]] = llvm.mlir.constant(-1 : i32) : i32
-  // CHECK: %[[A3:.*]] = llvm.mlir.constant(-1 : i32) : i32
-  // CHECK: %[[A4:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK: %[[A5:.*]] = llvm.mlir.constant(0 : i32) : i32
-  // CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
-  // CHECK: %[[V4:.*]] = llvm.extractvalue %[[RES]][4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
-  // CHECK: %[[V11:.*]] = llvm.extractvalue %[[RES]][11] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>  
-  // CHECK: %[[V13:.*]] = llvm.extractvalue %[[RES]][13] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
-  // CHECK: %[[RES1:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
-  // CHECK-SAME: "{
-  // CHECK-SAME: reg .pred p;
-  // CHECK-SAME: setp.ne.b32 p, $34, 0;
-  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 
-  // CHECK-SAME: {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}, $32, $33, p, $35,  $36, $37,  $38;\0A}\0A", 
-  // CHECK-SAME: "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,l,l,n,n,n,n,n" 
-  // CHECK-SAME: %[[V0]], %{{.*}}, %{{.*}}, %{{.*}}, %[[V4]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[V11]], %{{.*}}, %[[V13]], %{{.*}}, %{{.*}}, %[[ARG0]], %[[ARG1]], %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A5]] 
-  // CHECK-SAME: : (f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, i64, i64, i32, i32, i32, i32, i32) 
-  // CHECK-SAME: -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
-  // CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
-  // CHECK: %[[DESCa:.+]] = llvm.add %[[ARG0]], %[[C2]] : i64
-  // CHECK: %[[DESCb:.+]] = llvm.add %[[ARG1]], %[[C2]] : i64
-  // CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES1]][0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
-  // CHECK: %[[V4_2:.*]] = llvm.extractvalue %[[RES1]][4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
-  // CHECK: %[[V11_2:.*]] = llvm.extractvalue %[[RES1]][11] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>  
-  // CHECK: %[[V13_2:.*]] = llvm.extractvalue %[[RES1]][13] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
-  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
-  // CHECK-SAME: "{
-    // CHECK-SAME: .reg .pred p;
-    // CHECK-SAME: setp.ne.b32 p, $34, 0;
-    // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 
-    // CHECK-SAME: {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}, $32, $33, p, $35,  $36, $37,  $38;\0A}\0A", 
-    // CHECK-SAME: "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,l,l,n,n,n,n,n" 
-    // CHECK-SAME: %[[V0_2]], %{{.*}}, %{{.*}}, %{{.*}}, %[[V4_2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[V11_2]], %{{.*}}, %[[V13_2]], %{{.*}}, %{{.*}}, %[[DESCa]], %[[DESCb]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} 
-  %result = llvm.mlir.undef : !mat64f32
-  %result1 = nvvm.wgmma.mma_async 
-      %descA, %descB, %result,
-      #nvvm.shape<m = 64, n = 32, k = 16>, 
-      D [<f32>, #nvvm.wgmma_scale_out<zero>],
-      A [<f16>, #nvvm.wgmma_scale_in<neg>, <col>], 
-      B [<f16>, #nvvm.wgmma_scale_in<neg>, <col>]
-      :!mat64f32 -> !mat64f32
-  %c2 = arith.constant 2 : i64
-  %descAnext = arith.addi %descA, %c2 : i64
-  %descBnext = arith.addi %descB, %c2 : i64
-  %result2 = nvvm.wgmma.mma_async 
-      %descAnext, %descBnext, %result1,
-      #nvvm.shape<m = 64, n = 32, k = 16>, 
-      D [<f32>, #nvvm.wgmma_scale_out<zero>],
-      A [<f16>, #nvvm.wgmma_scale_in<neg>, <col>], 
-      B [<f16>, #nvvm.wgmma_scale_in<neg>, <col>]
-      : !mat64f32 -> !mat64f32
-  return %result2 : !mat64f32
-}
-
-// -----
-
-!mat16i32 = !llvm.struct<(i32, i32, i32, i32)>
-
-// CHECK-LABEL: @wgmma_s32_s8_s8_satfinite(
-// CHECK-SAME: %[[ARG0:.+]]: i64, %[[ARG1:.+]]: i64
-func.func @wgmma_s32_s8_s8_satfinite(%descA : i64, %descB : i64) -> !mat16i32{  
-  %result = llvm.mlir.undef : !mat16i32
-// CHECK: %[[RES:.*]] = llvm.mlir.undef : !llvm.struct
-// CHECK: %[[A1:.*]] = llvm.mlir.constant(1 : i32) : i32
-// CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0]
-// CHECK: %[[V1:.*]] = llvm.extractvalue %[[RES]][1]
-// CHECK: %[[V2:.*]] = llvm.extractvalue %[[RES]][2]
-// CHECK: %[[V3:.*]] = llvm.extractvalue %[[RES]][3]
-// CHECK: %[[RES_2:.*]] =  llvm.inline_asm has_side_effects asm_dialect = att 
-// CHECK-SAME: "{
-// CHECK-SAME: .reg .pred p;
-// CHECK-SAME: setp.ne.b32 p, $10, 0;
-// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite 
-// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" 
-// CHECK-SAME: %[[V0]], %[[V1]], %[[V2]], %[[V3]], %[[ARG0]], %[[ARG1]], %[[A1]] : 
-// CHECK-SAME: (i32, i32, i32, i32, i64, i64, i32) -> !llvm.struct<(i32, i32, i32, i32)>
-// CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES_2]][0]
-// CHECK: %[[V1_2:.*]] = llvm.extractvalue %[[RES_2]][1]
-// CHECK: %[[V2_2:.*]] = llvm.extractvalue %[[RES_2]][2]
-// CHECK: %[[V3_2:.*]] = llvm.extractvalue %[[RES_2]][3]
-// CHECK: %[[RES_3:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 
-// CHECK-SAME: "{
-// CHECK-SAME: .reg .pred p;
-// CHECK-SAME: setp.ne.b32 p, $10, 0;
-// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite 
-// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", 
-// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" 
-// CHECK-SAME: %[[V0_2]], %[[V1_2]], %[[V2_2]], %[[V3_2]], %[[ARG0]], %[[ARG1]], %{{.*}}
-// CHECK: %[[V0_3:.*]] = llvm.extractvalue %[[RES_3]][0]
-// CHECK: %[[V1_3:.*]] = llvm.extractvalue %[[RES_3]][1]
-// CHECK: %[[V2_3:.*]] = llvm.extractvalue %[[RES_3]][2]
-// CHECK: %[[V3_3:.*]] = llvm.extractvalue %[[RES_3]][3]
-// CHECK: %[[RES1:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 
-// CHECK-SAME:"{
-// CHECK-SAME:.reg .pred p;
-// CHECK-SAME: setp.ne.b32 p, $10, 0;
-// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite
-// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" 
-// CHECK-SAME: %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}} 
-  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result, 
-      #nvvm.shape<m = 64, n = 8, k = 32>, 
-      D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>],
-      A [<s8>, #nvvm.wgmma_scale_in<one>, <row>], 
-      B [<s8>, #nvvm.wgmma_scale_in<one>, <col>]
-      : !mat16i32 -> !mat16i32
-  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1, 
-      #nvvm.shape<m = 64, n = 8, k = 32>, 
-      D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>],
-      A [<s8>, #nvvm.wgmma_scale_in<one>, <row>], 
-      B [<s8>, #nvvm.wgmma_scale_in<one>, <col>]
-      : !mat16i32 -> !mat16i32
-  %result3 = nvvm.wgmma.mma_async %descA, %descB, %result2, 
-      #nvvm.shape<m = 64, n = 8, k = 32>, 
-      D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>],
-      A [<s8>, #nvvm.wgmma_scale_in<one>, <row>], 
-      B [<s8>, #nvvm.wgmma_scale_in<one>, <col>]
-      : !mat16i32 -> !mat16i32
-  return %result3 : !mat16i32
-}
-
-// CHECK-LABEL: @wgmma_s32_u8_u8(
-  // CHECK-SAME: %[[ARG0:.+]]: i64, %[[ARG1:.+]]: i64
-func.func @wgmma_s32_u8_u8(%descA : i64, %descB : i64) -> !mat16i32 {  
-// CHECK: %[[RES:.*]] = llvm.mlir.undef : !llvm.struct
-// CHECK: %[[A1:.*]] = llvm.mlir.constant(1 : i32) : i32
-// CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0]
-// CHECK: %[[V1:.*]] = llvm.extractvalue %[[RES]][1]
-// CHECK: %[[V2:.*]] = llvm.extractvalue %[[RES]][2]
-// CHECK: %[[V3:.*]] = llvm.extractvalue %[[RES]][3]
-// CHECK: %[[RES_2:.*]] =  llvm.inline_asm has_side_effects asm_dialect = att 
-// CHECK-SAME: "{
-// CHECK-SAME: .reg .pred p;
-// CHECK-SAME: setp.ne.b32 p, $10, 0;
-// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;
-// CHECK-SAME: }\0A",
-// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0]], %[[V1]], %[[V2]], %[[V3]], %[[ARG0]], %[[ARG1]], %[[A1]] : 
-// CHECK-SAME:(i32, i32, i32, i32, i64, i64, i32) -> !llvm.struct<(i32, i32, i32, i32)>
-// CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES_2]][0]
-// CHECK: %[[V1_2:.*]] = llvm.extractvalue %[[RES_2]][1]
-// CHECK: %[[V2_2:.*]] = llvm.extractvalue %[[RES_2]][2]
-// CHECK: %[[V3_2:.*]] = llvm.extractvalue %[[RES_2]][3]
-// CHECK: %[[RES_3:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 
-// CHECK-SAME:"{
-// CHECK-SAME: .reg .pred p;
-// CHECK-SAME: setp.ne.b32 p, $10, 0;
-// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;
-// CHECK-SAME: }\0A",
-// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_2]], %[[V1_2]], %[[V2_2]], %[[V3_2]], %[[ARG0]], %[[ARG1]], %{{.*}}
-// CHECK: %[[V0_3:.*]] = llvm.extractvalue %[[RES_3]][0]
-// CHECK: %[[V1_3:.*]] = llvm.extractvalue %[[RES_3]][1]
-// CHECK: %[[V2_3:.*]] = llvm.extractvalue %[[RES_3]][2]
-// CHECK: %[[V3_3:.*]] = llvm.extractvalue %[[RES_3]][3]
-// CHECK: %[[RES1:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 
-// CHECK-SAME:"{
-// CHECK-SAME: .reg .pred p;
-// CHECK-SAME: setp.ne.b32 p, $10, 0;
-// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;
-// CHECK-SAME:}\0A", 
-// CHECK-SAME:"=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}} 
-  %result = llvm.mlir.undef : !mat16i32
-  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result,
-      #nvvm.shape<m = 64, n = 8, k = 32>, 
-      D [<s32>, #nvvm.wgmma_scale_out<one>],
-      A [<u8>, #nvvm.wgmma_scale_in<one>, <row>], 
-      B [<u8>, #nvvm.wgmma_scale_in<one>, <col>]
-      : !mat16i32 -> !mat16i32
-  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
-      #nvvm.shape<m = 64, n = 8, k = 32>, 
-      D [<s32>, #nvvm.wgmma_scale_out<one>],
-      A [<u8>, #nvvm.wgmma_scale_in<one>, <row>], 
-      B [<u8>, #nvvm.wgmma_scale_in<one>, <col>]
-      : !mat16i32 -> !mat16i32
-  %result3 = nvvm.wgmma.mma_async %descA, %descB, %result2,
-      #nvvm.shape<m = 64, n = 8, k = 32>, 
-      D [<s32>, #nvvm.wgmma_scale_out<one>],
-      A [<u8>, #nvvm.wgmma_scale_in<one>, <row>], 
-      B [<u8>, #nvvm.wgmma_scale_in<one>, <col>]
-      : !mat16i32 -> !mat16i32
-  return %result3 : !mat16i32
-}
-
-// -----
-
-!mat32f32 = !llvm.struct<(
-  f32, f32, f32, f32, f32, f32, f32, f32, 
-  f32, f32, f32, f32, f32, f32, f32, f32, 
-  f32, f32, f32, f32, f32, f32, f32, f32, 
-  f32, f32, f32, f32, f32, f32, f32, f32)>
-
-// CHECK-LABEL: @wgmma_f32_tf32_tf32
-func.func @wgmma_f32_tf32_tf32(%descA : i64, %descB : i64) -> !mat32f32 {  
-  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
-  // CHECK-SAME:"{
-  // CHECK-SAME: .reg .pred p;
-  // CHECK-SAME: setp.ne.b32 p, $66, 0;
-  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
-  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
-  // CHECK-SAME: "{
-  // CHECK-SAME: .reg .pred p;
-  // CHECK-SAME: setp.ne.b32 p, $66, 0;
-  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
-  %result = llvm.mlir.undef : !mat32f32
-  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result,
-      #nvvm.shape<m = 64, n = 64, k = 8>, 
-      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
-      A [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
-      B [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
-       : !mat32f32 -> !mat32f32
-  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
-      #nvvm.shape<m = 64, n = 64, k = 8>, 
-      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
-      A [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
-      B [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
-      : !mat32f32 -> !mat32f32
-  return %result2 : !mat32f32
-}
-
-
-// -----
-
-!mat32f32 = !llvm.struct<(
-  f32, f32, f32, f32, f32, f32, f32, f32, 
-  f32, f32, f32, f32, f32, f32, f32, f32, 
-  f32, f32, f32, f32, f32, f32, f32, f32, 
-  f32, f32, f32, f32, f32, f32, f32, f32)>
-
-// CHECK-LABEL: @wgmma_f32_e4m3_e4m3
-func.func @wgmma_f32_e4m3_e4m3(%descA : i64, %descB : i64) -> !mat32f32 {  
-  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
-  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
-  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
-  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
-  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
-  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
-  %result = llvm.mlir.undef : !mat32f32
-  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result,
-      #nvvm.shape<m = 64, n = 64, k = 32>, 
-      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
-      A [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
-      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
-       : !mat32f32 -> !mat32f32
-  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
-      #nvvm.shape<m = 64, n = 64, k = 32>, 
-      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
-      A [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
-      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
-      : !mat32f32 -> !mat32f32
-  return %result2 : !mat32f32
-}
-
-// -----
-
-!mat32f32 = !llvm.struct<(
-  f32, f32, f32, f32, f32, f32, f32, f32, 
-  f32, f32, f32, f32, f32, f32, f32, f32, 
-  f32, f32, f32, f32, f32, f32, f32, f32, 
-  f32, f32, f32, f32, f32, f32, f32, f32)>
-
-// CHECK-LABEL: @wgmma_f32_e5m2_e4m3
-func.func @wgmma_f32_e5m2_e4m3(%descA : i64, %descB : i64) -> !mat32f32 {  
-  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
-  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
-  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
-  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
-  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
-  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
-  %result = llvm.mlir.undef : !mat32f32
-  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result,
-      #nvvm.shape<m = 64, n = 64, k = 32>, 
-      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
-      A [#nvvm.wgmma_type<e5m2>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
-      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
-       : !mat32f32 -> !mat32f32
-  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
-      #nvvm.shape<m = 64, n = 64, k = 32>, 
-      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
-      A [#nvvm.wgmma_type<e5m2>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
-      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
-      : !mat32f32 -> !mat32f32
-  return %result2 : !mat32f32
-}
-
-// -----
-
-func.func @elect_one_leader_sync() {  
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "{
-  // CHECK-SAME: .reg .u32 rx;
-  // CHECK-SAME: .reg .pred px;
-  // CHECK-SAME: mov.pred $0, 0;
-  // CHECK-SAME: elect.sync rx | px, 0xFFFFFFFF;
-  // CHECK-SAME: @px mov.pred $0, 1;
-  // CHECK-SAME: "=b"  : () -> i1
-  %cnd = nvvm.elect.sync -> i1 
-  return 
-}
-
-// -----
-
-// CHECK-LABEL: @stmatrix(
-// CHECK-SAME: %[[arg0:[a-zA-Z0-9_]+]]: !llvm.ptr<3>, 
-// CHECK-SAME: %[[arg1:[a-zA-Z0-9_]+]]: i32,
-// CHECK-SAME: %[[arg2:[a-zA-Z0-9_]+]]: i32,
-// CHECK-SAME: %[[arg3:[a-zA-Z0-9_]+]]: i32,
-// CHECK-SAME: %[[arg4:[a-zA-Z0-9_]+]]: i32)
-llvm.func @stmatrix(%arg0 : !llvm.ptr<3>, %m1 : i32, %m2 : i32, %m3 : i32, %m4 : i32) {
-// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x1.m8n8.shared.b16 [$0], {$1};", "r,r" %[[arg0]], %[[arg1]] : (!llvm.ptr<3>, i32) -> ()
-// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x2.m8n8.shared.b16 [$0], {$1, $2};", "r,r,r" %[[arg0]], %[[arg1]], %[[arg2]] : (!llvm.ptr<3>, i32, i32) -> ()
-// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x4.m8n8.shared.b16 [$0], {$1, $2, $3, $4};", "r,r,r,r,r" %[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]], %[[arg4]] : (!llvm.ptr<3>, i32, i32, i32, i32) -> ()
-// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x1.trans.m8n8.shared.b16 [$0], {$1};", "r,r" %[[arg0]], %[[arg1]] : (!llvm.ptr<3>, i32) -> ()
-// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x2.trans.m8n8.shared.b16 [$0], {$1, $2};", "r,r,r" %[[arg0]], %[[arg1]], %[[arg2]] : (!llvm.ptr<3>, i32, i32) -> ()
-// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x4.trans.m8n8.shared.b16 [$0], {$1, $2, $3, $4};", "r,r,r,r,r" %[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]], %[[arg4]] : (!llvm.ptr<3>, i32, i32, i32, i32) -> ()
-  nvvm.stmatrix %arg0, %m1 {layout = #nvvm.mma_layout<row>} : !llvm.ptr<3>, i32
-  nvvm.stmatrix %arg0, %m1, %m2 {layout = #nvvm.mma_layout<row>} : !llvm.ptr<3>, i32, i32
-  nvvm.stmatrix %arg0, %m1, %m2, %m3, %m4 {layout = #nvvm.mma_layout<row>} : !llvm.ptr<3>, i32, i32, i32, i32
-  nvvm.stmatrix %arg0, %m1 {layout = #nvvm.mma_layout<col>} : !llvm.ptr<3>, i32
-  nvvm.stmatrix %arg0, %m1, %m2 {layout = #nvvm.mma_layout<col>} : !llvm.ptr<3>, i32, i32
-  nvvm.stmatrix %arg0, %m1, %m2, %m3, %m4 {layout = #nvvm.mma_layout<col>} : !llvm.ptr<3>, i32, i32, i32, i32
-  llvm.return 
-}
-
-// -----
-
-// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
-llvm.func @init_mbarrier_arrive_expect_tx(%desc : !llvm.ptr, %pred : i1) {
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "prefetch.tensormap [$0];", "l"
-  nvvm.prefetch.tensormap %desc : !llvm.ptr
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$1 prefetch.tensormap [$0];", "l,b"
-  nvvm.prefetch.tensormap %desc, predicate = %pred : !llvm.ptr, i1
-  llvm.return
-}
-
-// -----
-
-func.func @set_max_register() {
-  // CHECK: nvvm.setmaxregister increase 232
-  nvvm.setmaxregister increase 232
-
-  // CHECK: nvvm.setmaxregister decrease 40
-  nvvm.setmaxregister decrease 40
-  func.return
-}
-
-// -----
-
-func.func @cp_async_bulk_commit() {
-  // CHECK: nvvm.cp.async.bulk.commit.group
-  nvvm.cp.async.bulk.commit.group
-  func.return
-}
-
-// -----
-
-func.func @cp_async_bulk_wait_group() {
-  // CHECK: nvvm.cp.async.bulk.wait_group 1
-  // CHECK: nvvm.cp.async.bulk.wait_group 0
-  // CHECK: nvvm.cp.async.bulk.wait_group 5 {read}
-  // CHECK: nvvm.cp.async.bulk.wait_group 0 {read}
-  nvvm.cp.async.bulk.wait_group 1
-  nvvm.cp.async.bulk.wait_group 0
-  nvvm.cp.async.bulk.wait_group 5 {read}
-  nvvm.cp.async.bulk.wait_group 0 {read}
-  func.return
-}
-
-// -----
-
-func.func @fence_mbarrier_init() {
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.mbarrier_init.release.cluster;"
-  nvvm.fence.mbarrier.init
-  func.return 
-}
-// -----
-
-func.func @fence_proxy() {
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.alias;", ""  : () -> ()
-  nvvm.fence.proxy { kind = #nvvm.proxy_kind<alias>}
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async;", ""  : () -> ()
-  nvvm.fence.proxy { kind = #nvvm.proxy_kind<async>}
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.global;", ""  : () -> ()
-  nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.global>}
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.shared::cta;", ""  : () -> ()
-  nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>}
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.shared::cluster;", ""  : () -> ()
-  nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cluster>}
-  func.return
-}
-
-// -----
-
-// CHECK-LABEL: @llvm_nvvm_barrier_arrive
-// CHECK-SAME: (%[[barId:.*]]: i32, %[[numberOfThreads:.*]]: i32)
-llvm.func @llvm_nvvm_barrier_arrive(%barID : i32, %numberOfThreads : i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "bar.arrive 0, $0;", "r" %[[numberOfThreads]] : (i32) -> ()
-  nvvm.barrier.arrive number_of_threads = %numberOfThreads
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "bar.arrive $0, $1;", "r,r" %[[barId]], %[[numberOfThreads]] : (i32, i32) -> ()
-  nvvm.barrier.arrive id = %barID number_of_threads = %numberOfThreads
-  llvm.return
+//===- NVVMDialect.cpp - NVVM IR Ops and Dialect registration -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the types and operation details for the NVVM IR dialect in
+// MLIR, and the LLVM IR dialect.  It also registers the dialect.
+//
+// The NVVM dialect only contains GPU specific additions on top of the general
+// LLVM dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+
+#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
+#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/OperationSupport.h"
+#include "mlir/IR/Types.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <optional>
+#include <string>
+
+using namespace mlir;
+using namespace NVVM;
+
+#include "mlir/Dialect/LLVMIR/NVVMOpsDialect.cpp.inc"
+#include "mlir/Dialect/LLVMIR/NVVMOpsEnums.cpp.inc"
+
+//===----------------------------------------------------------------------===//
+// Printing/parsing for NVVM ops
+//===----------------------------------------------------------------------===//
+
+/// Prints the generic NVVM intrinsic form: a space followed by the operand
+/// list and, when the operation has results, ` : ` and the result types.
+static void printNVVMIntrinsicOp(OpAsmPrinter &p, Operation *op) {
+  p << " ";
+  p << op->getOperands();
+  const bool hasResults = op->getNumResults() != 0;
+  if (!hasResults)
+    return;
+  p << " : ";
+  p << op->getResultTypes();
 }
+
+// <operation> ::= `llvm.nvvm.vote.ballot.sync %mask, %pred` : result_type
+//
+// Parses the operand list, an optional attribute dictionary and the result
+// type, then resolves the two operands as an i32 and an i1 respectively.
+ParseResult VoteBallotOp::parse(OpAsmParser &parser, OperationState &result) {
+  MLIRContext *ctx = parser.getContext();
+  IntegerType maskTy = IntegerType::get(ctx, 32);
+  IntegerType predTy = IntegerType::get(ctx, 1);
+
+  SmallVector<OpAsmParser::UnresolvedOperand, 8> operands;
+  if (parser.parseOperandList(operands))
+    return failure();
+  if (parser.parseOptionalAttrDict(result.attributes))
+    return failure();
+
+  Type resultType;
+  if (parser.parseColonType(resultType))
+    return failure();
+  if (parser.addTypeToList(resultType, result.types))
+    return failure();
+
+  if (parser.resolveOperands(operands, {maskTy, predTy}, parser.getNameLoc(),
+                             result.operands))
+    return failure();
+  return success();
+}
+
+/// Prints the op using the shared NVVM intrinsic printer.
+void VoteBallotOp::print(OpAsmPrinter &p) {
+  printNVVMIntrinsicOp(p, getOperation());
+}
+
+/// Verifies the number of coordinates (1 to 5) and, when im2col offsets are
+/// present, that the tensor has at least 3 coordinates and exactly two more
+/// coordinates than im2col offsets.
+LogicalResult CpAsyncBulkTensorGlobalToSharedClusterOp::verify() {
+  const size_t numCoordinates = getCoordinates().size();
+  if (numCoordinates == 0 || numCoordinates > 5)
+    return emitError("expects coordinates between 1 to 5 dimension");
+
+  // Without im2col offsets there is nothing further to check.
+  const size_t numIm2colOffsets = getIm2colOffsets().size();
+  if (numIm2colOffsets == 0)
+    return success();
+
+  // im2col mode.
+  if (numCoordinates < 3)
+    return emitError(
+        "to use im2col mode, the tensor has to be at least 3-dimensional");
+  if (numCoordinates != numIm2colOffsets + 2)
+    return emitError(
+        "im2col offsets must be 2 less than number of coordinates");
+  return success();
+}
+
+/// Verifies that at most 5 coordinates are supplied.
+LogicalResult CpAsyncBulkTensorSharedCTAToGlobalOp::verify() {
+  const size_t numCoordinates = getCoordinates().size();
+  const bool withinLimit = numCoordinates <= 5;
+  if (withinLimit)
+    return success();
+  return emitError("Maximum 5 coordinates and dimension is supported.");
+}
+
+/// Verifies the cache modifier (CG or CA only), the copy size (4, 8 or 16
+/// bytes) and that the CG modifier is only combined with a 16-byte copy.
+LogicalResult CpAsyncOp::verify() {
+  const auto modifier = getModifier();
+  const auto byteSize = getSize();
+
+  const bool isCG = modifier == LoadCacheModifierKind::CG;
+  const bool isCA = modifier == LoadCacheModifierKind::CA;
+  if (!(isCG || isCA))
+    return emitError("Only CG and CA cache modifiers are supported.");
+
+  const bool isSupportedSize = byteSize == 4 || byteSize == 8 || byteSize == 16;
+  if (!isSupportedSize)
+    return emitError("expected byte size to be either 4, 8 or 16.");
+
+  if (isCG && byteSize != 16)
+    return emitError("CG cache modifier is only support for 16 bytes copy.");
+  return success();
+}
+
+// Maps the element type of an MMA operand to the PTX type
+// (`NVVM::MMATypes`) it corresponds to. `isAccumulator` disambiguates types
+// that map differently for accumulators and multiplicands: f32 is `f32` for
+// an accumulator and `tf32` otherwise, and integer types are only inferrable
+// (as `s32`) for accumulators. LLVM struct types are inferred from their
+// first body element. Returns `std::nullopt` when nothing can be inferred.
+std::optional<mlir::NVVM::MMATypes>
+MmaOp::inferOperandMMAType(Type operandElType, bool isAccumulator) {
+  Type vec2xF16Ty =
+      LLVM::getFixedVectorType(Float16Type::get(operandElType.getContext()), 2);
+
+  if (operandElType.isF64())
+    return NVVM::MMATypes::f64;
+
+  const bool isHalfLike = operandElType.isF16() || operandElType == vec2xF16Ty;
+  if (isHalfLike)
+    return NVVM::MMATypes::f16;
+
+  if (operandElType.isF32())
+    return isAccumulator ? NVVM::MMATypes::f32 : NVVM::MMATypes::tf32;
+
+  if (llvm::isa<IntegerType>(operandElType))
+    return isAccumulator
+               ? std::optional<mlir::NVVM::MMATypes>(NVVM::MMATypes::s32)
+               : std::nullopt;
+
+  auto structTy = llvm::dyn_cast<LLVM::LLVMStructType>(operandElType);
+  if (!structTy)
+    return std::nullopt;
+
+  // Recurse into the first member; an empty struct carries no information.
+  ArrayRef<Type> members = structTy.getBody();
+  if (members.empty())
+    return std::nullopt;
+  return inferOperandMMAType(members[0], isAccumulator);
+}
+
+/// Returns true for the 4-bit integer PTX types (`u4`, `s4`).
+static bool isInt4PtxType(MMATypes type) {
+  switch (type) {
+  case MMATypes::u4:
+  case MMATypes::s4:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Returns true for the 8-bit integer PTX types (`u8`, `s8`).
+static bool isInt8PtxType(MMATypes type) {
+  switch (type) {
+  case MMATypes::u8:
+  case MMATypes::s8:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Returns true for every integer-like PTX type: the 4-bit and 8-bit integer
+/// types, `b1` and `s32`.
+static bool isIntegerPtxType(MMATypes type) {
+  if (type == MMATypes::b1 || type == MMATypes::s32)
+    return true;
+  if (isInt4PtxType(type))
+    return true;
+  return isInt8PtxType(type);
+}
+
+/// Returns the PTX type inferred from the type of the first operand in ODS
+/// operand group 2, treated as an accumulator.
+MMATypes MmaOp::accumPtxType() {
+  Type accumElType = getODSOperands(2).getTypes().front();
+  std::optional<mlir::NVVM::MMATypes> inferred =
+      inferOperandMMAType(accumElType, /*isAccum=*/true);
+  assert(inferred.has_value() &&
+         "accumulator PTX type should always be inferrable");
+  return inferred.value();
+}
+
+/// Returns the PTX type inferred from the result type, treated as an
+/// accumulator.
+MMATypes MmaOp::resultPtxType() {
+  Type resultElType = getResult().getType();
+  std::optional<mlir::NVVM::MMATypes> inferred =
+      inferOperandMMAType(resultElType, /*isAccum=*/true);
+  assert(inferred.has_value() && "result PTX type should always be inferrable");
+  return inferred.value();
+}
+
+/// Prints the op in its custom assembly form:
+///   A[...] B[...] C[...] attr-dict : (typeA0, typeB0, typeC0) -> resultType
+/// The operand segment size attribute is never printed, and a multiplicand
+/// PTX type attribute is elided whenever `inferOperandMMAType` returns a
+/// value for the fragment (see the loop below).
+void MmaOp::print(OpAsmPrinter &p) {
+  SmallVector<Type, 4> regTypes;
+  // Per-fragment bookkeeping: the printed keyword, the name of the PTX type
+  // attribute that may be elided, and the fragment's operand values.
+  struct OperandFragment {
+    StringRef operandName;
+    StringRef ptxTypeAttr;
+    SmallVector<Value, 4> regs;
+    explicit OperandFragment(StringRef name, StringRef ptxTypeName)
+        : operandName(name), ptxTypeAttr(ptxTypeName) {}
+  };
+
+  // The C fragment has no PTX type attribute, hence the empty name.
+  std::array<OperandFragment, 3> frags{
+      OperandFragment("A", getMultiplicandAPtxTypeAttrName()),
+      OperandFragment("B", getMultiplicandBPtxTypeAttrName()),
+      OperandFragment("C", "")};
+  SmallVector<StringRef, 4> ignoreAttrNames{
+      mlir::NVVM::MmaOp::getOperandSegmentSizeAttr()};
+
+  // Collect each fragment's operands and decide whether its PTX type
+  // attribute can be omitted from the printed attribute dictionary.
+  for (unsigned fragIdx = 0; fragIdx < frags.size(); fragIdx++) {
+    auto &frag = frags[fragIdx];
+    auto varOperandSpec = getODSOperandIndexAndLength(fragIdx);
+    for (auto operandIdx = varOperandSpec.first;
+         operandIdx < varOperandSpec.first + varOperandSpec.second;
+         operandIdx++) {
+      frag.regs.push_back(this->getOperand(operandIdx));
+      // NOTE(review): only operand 0 of the whole op is ever recorded here,
+      // so `regTypes.back()` below is the type of operand 0 for every
+      // fragment. Possibly `operandIdx == varOperandSpec.first` was
+      // intended -- confirm before changing, since it affects which
+      // attributes are elided.
+      if (operandIdx == 0) {
+        regTypes.push_back(this->getOperand(operandIdx).getType());
+      }
+    }
+    std::optional<MMATypes> inferredType =
+        inferOperandMMAType(regTypes.back(), /*isAccum=*/fragIdx >= 2);
+    if (inferredType)
+      ignoreAttrNames.push_back(frag.ptxTypeAttr);
+  }
+
+  // Prints ` <name>[<operands>] ` for one fragment.
+  auto printMmaOperand = [&](const OperandFragment &frag) -> void {
+    p << " " << frag.operandName;
+    p << "[";
+    p.printOperands(frag.regs);
+    p << "] ";
+  };
+
+  for (const auto &frag : frags) {
+    printMmaOperand(frag);
+  }
+
+  p.printOptionalAttrDict(this->getOperation()->getAttrs(), ignoreAttrNames);
+
+  // Print the types of the operands and result.
+  // Only the type of the first register of each fragment is printed.
+  p << " : " << "(";
+  llvm::interleaveComma(SmallVector<Type, 3>{frags[0].regs[0].getType(),
+                                             frags[1].regs[0].getType(),
+                                             frags[2].regs[0].getType()},
+                        p);
+  p << ")";
+  p.printArrowTypeList(TypeRange{this->getRes().getType()});
+}
+
+/// Populates `result` for an MMA op.
+///
+/// `shape` must hold exactly three entries (m, n, k). The multiplicand PTX
+/// types are taken from `multiplicandPtxTypes` when provided and are
+/// otherwise inferred from the first A/B operand (the attribute is left out
+/// if inference fails). Layouts default to row for A and col for B.
+/// `intOverflow` and `b1Op` are only attached when they hold a value. The
+/// operand segment sizes are recorded from the sizes of the three ranges.
+void MmaOp::build(OpBuilder &builder, OperationState &result, Type resultType,
+                  ValueRange operandA, ValueRange operandB, ValueRange operandC,
+                  ArrayRef<int64_t> shape, std::optional<MMAB1Op> b1Op,
+                  std::optional<MMAIntOverflow> intOverflow,
+                  std::optional<std::array<MMATypes, 2>> multiplicandPtxTypes,
+                  std::optional<std::array<MMALayout, 2>> multiplicandLayouts) {
+  assert(shape.size() == 3 && "expected shape to have size 3 (m, n, k)");
+  MLIRContext *ctx = builder.getContext();
+
+  // Small helpers that attach a PTX type / layout attribute by name.
+  const auto addPtxTypeAttr = [&](StringRef attrName, MMATypes ptxType) {
+    result.addAttribute(attrName, MMATypesAttr::get(ctx, ptxType));
+  };
+  const auto addLayoutAttr = [&](StringRef attrName, MMALayout layout) {
+    result.addAttribute(attrName, MMALayoutAttr::get(ctx, layout));
+  };
+
+  const int64_t m = shape[0];
+  const int64_t n = shape[1];
+  const int64_t k = shape[2];
+  result.addAttribute("shape", builder.getAttr<MMAShapeAttr>(m, n, k));
+
+  result.addOperands(operandA);
+  result.addOperands(operandB);
+  result.addOperands(operandC);
+
+  if (multiplicandPtxTypes.has_value()) {
+    const std::array<MMATypes, 2> &ptxTypes = *multiplicandPtxTypes;
+    addPtxTypeAttr("multiplicandAPtxType", ptxTypes[0]);
+    addPtxTypeAttr("multiplicandBPtxType", ptxTypes[1]);
+  } else {
+    // Fall back to inference from the first register of each multiplicand.
+    std::optional<MMATypes> inferredA =
+        inferOperandMMAType(operandA[0].getType(), false);
+    if (inferredA)
+      addPtxTypeAttr("multiplicandAPtxType", *inferredA);
+    std::optional<MMATypes> inferredB =
+        inferOperandMMAType(operandB[0].getType(), false);
+    if (inferredB)
+      addPtxTypeAttr("multiplicandBPtxType", *inferredB);
+  }
+
+  const MMALayout layoutA =
+      multiplicandLayouts ? (*multiplicandLayouts)[0] : MMALayout::row;
+  const MMALayout layoutB =
+      multiplicandLayouts ? (*multiplicandLayouts)[1] : MMALayout::col;
+  addLayoutAttr("layoutA", layoutA);
+  addLayoutAttr("layoutB", layoutB);
+
+  if (intOverflow)
+    result.addAttribute("intOverflowBehavior",
+                        MMAIntOverflowAttr::get(ctx, *intOverflow));
+  if (b1Op)
+    result.addAttribute("b1Op", MMAB1OpAttr::get(ctx, *b1Op));
+
+  result.addTypes(resultType);
+
+  const int32_t numA = static_cast<int32_t>(operandA.size());
+  const int32_t numB = static_cast<int32_t>(operandB.size());
+  const int32_t numC = static_cast<int32_t>(operandC.size());
+  result.addAttribute(MmaOp::getOperandSegmentSizeAttr(),
+                      builder.getDenseI32ArrayAttr({numA, numB, numC}));
+}
+
+// <operation> :=
+//   A `[` $operandA `]` B `[` $operandB `]` C `[` $operandC `]`
+//   attr-dict : (type($operandA[0]), type($operandB[0]), type($operandC[0]))
+//     `->` type($res)
+//
+// Parses the three operand segments, the optional attribute dictionary, one
+// type per segment (applied to every register of that segment) and the result
+// type. A multiplicand PTX type attribute that is not spelled out explicitly
+// is added from the inferred element type; parsing fails if it is neither
+// provided nor inferrable. The operand segment sizes are recorded from the
+// number of parsed registers per segment.
+ParseResult MmaOp::parse(OpAsmParser &parser, OperationState &result) {
+  struct OperandFragment {
+    std::optional<MMATypes> elemtype;
+    SmallVector<OpAsmParser::UnresolvedOperand, 4> regs;
+    SmallVector<Type> regTypes;
+  };
+
+  Builder &builder = parser.getBuilder();
+  // Fragments 0..2 are the A/B/C operand segments, fragment 3 is the result.
+  std::array<OperandFragment, 4> frags;
+
+  NamedAttrList namedAttributes;
+
+  // A helper to parse the operand segments.
+  auto parseMmaOperand = [&](StringRef operandName,
+                             OperandFragment &frag) -> LogicalResult {
+    if (parser.parseKeyword(operandName).failed())
+      return failure();
+    if (parser
+            .parseOperandList(frag.regs, OpAsmParser::Delimiter::OptionalSquare)
+            .failed())
+      return failure();
+    return success();
+  };
+
+  // Parse the operand segments.
+  if (parseMmaOperand("A", frags[0]).failed())
+    return failure();
+  if (parseMmaOperand("B", frags[1]).failed())
+    return failure();
+  if (parseMmaOperand("C", frags[2]).failed())
+    return failure();
+
+  if (parser.parseOptionalAttrDict(namedAttributes).failed())
+    return failure();
+
+  // Parse the type specification and resolve operands.
+  SmallVector<Type, 3> operandTypes;
+  if (failed(parser.parseColon()))
+    return failure();
+  if (failed(parser.parseLParen()))
+    return failure();
+  if (failed(parser.parseTypeList(operandTypes)))
+    return failure();
+  // A missing `)` is a hard parse failure in its own right.
+  if (failed(parser.parseRParen()))
+    return failure();
+  // The type count must always be validated: the loop below indexes `frags`
+  // by type position, and every operand segment needs exactly one type.
+  if (operandTypes.size() != 3)
+    return parser.emitError(
+        parser.getNameLoc(),
+        "expected one type for each operand segment but got " +
+            Twine(operandTypes.size()) + " types");
+  for (const auto &iter : llvm::enumerate(operandTypes)) {
+    auto &frag = frags[iter.index()];
+    // `regTypes[0]` is read below, so an empty segment must be rejected.
+    if (frag.regs.empty())
+      return parser.emitError(
+          parser.getNameLoc(),
+          "expected at least one operand in each operand segment");
+    frag.regTypes.resize(frag.regs.size(), iter.value());
+    if (failed(parser.resolveOperands(frag.regs, frag.regTypes,
+                                      parser.getNameLoc(), result.operands)))
+      return failure();
+    // NOTE(review): this treats A and B (indices 0 and 1) as accumulators
+    // and C as a non-accumulator, the opposite of the `fragIdx >= 2` test
+    // used when printing. Kept as-is to preserve existing inference
+    // behavior -- confirm the intended polarity.
+    frag.elemtype =
+        inferOperandMMAType(frag.regTypes[0], /*isAccum=*/iter.index() < 2);
+  }
+
+  Type resultType;
+  if (parser.parseArrow() || parser.parseType(resultType))
+    return failure();
+  frags[3].elemtype = inferOperandMMAType(resultType, /*isAccum=*/true);
+
+  // Fill in multiplicand PTX type attributes that were not given explicitly.
+  std::array<StringRef, 2> names{"multiplicandAPtxType",
+                                 "multiplicandBPtxType"};
+  for (unsigned idx = 0; idx < names.size(); idx++) {
+    const auto &frag = frags[idx];
+    std::optional<NamedAttribute> attr = namedAttributes.getNamed(names[idx]);
+    if (!frag.elemtype.has_value() && !attr.has_value()) {
+      return parser.emitError(
+          parser.getNameLoc(),
+          "attribute " + names[idx] +
+              " is not provided explicitly and cannot be inferred");
+    }
+    if (!attr.has_value())
+      result.addAttribute(
+          names[idx], MMATypesAttr::get(parser.getContext(), *frag.elemtype));
+  }
+
+  result.addTypes(resultType);
+  if (!namedAttributes.empty())
+    result.addAttributes(namedAttributes);
+  result.addAttribute(MmaOp::getOperandSegmentSizeAttr(),
+                      builder.getDenseI32ArrayAttr({
+                          static_cast<int32_t>(frags[0].regs.size()),
+                          static_cast<int32_t>(frags[1].regs.size()),
+                          static_cast<int32_t>(frags[2].regs.size()),
+                      }));
+  return success();
+}
+
+LogicalResult MmaOp::verify() {
+  MLIRContext *context = getContext();
+  auto f16Ty = Float16Type::get(context);
+  auto i32Ty = IntegerType::get(context, 32);
+  auto f16x2Ty = LLVM::getFixedVectorType(f16Ty, 2);
+  auto f32Ty = Float32Type::get(context);
+  auto f16x2x4StructTy = LLVM::LLVMStructType::getLiteral(
+      context, {f16x2Ty, f16x2Ty, f16x2Ty, f16x2Ty});
+
+  auto s32x4StructTy =
+      LLVM::LLVMStructType::getLiteral(context, {i32Ty, i32Ty, i32Ty, i32Ty});
+  auto f32x8StructTy =
+      LLVM::LLVMStructType::getLiteral(context, SmallVector<Type>(8, f32Ty));
+  auto f16x2x2StructTy =
+      LLVM::LLVMStructType::getLiteral(context, {f16x2Ty, f16x2Ty});
+  auto f32x4StructTy =
+      LLVM::LLVMStructType::getLiteral(context, {f32Ty, f32Ty, f32Ty, f32Ty});
+  auto s32x2StructTy =
+      LLVM::LLVMStructType::getLiteral(context, {i32Ty, i32Ty});
+
+  std::array<int64_t, 3> mmaShape{getShapeAttr().getM(), getShapeAttr().getN(),
+                                  getShapeAttr().getK()};
+
+  // These variables define the set of allowed data types for matrices A, B, C,
+  // and result.
+  using AllowedShapes = SmallVector<std::array<int64_t, 3>, 2>;
+  using AllowedTypes = SmallVector<SmallVector<Type, 4>, 2>;
+  AllowedShapes allowedShapes;
+  AllowedTypes expectedA;
+  AllowedTypes expectedB;
+  AllowedTypes expectedC;
+  SmallVector<Type> expectedResult;
+
+  // When M = 16, we just need to calculate the number of 8xk tiles, where
+  // k is a factor that depends on the data type.
+  if (mmaShape[0] == 16) {
+    int64_t kFactor;
+    Type multiplicandFragType;
+    switch (*getMultiplicandAPtxType()) {
+    case MMATypes::tf32:
+      kFactor = 4;
+      multiplicandFragType = i32Ty;
+      expectedResult.push_back(LLVM::LLVMStructType::getLiteral(
+          context, {f32Ty, f32Ty, f32Ty, f32Ty}));
+      break;
+    case MMATypes::f16:
+    case MMATypes::bf16:
+      kFactor = 8;
+      multiplicandFragType = f16x2Ty;
+      expectedResult.push_back(f16x2x2StructTy);
+      expectedResult.push_back(f32x4StructTy);
+      break;
+    case MMATypes::s4:
+    case MMATypes::u4:
+      kFactor = 32;
+      break;
+    case MMATypes::b1:
+      kFactor = 128;
+      break;
+    case MMATypes::s8:
+    case MMATypes::u8:
+      kFactor = 16;
+      break;
+    default:
+      return emitError("invalid shape or multiplicand type: " +
+                       stringifyEnum(getMultiplicandAPtxType().value()));
+    }
+
+    if (isIntegerPtxType(getMultiplicandAPtxType().value())) {
+      expectedResult.push_back(s32x4StructTy);
+      expectedC.emplace_back(4, i32Ty);
+      multiplicandFragType = i32Ty;
+    } else {
+      expectedC.emplace_back(2, f16x2Ty);
+      expectedC.emplace_back(4, f32Ty);
+    }
+
+    int64_t unitA = (mmaShape[0] / 8) * (mmaShape[2] / kFactor);
+    int64_t unitB = (mmaShape[1] / 8) * (mmaShape[2] / kFactor);
+    expectedA.emplace_back(unitA, multiplicandFragType);
+    expectedB.emplace_back(unitB, multiplicandFragType);
+    allowedShapes.push_back({16, 8, kFactor});
+    allowedShapes.push_back({16, 8, kFactor * 2});
+  }
+
+  // In the M=8 case, there is only 1 possible case per data type.
+  if (mmaShape[0] == 8) {
+    if (*getMultiplicandAPtxType() == MMATypes::f16) {
+      expectedA.emplace_back(2, f16x2Ty);
+      expectedB.emplace_back(2, f16x2Ty);
+      expectedResult.push_back(f16x2x4StructTy);
+      expectedResult.push_back(f32x8StructTy);
+      expectedC.emplace_back(4, f16x2Ty);
+      expectedC.emplace_back(8, f32Ty);
+      allowedShapes.push_back({8, 8, 4});
+    }
+    if (*getMultiplicandAPtxType() == MMATypes::f64) {
+      Type f64Ty = Float64Type::get(context);
+      expectedA.emplace_back(1, f64Ty);
+      expectedB.emplace_back(1, f64Ty);
+      expectedC.emplace_back(2, f64Ty);
+      // expectedC.emplace_back(1, LLVM::getFixedVectorType(f64Ty, 2));
+      expectedResult.emplace_back(LLVM::LLVMStructType::getLiteral(
+          context, SmallVector<Type>(2, f64Ty)));
+      allowedShapes.push_back({8, 8, 4});
+    }
+    if (isIntegerPtxType(getMultiplicandAPtxType().value())) {
+      expectedA.push_back({i32Ty});
+      expectedB.push_back({i32Ty});
+      expectedC.push_back({i32Ty, i32Ty});
+      expectedResult.push_back(s32x2StructTy);
+      if (isInt4PtxType(getMultiplicandAPtxType().value()))
+        allowedShapes.push_back({8, 8, 32});
+      if (isInt8PtxType(getMultiplicandAPtxType().value()))
+        allowedShapes.push_back({8, 8, 16});
+      if (getMultiplicandAPtxType().value() == MMATypes::b1)
+        allowedShapes.push_back({8, 8, 128});
+    }
+  }
+
+  std::string errorMessage;
+  llvm::raw_string_ostream errorStream(errorMessage);
+
+  // Check that we matched an existing shape/dtype combination.
+  if (expectedA.empty() || expectedB.empty() || expectedC.empty() ||
+      !llvm::is_contained(allowedShapes, mmaShape)) {
+    errorStream << "unimplemented variant for MMA shape <";
+    llvm::interleaveComma(mmaShape, errorStream);
+    errorStream << ">";
+    return emitOpError(errorMessage);
+  }
+
+  // Verify the operand types for segments of A, B, and C operands.
+  std::array<StringRef, 3> operandNames{"A", "B", "C"};
+  for (const auto &iter : llvm::enumerate(
+           SmallVector<AllowedTypes, 3>{expectedA, expectedB, expectedC})) {
+    auto spec = this->getODSOperandIndexAndLength(iter.index());
+    SmallVector<Type, 4> operandTySeg(operand_type_begin() + spec.first,
+                                      operand_type_begin() + spec.first +
+                                          spec.second);
+    bool match = llvm::is_contained(iter.value(), operandTySeg);
+
+    if (!match) {
+      errorStream << "Could not match types for the "
+                  << operandNames[iter.index()]
+                  << " operands; expected one of ";
+      for (const auto &x : iter.value()) {
+        errorStream << x.size() << "x" << x[0] << " ";
+      }
+      errorStream << "but got ";
+      llvm::interleaveComma(operandTySeg, errorStream);
+      return emitOpError(errorStream.str());
+    }
+  }
+
+  // Check the result type
+  if (!llvm::any_of(expectedResult, [&](Type expectedResultType) {
+        return expectedResultType == getResult().getType();
+      })) {
+    errorStream
+        << "Could not match allowed types for the result; expected one of ";
+    llvm::interleaveComma(expectedResult, errorStream);
+    errorStream << " but got " << getResult().getType();
+    return emitOpError(errorStream.str());
+  }
+
+  // Ensure that binary MMA variants have a b1 MMA operation defined.
+  if (getMultiplicandAPtxType() == MMATypes::b1 && !getB1Op()) {
+    return emitOpError("op requires " + getB1OpAttrName().strref() +
+                       " attribute");
+  }
+
+  // Ensure int4/int8 MMA variants specify the accum overflow behavior
+  // attribute.
+  if (isInt4PtxType(*getMultiplicandAPtxType()) ||
+      isInt8PtxType(*getMultiplicandAPtxType())) {
+    if (!getIntOverflowBehavior())
+      return emitOpError("op requires " +
+                         getIntOverflowBehaviorAttrName().strref() +
+                         " attribute");
+  }
+
+  return success();
+}
+
+// Verifier for the shuffle op. Only when the `return_value_and_is_valid` unit
+// attribute is present does the result type get checked: it must then be an
+// LLVM struct with exactly two elements whose second element is an i1.
+LogicalResult ShflOp::verify() {
+  if (!(*this)->getAttrOfType<UnitAttr>("return_value_and_is_valid"))
+    return success();
+
+  if (auto structTy = llvm::dyn_cast<LLVM::LLVMStructType>(getType())) {
+    auto members = structTy.getBody();
+    if (members.size() == 2) {
+      auto flagTy = llvm::dyn_cast<IntegerType>(members[1]);
+      if (flagTy && flagTy.getWidth() == 1)
+        return success();
+    }
+  }
+  return emitError("expected return type to be a two-element struct with "
+                   "i1 as the second element");
+}
+
+// Maps a WMMA element type and fragment kind to the (element type, element
+// count) pair describing the fragment. `nRow`/`nCol` are only consulted for
+// the s8/u8 types, where the count depends on the fragment's parallel
+// dimension. Asserts if no mapping was found.
+std::pair<mlir::Type, unsigned> NVVM::inferMMAType(NVVM::MMATypes type,
+                                                   NVVM::MMAFrag frag, int nRow,
+                                                   int nCol,
+                                                   MLIRContext *context) {
+  OpBuilder builder(context);
+  Type f16x2 = VectorType::get(2, builder.getF16Type());
+  Type elementType;
+  unsigned numberElements = 0;
+  const bool isMultiplicand =
+      frag == NVVM::MMAFrag::a || frag == NVVM::MMAFrag::b;
+
+  switch (type) {
+  case NVVM::MMATypes::f16:
+    elementType = f16x2;
+    numberElements = isMultiplicand ? 8 : 4;
+    break;
+  case NVVM::MMATypes::f32:
+    elementType = builder.getF32Type();
+    numberElements = 8;
+    break;
+  case NVVM::MMATypes::tf32:
+    elementType = builder.getI32Type();
+    numberElements = 4;
+    break;
+  case NVVM::MMATypes::s8:
+  case NVVM::MMATypes::u8: {
+    elementType = builder.getI32Type();
+    // A fragments are sized by their row count, B fragments by their column
+    // count; any other fragment leaves the size at 0 (no match).
+    int parallelSize = 0;
+    if (frag == NVVM::MMAFrag::a)
+      parallelSize = nRow;
+    else if (frag == NVVM::MMAFrag::b)
+      parallelSize = nCol;
+
+    switch (parallelSize) {
+    case 16: // m == 16 && n == 16 && k == 16
+      numberElements = 2;
+      break;
+    case 8: // m == 8 && n == 32 && k == 16 or m == 32 && n == 8 && k == 16
+      numberElements = 1;
+      break;
+    case 32:
+      numberElements = 4;
+      break;
+    default:
+      break;
+    }
+    break;
+  }
+  case NVVM::MMATypes::s32:
+    elementType = builder.getI32Type();
+    numberElements = 8;
+    break;
+  default:
+    break;
+  }
+
+  assert(numberElements != 0 && elementType != nullptr);
+  return std::make_pair(elementType, numberElements);
+}
+
+// Derives the row/column extent of the requested fragment from the MxNxK
+// shape (A is m x k, B is k x n, anything else is m x n) and forwards to
+// inferMMAType.
+static std::pair<mlir::Type, unsigned>
+inferMMATypeFromMNK(NVVM::MMATypes type, NVVM::MMAFrag frag, int m, int n,
+                    int k, MLIRContext *context) {
+  const bool isFragA = frag == NVVM::MMAFrag::a;
+  const bool isFragB = frag == NVVM::MMAFrag::b;
+  const int nRow = isFragB ? k : m;
+  const int nCol = isFragA ? k : n;
+  assert(nRow && nCol);
+  return inferMMAType(type, frag, nRow, nCol, context);
+}
+
+// Verifies a WMMA load: the pointer must live in address space 0, global or
+// shared; the attribute combination must map to an intrinsic; and the result
+// must be the struct type inferred for the fragment.
+LogicalResult NVVM::WMMALoadOp::verify() {
+  auto ptrTy = llvm::cast<LLVM::LLVMPointerType>(getPtr().getType());
+  const unsigned space = ptrTy.getAddressSpace();
+  const bool spaceOk = space == 0 || space == NVVM::kGlobalMemorySpace ||
+                       space == NVVM::kSharedMemorySpace;
+  if (!spaceOk)
+    return emitOpError("expected source pointer in memory "
+                       "space 0, 1, 3");
+
+  const auto intrinsic = NVVM::WMMALoadOp::getIntrinsicID(
+      getM(), getN(), getK(), getLayout(), getEltype(), getFrag());
+  if (intrinsic == 0)
+    return emitOpError() << "invalid attribute combination";
+
+  std::pair<Type, unsigned> fragInfo = inferMMATypeFromMNK(
+      getEltype(), getFrag(), getM(), getN(), getK(), getContext());
+  Type elemTy = fragInfo.first;
+  unsigned elemCount = fragInfo.second;
+  Type expectedTy = LLVM::LLVMStructType::getLiteral(
+      getContext(), SmallVector<Type, 8>(elemCount, elemTy));
+  if (getType() == expectedTy)
+    return success();
+  return emitOpError("expected destination type is a structure of ")
+         << elemCount << " elements of type " << elemTy;
+}
+
+// Verifies a WMMA store: the pointer must live in address space 0, global or
+// shared; the attribute combination must map to an intrinsic; and the data
+// operands must match the count and type inferred for the C fragment.
+LogicalResult NVVM::WMMAStoreOp::verify() {
+  auto ptrTy = llvm::cast<LLVM::LLVMPointerType>(getPtr().getType());
+  const unsigned space = ptrTy.getAddressSpace();
+  const bool spaceOk = space == 0 || space == NVVM::kGlobalMemorySpace ||
+                       space == NVVM::kSharedMemorySpace;
+  if (!spaceOk)
+    return emitOpError("expected operands to be a source pointer in memory "
+                       "space 0, 1, 3");
+
+  const auto intrinsic = NVVM::WMMAStoreOp::getIntrinsicID(
+      getM(), getN(), getK(), getLayout(), getEltype());
+  if (intrinsic == 0)
+    return emitOpError() << "invalid attribute combination";
+
+  std::pair<Type, unsigned> fragInfo = inferMMATypeFromMNK(
+      getEltype(), NVVM::MMAFrag::c, getM(), getN(), getK(), getContext());
+  Type elemTy = fragInfo.first;
+  unsigned elemCount = fragInfo.second;
+  if (getArgs().size() != elemCount)
+    return emitOpError() << "expected " << elemCount << " data operands";
+
+  for (Value dataOperand : getArgs()) {
+    if (dataOperand.getType() != elemTy)
+      return emitOpError() << "expected data operands of type " << elemTy;
+  }
+  return success();
+}
+
+// Verifies a WMMA mma: the attribute combination must map to an intrinsic,
+// the operand list must be the A, B and C fragments (in that order) as
+// inferred from the element types and shape, and the result must be a struct
+// shaped like the C fragment.
+LogicalResult NVVM::WMMAMmaOp::verify() {
+  const auto intrinsic = NVVM::WMMAMmaOp::getIntrinsicID(
+      getM(), getN(), getK(), getLayoutA(), getLayoutB(), getEltypeA(),
+      getEltypeB());
+  if (intrinsic == 0)
+    return emitOpError() << "invalid attribute combination";
+
+  // A and B fragments are both inferred from `eltypeA`; the C fragment is
+  // inferred from `eltypeB`.
+  std::pair<Type, unsigned> fragA = inferMMATypeFromMNK(
+      getEltypeA(), NVVM::MMAFrag::a, getM(), getN(), getK(), getContext());
+  std::pair<Type, unsigned> fragB = inferMMATypeFromMNK(
+      getEltypeA(), NVVM::MMAFrag::b, getM(), getN(), getK(), getContext());
+  std::pair<Type, unsigned> fragC = inferMMATypeFromMNK(
+      getEltypeB(), NVVM::MMAFrag::c, getM(), getN(), getK(), getContext());
+
+  SmallVector<Type, 32> expectedTypes;
+  expectedTypes.append(fragA.second, fragA.first);
+  expectedTypes.append(fragB.second, fragB.first);
+  expectedTypes.append(fragC.second, fragC.first);
+
+  const unsigned numExpected = expectedTypes.size();
+  if (getArgs().size() != numExpected)
+    return emitOpError() << "expected " << numExpected << " arguments";
+
+  for (unsigned idx = 0; idx != numExpected; ++idx) {
+    Type wanted = expectedTypes[idx];
+    if (getArgs()[idx].getType() == wanted)
+      continue;
+    return emitOpError() << "expected argument " << idx << " to be of type "
+                         << wanted;
+  }
+
+  Type expectedResultTy = LLVM::LLVMStructType::getLiteral(
+      getContext(), SmallVector<Type, 8>(fragC.second, fragC.first));
+  if (getType() == expectedResultTy)
+    return success();
+  return emitOpError("expected destination type is a structure of ")
+         << fragC.second << " elements of type " << fragC.first;
+}
+
+// Verifies ldmatrix: the source pointer must be in shared memory, `num` must
+// be 1, 2 or 4, and the result must be an i32 (num == 1) or a struct of `num`
+// i32 values (num == 2 or 4).
+LogicalResult NVVM::LdMatrixOp::verify() {
+  auto ptrTy = llvm::cast<LLVM::LLVMPointerType>(getPtr().getType());
+  if (ptrTy.getAddressSpace() != NVVM::kSharedMemorySpace)
+    return emitOpError("expected source pointer in memory space 3");
+
+  const auto num = getNum();
+  const bool numOk = num == 1 || num == 2 || num == 4;
+  if (!numOk)
+    return emitOpError("expected num attribute to be 1, 2 or 4");
+
+  Type i32 = IntegerType::get(getContext(), 32);
+  if (num == 1) {
+    if (getType() != i32)
+      return emitOpError("expected destination type is i32");
+    return success();
+  }
+
+  // num is 2 or 4 here.
+  Type expectedTy = LLVM::LLVMStructType::getLiteral(
+      getContext(), SmallVector<Type>(num, i32));
+  if (getType() != expectedTy)
+    return emitOpError("expected destination type is a structure of ")
+           << num << " elements of type i32";
+  return success();
+}
+
+// Verifies stmatrix: the pointer must be in shared memory and the op must
+// carry 1, 2 or 4 source values.
+LogicalResult NVVM::StMatrixOp::verify() {
+  auto ptrTy = llvm::cast<LLVM::LLVMPointerType>(getPtr().getType());
+  if (ptrTy.getAddressSpace() != NVVM::kSharedMemorySpace)
+    return emitOpError("expected source pointer in memory space 3");
+
+  const int numMatrix = getSources().size();
+  switch (numMatrix) {
+  case 1:
+  case 2:
+  case 4:
+    return success();
+  default:
+    return emitOpError("expected num attribute to be 1, 2 or 4");
+  }
+}
+
+// Returns the only K extent accepted for the given WGMMA A-operand type, or
+// failure() when the type has no entry.
+FailureOr<int> getAllowedSizeK(NVVM::WGMMATypes typeA) {
+  switch (typeA) {
+  case NVVM::WGMMATypes::tf32:
+    return 8;
+  case NVVM::WGMMATypes::f16:
+  case NVVM::WGMMATypes::bf16:
+    return 16;
+  case NVVM::WGMMATypes::s8:
+  case NVVM::WGMMATypes::u8:
+  case NVVM::WGMMATypes::e4m3:
+  case NVVM::WGMMATypes::e5m2:
+    return 32;
+  case NVVM::WGMMATypes::b1:
+    return 256;
+  default:
+    return failure();
+  }
+}
+
+// Returns success() when `typeD += typeA * typeB` is one of the WGMMA type
+// combinations accepted by this dialect, and failure() otherwise.
+//
+// f32 and s32 are accumulator-only types. They can still appear as `typeA`
+// in user-written IR, so they are rejected with failure() rather than
+// llvm_unreachable: this lets the calling verifier emit a diagnostic instead
+// of aborting (or invoking undefined behaviour in release builds).
+LogicalResult isAllowedWGMMADataType(NVVM::WGMMATypes typeD,
+                                     NVVM::WGMMATypes typeA,
+                                     NVVM::WGMMATypes typeB) {
+  switch (typeA) {
+  case NVVM::WGMMATypes::f16:
+    if ((typeD == NVVM::WGMMATypes::f32 || typeD == NVVM::WGMMATypes::f16) &&
+        typeB == NVVM::WGMMATypes::f16)
+      return success();
+    break;
+  case NVVM::WGMMATypes::tf32:
+    if (typeD == NVVM::WGMMATypes::f32 && typeB == NVVM::WGMMATypes::tf32)
+      return success();
+    break;
+  case NVVM::WGMMATypes::u8:
+  case NVVM::WGMMATypes::s8:
+    // Signedness of A and B may be mixed freely.
+    if (typeD == NVVM::WGMMATypes::s32 &&
+        (typeB == NVVM::WGMMATypes::u8 || typeB == NVVM::WGMMATypes::s8))
+      return success();
+    break;
+  case NVVM::WGMMATypes::b1:
+    if (typeD == NVVM::WGMMATypes::s32 && typeB == NVVM::WGMMATypes::b1)
+      return success();
+    break;
+  case NVVM::WGMMATypes::bf16:
+    if ((typeD == NVVM::WGMMATypes::f32 || typeD == NVVM::WGMMATypes::f16) &&
+        typeB == NVVM::WGMMATypes::bf16)
+      return success();
+    break;
+  case NVVM::WGMMATypes::e4m3:
+  case NVVM::WGMMATypes::e5m2:
+    // The two 8-bit float formats may be mixed freely between A and B.
+    if ((typeD == NVVM::WGMMATypes::f32 || typeD == NVVM::WGMMATypes::f16) &&
+        (typeB == NVVM::WGMMATypes::e5m2 || typeB == NVVM::WGMMATypes::e4m3))
+      return success();
+    break;
+  case WGMMATypes::f32:
+  case WGMMATypes::s32:
+    // Not valid multiplicand types; fall through to failure().
+    break;
+  }
+  return failure();
+}
+
+// Returns success() when `sizeN` is an accepted N extent for a WGMMA whose A
+// operand has type `typeA`.
+//
+//  * f16 / bf16 / tf32 / e4m3 / e5m2: every multiple of 8 in [8, 256].
+//  * u8 / s8 / b1: 8, 16, 24, 32, then every multiple of 16 in [48, 256].
+//
+// The sets are checked arithmetically so no lookup table has to be built on
+// each call. f32 and s32 are not multiplicand types and yield failure().
+LogicalResult isAllowedSizeN(int sizeN, NVVM::WGMMATypes typeA) {
+  const bool isMultipleOf8InRange =
+      sizeN >= 8 && sizeN <= 256 && sizeN % 8 == 0;
+  switch (typeA) {
+  case WGMMATypes::f16:
+  case WGMMATypes::tf32:
+  case WGMMATypes::bf16:
+  case WGMMATypes::e4m3:
+  case WGMMATypes::e5m2:
+    if (isMultipleOf8InRange)
+      return success();
+    break;
+  case WGMMATypes::u8:
+  case WGMMATypes::s8:
+  case WGMMATypes::b1:
+    // Above 32 only multiples of 16 are accepted.
+    if (isMultipleOf8InRange && (sizeN <= 32 || sizeN % 16 == 0))
+      return success();
+    break;
+  case WGMMATypes::f32:
+  case WGMMATypes::s32:
+    // Not valid multiplicand types; fall through to failure().
+    break;
+  }
+  return failure();
+}
+
+// Verifies nvvm.wgmma.mma_async. Checks, in order:
+//  1. the result is an LLVM struct whose elements all share one type;
+//  2. the accumulator type is f32, f16 or s32, and s32 is not combined with
+//     a negating input scale;
+//  3. the D/A/B type combination is supported;
+//  4. the shape: M == 64, K fixed by the A type, N from the allowed set;
+//  5. the layouts: A=row / B=col is the non-transposed form, and any other
+//     layout needs a transpose, which only f16/bf16 support;
+//  6. the number of result struct elements matches the shape and D type;
+//  7. `satfinite` is only used with an s32 accumulator.
+LogicalResult NVVM::WgmmaMmaAsyncOp::verify() {
+  Value outValue = getResults();
+  auto stype = dyn_cast<LLVM::LLVMStructType>(outValue.getType());
+  if (!stype)
+    return emitOpError() << "expected results to be struct";
+  int outputSize = stype.getBody().size();
+  WGMMATypes typeD = getTypeD();
+  WGMMATypes typeA = getTypeA();
+  WGMMATypes typeB = getTypeB();
+
+  for (Type t : stype.getBody()) {
+    if (t != stype.getBody().front())
+      return emitOpError()
+             << "all elements in struct must be same type but there is " << t;
+  }
+
+  if (typeD != WGMMATypes::f32 && typeD != WGMMATypes::f16 &&
+      typeD != WGMMATypes::s32) {
+    return emitOpError() << "does not support the given output type "
+                         << NVVM::stringifyWGMMATypes(typeD);
+  }
+  if (typeD == WGMMATypes::s32 &&
+      (getScaleA() == WGMMAScaleIn::neg || getScaleB() == WGMMAScaleIn::neg)) {
+    return emitOpError() << "has s32 output, scaleA and scaleB cannot be neg";
+  }
+
+  if (failed(isAllowedWGMMADataType(typeD, typeA, typeB))) {
+    return emitOpError() << NVVM::stringifyWGMMATypes(typeD)
+                         << " += " << NVVM::stringifyWGMMATypes(typeA) << " * "
+                         << NVVM::stringifyWGMMATypes(typeB)
+                         << ", it is not supported.";
+  }
+
+  // Check M
+  if (getShape().getM() != 64)
+    return emitOpError() << "shape 'm' must be 64";
+
+  // Check K. The failure case is reported separately so that value() is never
+  // read from a failed FailureOr.
+  FailureOr<int> allowedK = getAllowedSizeK(typeA);
+  if (failed(allowedK))
+    return emitOpError() << "does not support the given input type "
+                         << NVVM::stringifyWGMMATypes(typeA);
+  if (allowedK.value() != getShape().getK())
+    return emitOpError() << "shape 'k' must be " << allowedK.value()
+                         << " for input type "
+                         << NVVM::stringifyWGMMATypes(typeA);
+
+  // Check N
+  if (failed(isAllowedSizeN(getShape().getN(), typeA))) {
+    return emitOpError() << "has input type "
+                         << NVVM::stringifyWGMMATypes(typeA) << " n is set to "
+                         << getShape().getN() << ", it is not supported.";
+  }
+
+  // Check transpose (only available for f16/bf16). WGMMA expects A in
+  // row-major and B in column-major layout; the transposed forms are
+  // A=col and B=row, so those are what other types must reject.
+  if ((typeA != WGMMATypes::f16 && typeA != WGMMATypes::bf16) &&
+      (getLayoutA() == mlir::NVVM::MMALayout::col ||
+       getLayoutB() == mlir::NVVM::MMALayout::row)) {
+    return emitOpError()
+           << "given layouts layout_a = " << stringifyMMALayout(getLayoutA())
+           << " and layout_b = " << stringifyMMALayout(getLayoutB())
+           << " for input types " << stringifyWGMMATypes(typeA) << " and "
+           << stringifyWGMMATypes(typeB)
+           << " requires transpose. However, this is only supported for: "
+           << stringifyMMATypes(MMATypes::f16) << " and "
+           << stringifyMMATypes(MMATypes::bf16);
+  }
+
+  // Check result registers: N/2 struct elements for f32/s32 accumulators,
+  // N/4 for f16.
+  int expectedOutput = 0;
+  if (typeD == WGMMATypes::f32 || typeD == WGMMATypes::s32)
+    expectedOutput = getShape().getN() / 2;
+  if (typeD == WGMMATypes::f16)
+    expectedOutput = getShape().getN() / 4;
+  if (outputSize != expectedOutput) {
+    return emitOpError() << "results " << expectedOutput
+                         << ", however output struct has " << outputSize
+                         << " elements";
+  }
+  // Check satfinite (only available for s32 accumulator)
+  if (typeD != WGMMATypes::s32 &&
+      getSatfinite().value_or(NVVM::MMAIntOverflow::wrapped) ==
+          NVVM::MMAIntOverflow::satfinite) {
+    return emitOpError()
+           << " `satfinite` can be only used with s32 accumulator, however "
+              "the current accumulator is "
+           << NVVM::stringifyWGMMATypes(typeD);
+  }
+
+  return success();
+}
+
+// Builds the inline-PTX text for this op: a predicate derived from one of the
+// `$N` placeholders, followed by a single wgmma.mma_async instruction whose
+// operands are `$N` placeholders. Placeholder indices past the result
+// registers are offset by twice the result-register count, matching the
+// read/write register mapping noted in the original code.
+std::string NVVM::WgmmaMmaAsyncOp::getPtx() {
+  const int m = getShape().getM();
+  const int n = getShape().getN();
+  const int k = getShape().getK();
+  const WGMMATypes typeD = getTypeD();
+  const WGMMATypes typeA = getTypeA();
+  const bool hasTransposeOperands =
+      typeA == WGMMATypes::f16 || typeA == WGMMATypes::bf16;
+  const bool hasSatfinite =
+      getSatfinite().value_or(NVVM::MMAIntOverflow::wrapped) ==
+      NVVM::MMAIntOverflow::satfinite;
+
+  // An f16 accumulator uses N/4 result registers, every other type N/2.
+  const int numResultRegs = n / (typeD == WGMMATypes::f16 ? 4 : 2);
+  // Index of the first placeholder after the result and in/out registers.
+  const int firstInput = numResultRegs * 2;
+
+  std::string ptx;
+  llvm::raw_string_ostream os(ptx);
+
+  os << "{\n"
+     << ".reg .pred p;\n"
+     << "setp.ne.b32 p, $" << (firstInput + 2) << ", 0;\n"
+     << "wgmma.mma_async.sync.aligned.m" << m << "n" << n << "k" << k << "."
+     << stringifyWGMMATypes(typeD) << "." << stringifyWGMMATypes(typeA) << "."
+     << stringifyWGMMATypes(getTypeB());
+  if (hasSatfinite)
+    os << ".satfinite";
+
+  // Result register list: "{$0, $1, ...},"
+  os << " {";
+  for (int reg = 0; reg < numResultRegs; ++reg)
+    os << (reg == 0 ? "" : ", ") << "$" << reg;
+  os << "},";
+
+  // Two descriptor placeholders and the predicate.
+  os << " $" << firstInput << ", $" << (firstInput + 1) << ", p";
+  // Input scale operands are omitted for s32 accumulators.
+  if (typeD != WGMMATypes::s32)
+    os << ", $" << (firstInput + 3) << ",  $" << (firstInput + 4);
+  // Don't add transpose parameters unless needed.
+  if (hasTransposeOperands)
+    os << ", $" << (firstInput + 5) << ",  $" << (firstInput + 6);
+  os << ";\n}\n";
+  os.flush();
+  return ptx;
+}
+
+// Appends the values bound to the inline-asm placeholders, in order: results
+// (write), in/outs (read-write), the two descriptors, the scale-D constant,
+// then, unless the accumulator is s32, the A/B scale constants, and, for
+// f16/bf16 inputs only, the two layout-derived constants.
+void NVVM::WgmmaMmaAsyncOp::getAsmValues(
+    RewriterBase &rewriter,
+    llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>>
+        &asmValues) {
+  const WGMMATypes typeA = getTypeA();
+  const bool hasTransposeOperands =
+      typeA == WGMMATypes::f16 || typeA == WGMMATypes::bf16;
+
+  // Materializes an i32 constant and registers it as a read-only operand.
+  auto addReadConstant = [&](int value) {
+    asmValues.push_back({makeConstantI32(rewriter, value),
+                         mlir::NVVM::PTXRegisterMod::Read});
+  };
+  // Encodes an input scale as -1 (negate) or 1.
+  auto scaleToInt = [](NVVM::WGMMAScaleIn scale) {
+    return scale == NVVM::WGMMAScaleIn::neg ? -1 : 1;
+  };
+
+  if (getResults())
+    asmValues.push_back({getResults(), mlir::NVVM::PTXRegisterMod::Write});
+  if (getInouts())
+    asmValues.push_back({getInouts(), mlir::NVVM::PTXRegisterMod::ReadWrite});
+  asmValues.push_back({getDescriptorA(), mlir::NVVM::PTXRegisterMod::Read});
+  asmValues.push_back({getDescriptorB(), mlir::NVVM::PTXRegisterMod::Read});
+  addReadConstant(static_cast<int>(getScaleD()));
+
+  if (getTypeD() != WGMMATypes::s32) {
+    addReadConstant(scaleToInt(getScaleA()));
+    addReadConstant(scaleToInt(getScaleB()));
+  }
+  if (hasTransposeOperands) {
+    // A passes its layout value through unchanged; B passes the inverse.
+    addReadConstant(static_cast<int>(getLayoutA()));
+    addReadConstant(1 - static_cast<int>(getLayoutB()));
+  }
+}
+// Verifies that the `space` attribute is present exactly when the fence kind
+// is async_shared.
+LogicalResult NVVM::FenceProxyOp::verify() {
+  const bool isAsyncShared = getKind() == NVVM::ProxyKind::async_shared;
+  const bool hasSpace = getSpace().has_value();
+  if (isAsyncShared && !hasSpace)
+    return emitOpError() << "async_shared fence requires space attribute";
+  if (!isAsyncShared && hasSpace)
+    return emitOpError() << "only async_shared fence can have space attribute";
+  return success();
+}
+
+// Verifies that the requested register count is a multiple of 8 within
+// [24, 256].
+LogicalResult NVVM::SetMaxRegisterOp::verify() {
+  const auto regCount = getRegCount();
+  if (regCount % 8 != 0)
+    return emitOpError("new register size must be multiple of 8");
+  const bool inRange = regCount >= 24 && regCount <= 256;
+  if (!inRange)
+    return emitOpError("new register size must be in between 24 to 256");
+  return success();
+}
+
+// Verifies that a thread count is only given together with a barrier id.
+LogicalResult NVVM::BarrierOp::verify() {
+  const bool hasThreadCount = static_cast<bool>(getNumberOfThreads());
+  const bool hasBarrierId = static_cast<bool>(getBarrierId());
+  if (!hasThreadCount || hasBarrierId)
+    return success();
+  return emitOpError(
+      "barrier id is missing, it should be set between 0 to 15");
+}
+
+//===----------------------------------------------------------------------===//
+// NVVMDialect initialization, type parsing, and registration.
+//===----------------------------------------------------------------------===//
+
+// TODO: This should be the llvm.nvvm dialect once this is supported.
+void NVVMDialect::initialize() { // Registers ops and attributes, then declares promised interfaces.
+  addOperations< // Template arguments come from the GET_OP_LIST section of NVVMOps.cpp.inc.
+#define GET_OP_LIST
+#include "mlir/Dialect/LLVMIR/NVVMOps.cpp.inc"
+      >();
+  addAttributes< // Template arguments come from the GET_ATTRDEF_LIST section of NVVMOpsAttributes.cpp.inc.
+#define GET_ATTRDEF_LIST
+#include "mlir/Dialect/LLVMIR/NVVMOpsAttributes.cpp.inc"
+      >();
+
+  // Support unknown operations because not all NVVM operations are
+  // registered.
+  allowUnknownOperations();
+  declarePromisedInterface<ConvertToLLVMPatternInterface, NVVMDialect>(); // Promised for the dialect itself.
+  declarePromisedInterface<gpu::TargetAttrInterface, NVVMTargetAttr>(); // Promised for NVVMTargetAttr.
+}
+
+// Verifies NVVM dialect attributes attached to an arbitrary operation:
+//  * the kernel attribute may only sit on an LLVM function;
+//  * maxntid / reqntid must be a dense i32 array with 1 to 3 entries;
+//  * minctasm / maxnreg must be an integer attribute.
+// Any other attribute name is accepted unchanged.
+LogicalResult NVVMDialect::verifyOperationAttribute(Operation *op,
+                                                    NamedAttribute attr) {
+  StringAttr attrName = attr.getName();
+
+  // Kernel function attribute should be attached to functions.
+  const bool isKernelAttr = attrName == NVVMDialect::getKernelFuncAttrName();
+  if (isKernelAttr && !isa<LLVM::LLVMFuncOp>(op)) {
+    return op->emitError() << "'" << NVVMDialect::getKernelFuncAttrName()
+                           << "' attribute attached to unexpected op";
+  }
+
+  // If maxntid and reqntid exist, it must be an array with max 3 dim
+  const bool isThreadDimAttr = attrName == NVVMDialect::getMaxntidAttrName() ||
+                               attrName == NVVMDialect::getReqntidAttrName();
+  if (isThreadDimAttr) {
+    auto dims = llvm::dyn_cast<DenseI32ArrayAttr>(attr.getValue());
+    const bool dimsOk = dims && !dims.empty() && dims.size() <= 3;
+    if (!dimsOk)
+      return op->emitError()
+             << "'" << attrName
+             << "' attribute must be integer array with maximum 3 index";
+  }
+
+  // If minctasm and maxnreg exist, it must be an integer attribute
+  const bool isIntegerLimitAttr =
+      attrName == NVVMDialect::getMinctasmAttrName() ||
+      attrName == NVVMDialect::getMaxnregAttrName();
+  if (isIntegerLimitAttr) {
+    auto intAttr = llvm::dyn_cast<IntegerAttr>(attr.getValue());
+    if (!intAttr)
+      return op->emitError()
+             << "'" << attrName << "' attribute must be integer constant";
+  }
+
+  return success();
+}
+
+// Verifies NVVM dialect attributes attached to a region argument. Only the
+// grid-constant attribute on function-like ops is checked: the function must
+// carry the kernel attribute, the value must be a unit attribute, and the
+// same argument must also carry the LLVM byval attribute.
+LogicalResult NVVMDialect::verifyRegionArgAttribute(Operation *op,
+                                                    unsigned regionIndex,
+                                                    unsigned argIndex,
+                                                    NamedAttribute argAttr) {
+  auto funcOp = dyn_cast<FunctionOpInterface>(op);
+  if (!funcOp)
+    return success();
+
+  StringAttr attrName = argAttr.getName();
+  const bool isGridConstant =
+      attrName == NVVM::NVVMDialect::getGridConstantAttrName();
+  if (!isGridConstant)
+    return success();
+
+  if (!op->hasAttr(NVVMDialect::getKernelFuncAttrName())) {
+    return op->emitError()
+           << "'" << attrName
+           << "' attribute must be present only on kernel arguments";
+  }
+  if (!isa<UnitAttr>(argAttr.getValue()))
+    return op->emitError() << "'" << attrName << "' must be a unit attribute";
+
+  const bool hasByVal = static_cast<bool>(
+      funcOp.getArgAttr(argIndex, LLVM::LLVMDialect::getByValAttrName()));
+  if (!hasByVal) {
+    return op->emitError()
+           << "'" << attrName
+           << "' attribute requires the argument to also have attribute '"
+           << LLVM::LLVMDialect::getByValAttrName() << "'";
+  }
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM target attribute.
+//===----------------------------------------------------------------------===//
+// Verifies the target attribute parameters: the optimization level must be
+// within [0, 3], the triple and chip must be non-empty, and every entry of
+// the optional `files` array must be a non-null string attribute. `features`
+// and `flags` are not checked.
+LogicalResult
+NVVMTargetAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                       int optLevel, StringRef triple, StringRef chip,
+                       StringRef features, DictionaryAttr flags,
+                       ArrayAttr files) {
+  const bool optLevelOk = optLevel >= 0 && optLevel <= 3;
+  if (!optLevelOk) {
+    emitError() << "The optimization level must be a number between 0 and 3.";
+    return failure();
+  }
+  if (triple.empty()) {
+    emitError() << "The target triple cannot be empty.";
+    return failure();
+  }
+  if (chip.empty()) {
+    emitError() << "The target chip cannot be empty.";
+    return failure();
+  }
+  if (files) {
+    for (::mlir::Attribute entry : files) {
+      if (entry && mlir::isa<StringAttr>(entry))
+        continue;
+      emitError() << "All the elements in the `link` array must be strings.";
+      return failure();
+    }
+  }
+  return success();
+}
+
+#define GET_OP_CLASSES
+#include "mlir/Dialect/LLVMIR/NVVMOps.cpp.inc"
+
+#define GET_ATTRDEF_CLASSES
+#include "mlir/Dialect/LLVMIR/NVVMOpsAttributes.cpp.inc"

>From 740ba25c18140a87f5311725daf9ae74663dc067 Mon Sep 17 00:00:00 2001
From: bangyu shen <94283495+shubaoyu2 at users.noreply.github.com>
Date: Wed, 3 Jul 2024 17:08:00 +0800
Subject: [PATCH 4/7] change check cases when ab cannot be transposed in wgmma

---
 mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 036a9a15af838..48f44165ccc58 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -880,7 +880,7 @@ LogicalResult NVVM::WgmmaMmaAsyncOp::verify() {
   // Check transpose (only available for f16/bf16)
   if ((typeA != WGMMATypes::f16 && typeA != WGMMATypes::bf16) &&
       (getLayoutA() == mlir::NVVM::MMALayout::col ||
-       getLayoutB() == mlir::NVVM::MMALayout::col)) {
+       getLayoutB() == mlir::NVVM::MMALayout::row)) {
     return emitOpError()
            << "given layouts layout_a = " << stringifyMMALayout(getLayoutA())
            << " and layout_b = " << stringifyMMALayout(getLayoutB())

>From b4a2937afd11ee53332f36febed83bc1306c58dc Mon Sep 17 00:00:00 2001
From: bangyu shen <94283495+shubaoyu2 at users.noreply.github.com>
Date: Wed, 3 Jul 2024 17:10:39 +0800
Subject: [PATCH 5/7] change some test cases to match the previous check change

---
 .../Conversion/NVVMToLLVM/nvvm-to-llvm.mlir   | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index 21947c242461e..375e2951a037c 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -397,19 +397,19 @@ func.func @wgmma_s32_s8_s8_satfinite(%descA : i64, %descB : i64) -> !mat16i32{
       #nvvm.shape<m = 64, n = 8, k = 32>, 
       D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>],
       A [<s8>, #nvvm.wgmma_scale_in<one>, <row>], 
-      B [<s8>, #nvvm.wgmma_scale_in<one>, <row>]
+      B [<s8>, #nvvm.wgmma_scale_in<one>, <col>]
       : !mat16i32 -> !mat16i32
   %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1, 
       #nvvm.shape<m = 64, n = 8, k = 32>, 
       D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>],
       A [<s8>, #nvvm.wgmma_scale_in<one>, <row>], 
-      B [<s8>, #nvvm.wgmma_scale_in<one>, <row>]
+      B [<s8>, #nvvm.wgmma_scale_in<one>, <col>]
       : !mat16i32 -> !mat16i32
   %result3 = nvvm.wgmma.mma_async %descA, %descB, %result2, 
       #nvvm.shape<m = 64, n = 8, k = 32>, 
       D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>],
       A [<s8>, #nvvm.wgmma_scale_in<one>, <row>], 
-      B [<s8>, #nvvm.wgmma_scale_in<one>, <row>]
+      B [<s8>, #nvvm.wgmma_scale_in<one>, <col>]
       : !mat16i32 -> !mat16i32
   return %result3 : !mat16i32
 }
@@ -458,19 +458,19 @@ func.func @wgmma_s32_u8_u8(%descA : i64, %descB : i64) -> !mat16i32 {
       #nvvm.shape<m = 64, n = 8, k = 32>, 
       D [<s32>, #nvvm.wgmma_scale_out<one>],
       A [<u8>, #nvvm.wgmma_scale_in<one>, <row>], 
-      B [<u8>, #nvvm.wgmma_scale_in<one>, <row>]
+      B [<u8>, #nvvm.wgmma_scale_in<one>, <col>]
       : !mat16i32 -> !mat16i32
   %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
       #nvvm.shape<m = 64, n = 8, k = 32>, 
       D [<s32>, #nvvm.wgmma_scale_out<one>],
       A [<u8>, #nvvm.wgmma_scale_in<one>, <row>], 
-      B [<u8>, #nvvm.wgmma_scale_in<one>, <row>]
+      B [<u8>, #nvvm.wgmma_scale_in<one>, <col>]
       : !mat16i32 -> !mat16i32
   %result3 = nvvm.wgmma.mma_async %descA, %descB, %result2,
       #nvvm.shape<m = 64, n = 8, k = 32>, 
       D [<s32>, #nvvm.wgmma_scale_out<one>],
       A [<u8>, #nvvm.wgmma_scale_in<one>, <row>], 
-      B [<u8>, #nvvm.wgmma_scale_in<one>, <row>]
+      B [<u8>, #nvvm.wgmma_scale_in<one>, <col>]
       : !mat16i32 -> !mat16i32
   return %result3 : !mat16i32
 }
@@ -500,13 +500,13 @@ func.func @wgmma_f32_tf32_tf32(%descA : i64, %descB : i64) -> !mat32f32 {
       #nvvm.shape<m = 64, n = 64, k = 8>, 
       D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
       A [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
-      B [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>]
+      B [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
        : !mat32f32 -> !mat32f32
   %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
       #nvvm.shape<m = 64, n = 64, k = 8>, 
       D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
       A [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
-      B [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>]
+      B [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
       : !mat32f32 -> !mat32f32
   return %result2 : !mat32f32
 }
@@ -533,13 +533,13 @@ func.func @wgmma_f32_e4m3_e4m3(%descA : i64, %descB : i64) -> !mat32f32 {
       #nvvm.shape<m = 64, n = 64, k = 32>, 
       D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
       A [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
-      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>]
+      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
        : !mat32f32 -> !mat32f32
   %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
       #nvvm.shape<m = 64, n = 64, k = 32>, 
       D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
       A [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
-      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>]
+      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
       : !mat32f32 -> !mat32f32
   return %result2 : !mat32f32
 }
@@ -565,13 +565,13 @@ func.func @wgmma_f32_e5m2_e4m3(%descA : i64, %descB : i64) -> !mat32f32 {
       #nvvm.shape<m = 64, n = 64, k = 32>, 
       D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
       A [#nvvm.wgmma_type<e5m2>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
-      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>]
+      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
        : !mat32f32 -> !mat32f32
   %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
       #nvvm.shape<m = 64, n = 64, k = 32>, 
       D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
       A [#nvvm.wgmma_type<e5m2>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 
-      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>]
+      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
       : !mat32f32 -> !mat32f32
   return %result2 : !mat32f32
 }

>From f6404c653d97863b973775460334a7f9e2b651ca Mon Sep 17 00:00:00 2001
From: bangyu shen <94283495+shubaoyu2 at users.noreply.github.com>
Date: Wed, 3 Jul 2024 21:16:37 +0800
Subject: [PATCH 6/7] add comment

---
 mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 48f44165ccc58..ceeb1168eb13e 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -878,6 +878,9 @@ LogicalResult NVVM::WgmmaMmaAsyncOp::verify() {
   }
 
   // Check transpose (only available for f16/bf16)
+  // Matrix A should be stored in row-major and B in column-major.
+  // Only f16/bf16 matrices can be stored in either column-major or row-major 
+  // by setting the transpose value(imm-trans-a,imm-trans-b) in PTX code.
   if ((typeA != WGMMATypes::f16 && typeA != WGMMATypes::bf16) &&
       (getLayoutA() == mlir::NVVM::MMALayout::col ||
        getLayoutB() == mlir::NVVM::MMALayout::row)) {

>From 319ec0edf21b483379b3490f8b2129b8555c6719 Mon Sep 17 00:00:00 2001
From: bangyu shen <94283495+shubaoyu2 at users.noreply.github.com>
Date: Wed, 3 Jul 2024 22:33:28 +0800
Subject: [PATCH 7/7] format code

---
 mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index ceeb1168eb13e..4d1896551101e 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -879,7 +879,7 @@ LogicalResult NVVM::WgmmaMmaAsyncOp::verify() {
 
   // Check transpose (only available for f16/bf16)
   // Matrix A should be stored in row-major and B in column-major.
-  // Only f16/bf16 matrices can be stored in either column-major or row-major 
+  // Only f16/bf16 matrices can be stored in either column-major or row-major
   // by setting the transpose value(imm-trans-a,imm-trans-b) in PTX code.
   if ((typeA != WGMMATypes::f16 && typeA != WGMMATypes::bf16) &&
       (getLayoutA() == mlir::NVVM::MMALayout::col ||