[Mlir-commits] [mlir] [MLIR][NVVM] Enable nvvm intrinsics import (PR #68843)

Mon Dec 11 00:07:19 PST 2023

https://github.com/ivanradanov updated https://github.com/llvm/llvm-project/pull/68843

>From e8d7b4ca62ae7de2155d6857316087a77e7b767c Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <ivanov.i.aa at m.titech.ac.jp>
Date: Thu, 12 Oct 2023 10:18:30 +0900
Subject: [PATCH 01/11] Enable LLVMIR module import with nvvm intrinsics

---
 mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt             | 2 ++
 .../LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp       | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
index 64de028c7fe40..8e41fcc05a161 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
@@ -56,6 +56,8 @@ add_mlir_dialect(NVVMOps nvvm)
 add_mlir_doc(NVVMOps NVVMDialect Dialects/ -gen-dialect-doc -dialect=nvvm)
 set(LLVM_TARGET_DEFINITIONS NVVMOps.td)
 mlir_tablegen(NVVMConversions.inc -gen-llvmir-conversions)
+mlir_tablegen(NVVMFromLLVMIRConversions.inc -gen-intr-from-llvmir-conversions)
+mlir_tablegen(NVVMConvertibleLLVMIRIntrinsics.inc -gen-convertible-llvmir-intrinsics)
 mlir_tablegen(NVVMOpsEnums.h.inc -gen-enum-decls)
 mlir_tablegen(NVVMOpsEnums.cpp.inc -gen-enum-defs)
 mlir_tablegen(NVVMOpsAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=nvvm)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
index 40d8253d822f6..d20e95754fbf5 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
@@ -13,6 +13,7 @@
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMInterfaces.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Target/LLVMIR/ModuleImport.h"
 
@@ -24,6 +25,7 @@
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/Support/ModRef.h"
 
 using namespace mlir;
@@ -37,6 +39,7 @@ using namespace mlir::LLVM::detail;
 static bool isConvertibleIntrinsic(llvm::Intrinsic::ID id) {
   static const DenseSet<unsigned> convertibleIntrinsics = {
 #include "mlir/Dialect/LLVMIR/LLVMConvertibleLLVMIRIntrinsics.inc"
+#include "mlir/Dialect/LLVMIR/NVVMConvertibleLLVMIRIntrinsics.inc"
   };
   return convertibleIntrinsics.contains(id);
 }
@@ -46,6 +49,7 @@ static bool isConvertibleIntrinsic(llvm::Intrinsic::ID id) {
 static ArrayRef<unsigned> getSupportedIntrinsicsImpl() {
   static const SmallVector<unsigned> convertibleIntrinsics = {
 #include "mlir/Dialect/LLVMIR/LLVMConvertibleLLVMIRIntrinsics.inc"
+#include "mlir/Dialect/LLVMIR/NVVMConvertibleLLVMIRIntrinsics.inc"
   };
   return convertibleIntrinsics;
 }
@@ -63,6 +67,7 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder,
     SmallVector<llvm::Value *> args(inst->args());
     ArrayRef<llvm::Value *> llvmOperands(args);
 #include "mlir/Dialect/LLVMIR/LLVMIntrinsicFromLLVMIRConversions.inc"
+#include "mlir/Dialect/LLVMIR/NVVMFromLLVMIRConversions.inc"
   }
 
   return failure();
@@ -281,6 +286,7 @@ class LLVMDialectLLVMIRImportInterface : public LLVMImportDialectInterface {
 
 void mlir::registerLLVMDialectImport(DialectRegistry &registry) {
   registry.insert<LLVM::LLVMDialect>();
+  registry.insert<NVVM::NVVMDialect>();
   registry.addExtension(+[](MLIRContext *ctx, LLVM::LLVMDialect *dialect) {
     dialect->addInterfaces<LLVMDialectLLVMIRImportInterface>();
   });

>From 44509b9b8449c522375695b983be1cbcabd06682 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <ivanov.i.aa at m.titech.ac.jp>
Date: Thu, 12 Oct 2023 10:26:46 +0900
Subject: [PATCH 02/11] Add test

---
 mlir/test/Target/LLVMIR/Import/intrinsic.ll | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
index c8dcde11d93e6..3f7705ea78979 100644
--- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll
+++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
@@ -878,6 +878,14 @@ define float @ssa_copy(float %0) {
   ret float %2
 }
 
+; CHECK-LABEL: llvm.func @nvvm
+define void @nvvm() {
+  ; CHECK: %{{.*}} = nvvm.read.ptx.sreg.ntid.x : i32
+  %1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  ret void
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 declare float @llvm.fmuladd.f32(float, float, float)
 declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>)
 declare float @llvm.fma.f32(float, float, float)

>From 3b8077c3a0e5eb943d6673e41b2837f7402615a8 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <ivanov.i.aa at m.titech.ac.jp>
Date: Tue, 7 Nov 2023 14:43:47 +0900
Subject: [PATCH 03/11] Add NVVM Translation Interface

---
 mlir/include/mlir/Target/LLVMIR/Dialect/All.h |  2 +
 .../Dialect/NVVM/LLVMIRToNVVMTranslation.h    | 31 +++++++
 mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp  |  1 +
 .../LLVMIR/LLVMIRToLLVMTranslation.cpp        |  6 --
 .../Target/LLVMIR/Dialect/NVVM/CMakeLists.txt | 18 ++++
 .../Dialect/NVVM/LLVMIRToNVVMTranslation.cpp  | 93 +++++++++++++++++++
 6 files changed, 145 insertions(+), 6 deletions(-)
 create mode 100644 mlir/include/mlir/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.h
 create mode 100644 mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp

diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
index 5dfc15afb7593..0b37e23e45118 100644
--- a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
@@ -22,6 +22,7 @@
 #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h"
@@ -74,6 +75,7 @@ registerAllGPUToLLVMIRTranslations(DialectRegistry &registry) {
 static inline void
 registerAllFromLLVMIRTranslations(DialectRegistry &registry) {
   registerLLVMDialectImport(registry);
+  registerNVVMDialectImport(registry);
 }
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.h
new file mode 100644
index 0000000000000..ed17556fb77f3
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.h
@@ -0,0 +1,31 @@
+//===- LLVMIRToNVVMTranslation.h - LLVM IR to NVVM Dialect ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides registration calls for LLVM IR to NVVM dialect translation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_LLVMIR_LLVMIRTONVVMTRANSLATION_H
+#define MLIR_TARGET_LLVMIR_DIALECT_LLVMIR_LLVMIRTONVVMTRANSLATION_H
+
+namespace mlir {
+
+class DialectRegistry;
+class MLIRContext;
+
+/// Registers the NVVM dialect and its import from LLVM IR in the given
+/// registry.
+void registerNVVMDialectImport(DialectRegistry &registry);
+
+/// Registers the NVVM dialect and its import from LLVM IR with the given
+/// context.
+void registerNVVMDialectImport(MLIRContext &context);
+
+} // namespace mlir
+
+#endif
diff --git a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp
index c521d76a42995..dbd6985a829a0 100644
--- a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/DLTI/DLTI.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/Target/LLVMIR/Dialect/All.h"
 #include "mlir/Target/LLVMIR/Import.h"
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
index d20e95754fbf5..40d8253d822f6 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
@@ -13,7 +13,6 @@
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMInterfaces.h"
-#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Target/LLVMIR/ModuleImport.h"
 
@@ -25,7 +24,6 @@
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/Support/ModRef.h"
 
 using namespace mlir;
@@ -39,7 +37,6 @@ using namespace mlir::LLVM::detail;
 static bool isConvertibleIntrinsic(llvm::Intrinsic::ID id) {
   static const DenseSet<unsigned> convertibleIntrinsics = {
 #include "mlir/Dialect/LLVMIR/LLVMConvertibleLLVMIRIntrinsics.inc"
-#include "mlir/Dialect/LLVMIR/NVVMConvertibleLLVMIRIntrinsics.inc"
   };
   return convertibleIntrinsics.contains(id);
 }
@@ -49,7 +46,6 @@ static bool isConvertibleIntrinsic(llvm::Intrinsic::ID id) {
 static ArrayRef<unsigned> getSupportedIntrinsicsImpl() {
   static const SmallVector<unsigned> convertibleIntrinsics = {
 #include "mlir/Dialect/LLVMIR/LLVMConvertibleLLVMIRIntrinsics.inc"
-#include "mlir/Dialect/LLVMIR/NVVMConvertibleLLVMIRIntrinsics.inc"
   };
   return convertibleIntrinsics;
 }
@@ -67,7 +63,6 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder,
     SmallVector<llvm::Value *> args(inst->args());
     ArrayRef<llvm::Value *> llvmOperands(args);
 #include "mlir/Dialect/LLVMIR/LLVMIntrinsicFromLLVMIRConversions.inc"
-#include "mlir/Dialect/LLVMIR/NVVMFromLLVMIRConversions.inc"
   }
 
   return failure();
@@ -286,7 +281,6 @@ class LLVMDialectLLVMIRImportInterface : public LLVMImportDialectInterface {
 
 void mlir::registerLLVMDialectImport(DialectRegistry &registry) {
   registry.insert<LLVM::LLVMDialect>();
-  registry.insert<NVVM::NVVMDialect>();
   registry.addExtension(+[](MLIRContext *ctx, LLVM::LLVMDialect *dialect) {
     dialect->addInterfaces<LLVMDialectLLVMIRImportInterface>();
   });
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/NVVM/CMakeLists.txt
index 9f3935b0c3f47..a90de15798161 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/CMakeLists.txt
@@ -1,3 +1,21 @@
+set(LLVM_OPTIONAL_SOURCES
+  LLVMIRToNVVMTranslation.cpp
+  NVVMToLLVMIRTranslation.cpp
+  )
+
+add_mlir_translation_library(MLIRLLVMIRToNVVMTranslation
+  LLVMIRToNVVMTranslation.cpp
+
+  LINK_COMPONENTS
+  Core
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRNVVMDialect
+  MLIRSupport
+  MLIRTargetLLVMIRImport
+  )
+
 add_mlir_translation_library(MLIRNVVMToLLVMIRTranslation
   NVVMToLLVMIRTranslation.cpp
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
new file mode 100644
index 0000000000000..855abc12a909e
--- /dev/null
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp
@@ -0,0 +1,93 @@
+//===- LLVMIRToNVVMTranslation.cpp - Translate LLVM IR to NVVM dialect ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a translation between LLVM IR and the MLIR NVVM dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Target/LLVMIR/ModuleImport.h"
+
+#include "llvm/IR/IntrinsicsNVPTX.h"
+
+using namespace mlir;
+using namespace mlir::NVVM;
+
+/// Returns true if the LLVM IR intrinsic is convertible to an MLIR NVVM dialect
+/// intrinsic. Returns false otherwise.
+static bool isConvertibleIntrinsic(llvm::Intrinsic::ID id) {
+  static const DenseSet<unsigned> convertibleIntrinsics = {
+#include "mlir/Dialect/LLVMIR/NVVMConvertibleLLVMIRIntrinsics.inc"
+  };
+  return convertibleIntrinsics.contains(id);
+}
+
+/// Returns the list of LLVM IR intrinsic identifiers that are convertible to
+/// MLIR NVVM dialect intrinsics.
+static ArrayRef<unsigned> getSupportedIntrinsicsImpl() {
+  static const SmallVector<unsigned> convertibleIntrinsics = {
+#include "mlir/Dialect/LLVMIR/NVVMConvertibleLLVMIRIntrinsics.inc"
+  };
+  return convertibleIntrinsics;
+}
+
+/// Converts the LLVM intrinsic to an MLIR NVVM dialect operation if a
+/// conversion exits. Returns failure otherwise.
+static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder,
+                                          llvm::CallInst *inst,
+                                          LLVM::ModuleImport &moduleImport) {
+  llvm::Intrinsic::ID intrinsicID = inst->getIntrinsicID();
+
+  // Check if the intrinsic is convertible to an MLIR dialect counterpart and
+  // copy the arguments to an an LLVM operands array reference for conversion.
+  if (isConvertibleIntrinsic(intrinsicID)) {
+    SmallVector<llvm::Value *> args(inst->args());
+    ArrayRef<llvm::Value *> llvmOperands(args);
+#include "mlir/Dialect/LLVMIR/NVVMFromLLVMIRConversions.inc"
+  }
+
+  return failure();
+}
+
+namespace {
+
+/// Implementation of the dialect interface that converts operations belonging
+/// to the NVVM dialect.
+class NVVMDialectLLVMIRImportInterface : public LLVMImportDialectInterface {
+public:
+  using LLVMImportDialectInterface::LLVMImportDialectInterface;
+
+  /// Converts the LLVM intrinsic to an MLIR NVVM dialect operation if a
+  /// conversion exits. Returns failure otherwise.
+  LogicalResult convertIntrinsic(OpBuilder &builder, llvm::CallInst *inst,
+                                 LLVM::ModuleImport &moduleImport) const final {
+    return convertIntrinsicImpl(builder, inst, moduleImport);
+  }
+
+  /// Returns the list of LLVM IR intrinsic identifiers that are convertible to
+  /// MLIR NVVM dialect intrinsics.
+  ArrayRef<unsigned> getSupportedIntrinsics() const final {
+    return getSupportedIntrinsicsImpl();
+  }
+};
+
+} // namespace
+
+void mlir::registerNVVMDialectImport(DialectRegistry &registry) {
+  registry.insert<NVVM::NVVMDialect>();
+  registry.addExtension(+[](MLIRContext *ctx, NVVM::NVVMDialect *dialect) {
+    dialect->addInterfaces<NVVMDialectLLVMIRImportInterface>();
+  });
+}
+
+void mlir::registerNVVMDialectImport(MLIRContext &context) {
+  DialectRegistry registry;
+  registerNVVMDialectImport(registry);
+  context.appendDialectRegistry(registry);
+}

>From de7cf253740416356fa650c46a4de81c8ee780ce Mon Sep 17 00:00:00 2001
From: "Ivan R. Ivanov" <ivanov.i.aa at m.titech.ac.jp>
Date: Tue, 7 Nov 2023 18:38:36 -0800
Subject: [PATCH 04/11] Apply suggestions from code review

Co-authored-by: Tobias Gysi <tobias.gysi at nextsilicon.com>
---
 .../Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.h    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.h
index ed17556fb77f3..02ee83284dd33 100644
--- a/mlir/include/mlir/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.h
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.h
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_TARGET_LLVMIR_DIALECT_LLVMIR_LLVMIRTONVVMTRANSLATION_H
-#define MLIR_TARGET_LLVMIR_DIALECT_LLVMIR_LLVMIRTONVVMTRANSLATION_H
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_NVVM_LLVMIRTONVVMTRANSLATION_H
+#define MLIR_TARGET_LLVMIR_DIALECT_NVVM_LLVMIRTONVVMTRANSLATION_H
 
 namespace mlir {
 
@@ -28,4 +28,4 @@ void registerNVVMDialectImport(MLIRContext &context);
 
 } // namespace mlir
 
-#endif
+#endif // MLIR_TARGET_LLVMIR_DIALECT_NVVM_LLVMIRTONVVMTRANSLATION_H

>From 38472807c43ce3ab955666fd38763581af22f1c4 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <ivanov.i.aa at m.titech.ac.jp>
Date: Tue, 21 Nov 2023 06:17:06 +0900
Subject: [PATCH 05/11] Add test

---
 mlir/test/Target/LLVMIR/Import/nvvmir.ll | 397 +++++++++++++++++++++++
 1 file changed, 397 insertions(+)
 create mode 100644 mlir/test/Target/LLVMIR/Import/nvvmir.ll

diff --git a/mlir/test/Target/LLVMIR/Import/nvvmir.ll b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
new file mode 100644
index 0000000000000..cfe4d3862a898
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
@@ -0,0 +1,397 @@
+; RUN: mlir-translate -import-llvm %s | FileCheck %s
+
+define i32 @nvvm_special_regs() {
+  %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+  %4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  %5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+  %6 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+  %7 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  %8 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+  %9 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+  %10 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+  %11 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+  %12 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+  %13 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  %14 = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
+  %15 = call i32 @llvm.nvvm.read.ptx.sreg.clusterid.x()
+  %16 = call i32 @llvm.nvvm.read.ptx.sreg.clusterid.y()
+  %17 = call i32 @llvm.nvvm.read.ptx.sreg.clusterid.z()
+  %18 = call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x()
+  %19 = call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y()
+  %20 = call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z()
+  %21 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+  %22 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+  %23 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+  %24 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+  %25 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+  %26 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+  %27 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank()
+  %28 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank()
+  ret i32 %1
+}
+
+define float @nvvm_rcp(float %0) {
+  %2 = call float @llvm.nvvm.rcp.approx.ftz.f(float %0)
+  ret float %2
+}
+
+define void @llvm_nvvm_barrier0() {
+  call void @llvm.nvvm.barrier0()
+  ret void
+}
+
+define i32 @nvvm_shfl(i32 %0, i32 %1, i32 %2, i32 %3, float %4) {
+  %6 = call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 %0, i32 %3, i32 %1, i32 %2)
+  %7 = call float @llvm.nvvm.shfl.sync.bfly.f32(i32 %0, float %4, i32 %1, i32 %2)
+  %8 = call i32 @llvm.nvvm.shfl.sync.up.i32(i32 %0, i32 %3, i32 %1, i32 %2)
+  %9 = call float @llvm.nvvm.shfl.sync.up.f32(i32 %0, float %4, i32 %1, i32 %2)
+  %10 = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %0, i32 %3, i32 %1, i32 %2)
+  %11 = call float @llvm.nvvm.shfl.sync.down.f32(i32 %0, float %4, i32 %1, i32 %2)
+  %12 = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %0, i32 %3, i32 %1, i32 %2)
+  %13 = call float @llvm.nvvm.shfl.sync.idx.f32(i32 %0, float %4, i32 %1, i32 %2)
+  ret i32 %6
+}
+
+define { i32, i1 } @nvvm_shfl_pred(i32 %0, i32 %1, i32 %2, i32 %3, float %4) {
+  %6 = call { i32, i1 } @llvm.nvvm.shfl.sync.bfly.i32p(i32 %0, i32 %3, i32 %1, i32 %2)
+  %7 = call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 %0, float %4, i32 %1, i32 %2)
+  %8 = call { i32, i1 } @llvm.nvvm.shfl.sync.up.i32p(i32 %0, i32 %3, i32 %1, i32 %2)
+  %9 = call { float, i1 } @llvm.nvvm.shfl.sync.up.f32p(i32 %0, float %4, i32 %1, i32 %2)
+  %10 = call { i32, i1 } @llvm.nvvm.shfl.sync.down.i32p(i32 %0, i32 %3, i32 %1, i32 %2)
+  %11 = call { float, i1 } @llvm.nvvm.shfl.sync.down.f32p(i32 %0, float %4, i32 %1, i32 %2)
+  %12 = call { i32, i1 } @llvm.nvvm.shfl.sync.idx.i32p(i32 %0, i32 %3, i32 %1, i32 %2)
+  %13 = call { float, i1 } @llvm.nvvm.shfl.sync.idx.f32p(i32 %0, float %4, i32 %1, i32 %2)
+  ret { i32, i1 } %6
+}
+
+define i32 @nvvm_vote(i32 %0, i1 %1) {
+  %3 = call i32 @llvm.nvvm.vote.ballot.sync(i32 %0, i1 %1)
+  ret i32 %3
+}
+
+define { float, float, float, float, float, float, float, float } @nvvm_mma_mn8n8k4_row_col_f32_f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11) {
+  %13 = call { float, float, float, float, float, float, float, float } @llvm.nvvm.mma.m8n8k4.row.col.f32.f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11)
+  ret { float, float, float, float, float, float, float, float } %13
+}
+
+define { <2 x half>, <2 x half> } @nvvm_mma_m16n8k16_f16_f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7) {
+  %9 = call { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7)
+  ret { <2 x half>, <2 x half> } %9
+}
+
+define { float, float, float, float } @nvvm_mma_m16n8k16_f32_f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7) {
+  %9 = call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7)
+  ret { float, float, float, float } %9
+}
+
+define { <2 x half>, <2 x half> } @nvvm_mma_m16n8k16_f16_f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, float %6, float %7, float %8, float %9) {
+  %11 = call { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, float %6, float %7, float %8, float %9)
+  ret { <2 x half>, <2 x half> } %11
+}
+
+define { float, float, float, float } @nvvm_mma_m16n8k16_f32_f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, float %6, float %7, float %8, float %9) {
+  %11 = call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, float %6, float %7, float %8, float %9)
+  ret { float, float, float, float } %11
+}
+
+define { i32, i32, i32, i32 } @nvvm_mma_m16n8k16_s8_s8(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
+  %8 = call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.s8(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6)
+  ret { i32, i32, i32, i32 } %8
+}
+
+define { i32, i32, i32, i32 } @nvvm_mma_m16n8k16_s8_u8(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
+  %8 = call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.satfinite.s8.u8(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6)
+  ret { i32, i32, i32, i32 } %8
+}
+
+define { i32, i32, i32, i32 } @nvvm_mma_m16n8k128_b1_b1(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
+  %8 = call { i32, i32, i32, i32 } @llvm.nvvm.mma.xor.popc.m16n8k128.row.col.b1(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6)
+  ret { i32, i32, i32, i32 } %8
+}
+
+define { i32, i32, i32, i32 } @nvvm_mma_m16n8k32_s4_s4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
+  %8 = call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k32.row.col.satfinite.s4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6)
+  ret { i32, i32, i32, i32 } %8
+}
+
+define { double, double } @nvvm_mma_m8n8k4_f64_f64(double %0, double %1, double %2, double %3) {
+  %5 = call { double, double } @llvm.nvvm.mma.m8n8k4.row.col.f64(double %0, double %1, double %2, double %3)
+  ret { double, double } %5
+}
+
+define { float, float, float, float } @nvvm_mma_m16n8k4_tf32_f32(i32 %0, i32 %1, i32 %2, float %3, float %4, float %5, float %6) {
+  %8 = call { float, float, float, float } @llvm.nvvm.mma.m16n8k4.row.col.tf32(i32 %0, i32 %1, i32 %2, float %3, float %4, float %5, float %6)
+  ret { float, float, float, float } %8
+}
+
+define void @gpu_wmma_load_op(ptr addrspace(3) %0, i32 %1) {
+  %3 = call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p3(ptr addrspace(3) %0, i32 %1)
+  ret void
+}
+
+define void @gpu_wmma_store_op(ptr addrspace(3) %0, i32 %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5) {
+  call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p3(ptr addrspace(3) %0, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, i32 %1)
+  ret void
+}
+
+define void @gpu_wmma_mma_op(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7, <2 x half> %8, <2 x half> %9, <2 x half> %10, <2 x half> %11, <2 x half> %12, <2 x half> %13, <2 x half> %14, <2 x half> %15, <2 x half> %16, <2 x half> %17, <2 x half> %18, <2 x half> %19) {
+  %21 = call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7, <2 x half> %8, <2 x half> %9, <2 x half> %10, <2 x half> %11, <2 x half> %12, <2 x half> %13, <2 x half> %14, <2 x half> %15, <2 x half> %16, <2 x half> %17, <2 x half> %18, <2 x half> %19)
+  ret void
+}
+
+define void @nvvm_wmma_load_tf32(ptr %0, i32 %1) {
+  %3 = call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0(ptr %0, i32 %1)
+  ret void
+}
+
+define void @nvvm_wmma_mma(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, float %8, float %9, float %10, float %11, float %12, float %13, float %14, float %15) {
+  %17 = call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, float %8, float %9, float %10, float %11, float %12, float %13, float %14, float %15)
+  ret void
+}
+
+define void @cp_async(ptr addrspace(3) %0, ptr addrspace(1) %1) {
+  call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %0, ptr addrspace(1) %1)
+  call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %0, ptr addrspace(1) %1)
+  call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %0, ptr addrspace(1) %1)
+  call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %0, ptr addrspace(1) %1)
+  call void @llvm.nvvm.cp.async.commit.group()
+  call void @llvm.nvvm.cp.async.wait.group(i32 0)
+  ret void
+}
+
+define void @ld_matrix(ptr addrspace(3) %0) {
+  %2 = call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.b16.p3(ptr addrspace(3) %0)
+  %3 = call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.b16.p3(ptr addrspace(3) %0)
+  %4 = call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %0)
+  %5 = call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.trans.b16.p3(ptr addrspace(3) %0)
+  %6 = call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.trans.b16.p3(ptr addrspace(3) %0)
+  %7 = call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) %0)
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.y() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.z() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.z() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.warpsize() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.laneid() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.clusterid.x() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.clusterid.y() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.clusterid.z() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank() #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank() #0
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rcp.approx.ftz.f(float) #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare float @llvm.nvvm.shfl.sync.bfly.f32(i32, float, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.up.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare float @llvm.nvvm.shfl.sync.up.f32(i32, float, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare float @llvm.nvvm.shfl.sync.down.f32(i32, float, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare float @llvm.nvvm.shfl.sync.idx.f32(i32, float, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare { i32, i1 } @llvm.nvvm.shfl.sync.bfly.i32p(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32, float, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare { i32, i1 } @llvm.nvvm.shfl.sync.up.i32p(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare { float, i1 } @llvm.nvvm.shfl.sync.up.f32p(i32, float, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare { i32, i1 } @llvm.nvvm.shfl.sync.down.i32p(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare { float, i1 } @llvm.nvvm.shfl.sync.down.f32p(i32, float, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare { i32, i1 } @llvm.nvvm.shfl.sync.idx.i32p(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare { float, i1 } @llvm.nvvm.shfl.sync.idx.f32p(i32, float, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.vote.ballot.sync(i32, i1) #3
+
+; Function Attrs: nocallback nounwind memory(none)
+declare { float, float, float, float, float, float, float, float } @llvm.nvvm.mma.m8n8k4.row.col.f32.f32(<2 x half>, <2 x half>, <2 x half>, <2 x half>, float, float, float, float, float, float, float, float) #4
+
+; Function Attrs: nocallback nounwind memory(none)
+declare { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f16(<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>) #4
+
+; Function Attrs: nocallback nounwind memory(none)
+declare { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f16(<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>) #4
+
+; Function Attrs: nocallback nounwind memory(none)
+declare { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f32(<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, float, float, float, float) #4
+
+; Function Attrs: nocallback nounwind memory(none)
+declare { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f32(<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, float, float, float, float) #4
+
+; Function Attrs: nocallback nounwind memory(none)
+declare { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.s8(i32, i32, i32, i32, i32, i32, i32) #4
+
+; Function Attrs: nocallback nounwind memory(none)
+declare { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.satfinite.s8.u8(i32, i32, i32, i32, i32, i32, i32) #4
+
+; Function Attrs: nocallback nounwind memory(none)
+declare { i32, i32, i32, i32 } @llvm.nvvm.mma.xor.popc.m16n8k128.row.col.b1(i32, i32, i32, i32, i32, i32, i32) #4
+
+; Function Attrs: nocallback nounwind memory(none)
+declare { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k32.row.col.satfinite.s4(i32, i32, i32, i32, i32, i32, i32) #4
+
+; Function Attrs: nocallback nounwind memory(none)
+declare { double, double } @llvm.nvvm.mma.m8n8k4.row.col.f64(double, double, double, double) #4
+
+; Function Attrs: nocallback nounwind memory(none)
+declare { float, float, float, float } @llvm.nvvm.mma.m16n8k4.row.col.tf32(i32, i32, i32, float, float, float, float) #4
+
+; Function Attrs: nocallback nounwind memory(argmem: read)
+declare { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p3(ptr addrspace(3) nocapture readonly, i32) #5
+
+; Function Attrs: nocallback nounwind memory(argmem: write)
+declare void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p3(ptr addrspace(3) nocapture writeonly, <2 x half>, <2 x half>, <2 x half>, <2 x half>, i32) #6
+
+; Function Attrs: nocallback nounwind memory(none)
+declare { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16(<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>) #4
+
+; Function Attrs: nocallback nounwind memory(argmem: read)
+declare { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0(ptr nocapture readonly, i32) #5
+
+; Function Attrs: nocallback nounwind memory(none)
+declare { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float) #4
+
+; Function Attrs: nocallback nounwind memory(argmem: readwrite)
+declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) noalias writeonly, ptr addrspace(1) noalias readonly) #7
+
+; Function Attrs: nocallback nounwind memory(argmem: readwrite)
+declare void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) noalias writeonly, ptr addrspace(1) noalias readonly) #7
+
+; Function Attrs: nocallback nounwind memory(argmem: readwrite)
+declare void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) noalias writeonly, ptr addrspace(1) noalias readonly) #7
+
+; Function Attrs: nocallback nounwind memory(argmem: readwrite)
+declare void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) noalias writeonly, ptr addrspace(1) noalias readonly) #7
+
+; Function Attrs: nounwind
+declare void @llvm.nvvm.cp.async.commit.group() #8
+
+; Function Attrs: nounwind
+declare void @llvm.nvvm.cp.async.wait.group(i32 immarg) #8
+
+; Function Attrs: nocallback nounwind memory(argmem: read)
+declare i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.b16.p3(ptr addrspace(3) nocapture readonly) #5
+
+; Function Attrs: nocallback nounwind memory(argmem: read)
+declare { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.b16.p3(ptr addrspace(3) nocapture readonly) #5
+
+; Function Attrs: nocallback nounwind memory(argmem: read)
+declare { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) nocapture readonly) #5
+
+; Function Attrs: nocallback nounwind memory(argmem: read)
+declare i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.trans.b16.p3(ptr addrspace(3) nocapture readonly) #5
+
+; Function Attrs: nocallback nounwind memory(argmem: read)
+declare { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.trans.b16.p3(ptr addrspace(3) nocapture readonly) #5
+
+; Function Attrs: nocallback nounwind memory(argmem: read)
+declare { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) nocapture readonly) #5

>From 6e91a93789d1cfd901f8d49b34fde6d770bc2761 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <ivanov.i.aa at m.titech.ac.jp>
Date: Tue, 21 Nov 2023 06:40:41 +0900
Subject: [PATCH 06/11] TMP

---
 mlir/test/Target/LLVMIR/Import/nvvmir.ll.bak | 130 +++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 mlir/test/Target/LLVMIR/Import/nvvmir.ll.bak

diff --git a/mlir/test/Target/LLVMIR/Import/nvvmir.ll.bak b/mlir/test/Target/LLVMIR/Import/nvvmir.ll.bak
new file mode 100644
index 0000000000000..34bb6793ee29b
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/Import/nvvmir.ll.bak
@@ -0,0 +1,130 @@
+// CHECK-LABEL: @nvvm_special_regs
+  %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+  call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+  call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+  call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+  call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+  call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+  call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+  call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+  call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  call i32 @llvm.nvvm.read.ptx.sreg.laneid()
+  call i32 @llvm.nvvm.read.ptx.sreg.clusterid.x
+  call i32 @llvm.nvvm.read.ptx.sreg.clusterid.y
+  call i32 @llvm.nvvm.read.ptx.sreg.clusterid.z
+  call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x
+  call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y
+  call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z
+  call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid
+  call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid
+  call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid
+  call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid
+  call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid
+  call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid
+  call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank
+  call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank
+// CHECK-LABEL: @nvvm_rcp
+  call float @llvm.nvvm.rcp.approx.ftz.f
+// CHECK-LABEL: @llvm_nvvm_barrier0
+  call void @llvm.nvvm.barrier0()
+// CHECK-LABEL: @nvvm_shfl
+  call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call float @llvm.nvvm.shfl.sync.bfly.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call i32 @llvm.nvvm.shfl.sync.up.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call float @llvm.nvvm.shfl.sync.up.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call float @llvm.nvvm.shfl.sync.down.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call float @llvm.nvvm.shfl.sync.idx.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+// CHECK-LABEL: @nvvm_shfl_pred
+  call { i32, i1 } @llvm.nvvm.shfl.sync.bfly.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call { i32, i1 } @llvm.nvvm.shfl.sync.up.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call { float, i1 } @llvm.nvvm.shfl.sync.up.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call { i32, i1 } @llvm.nvvm.shfl.sync.down.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call { float, i1 } @llvm.nvvm.shfl.sync.down.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call { i32, i1 } @llvm.nvvm.shfl.sync.idx.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  call { float, i1 } @llvm.nvvm.shfl.sync.idx.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+// CHECK-LABEL: @nvvm_vote
+  call i32 @llvm.nvvm.vote.ballot.sync(i32 %{{.*}}, i1 %{{.*}})
+// CHECK-LABEL: @nvvm_mma_mn8n8k4_row_col_f32_f32
+  call { float, float, float, float, float, float, float, float } @llvm.nvvm.mma.m8n8k4.row.col.f32.f32
+// CHECK-LABEL: @nvvm_mma_m16n8k16_f16_f16
+  call { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f16
+// CHECK-LABEL: @nvvm_mma_m16n8k16_f32_f16
+  call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f16
+// CHECK-LABEL: @nvvm_mma_m16n8k16_f16_f32
+  call { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f32
+// CHECK-LABEL: @nvvm_mma_m16n8k16_f32_f32
+  call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f32
+// CHECK-LABEL: @nvvm_mma_m16n8k16_s8_s8
+  call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.s8
+// CHECK-LABEL: @nvvm_mma_m16n8k16_s8_u8
+  call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.satfinite.s8.u8
+// CHECK-LABEL: @nvvm_mma_m16n8k128_b1_b1
+  call { i32, i32, i32, i32 } @llvm.nvvm.mma.xor.popc.m16n8k128.row.col.b1
+// CHECK-LABEL: @nvvm_mma_m16n8k32_s4_s4
+  call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k32.row.col.satfinite.s4
+// CHECK-LABEL: @nvvm_mma_m8n8k4_f64_f64
+  call { double, double } @llvm.nvvm.mma.m8n8k4.row.col.f64
+// CHECK-LABEL: @nvvm_mma_m16n8k4_tf32_f32
+  call { float, float, float, float } @llvm.nvvm.mma.m16n8k4.row.col.tf32
+// CHECK-LABEL: @gpu_wmma_load_op
+  call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p3(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
+// CHECK-LABEL: @gpu_wmma_store_op
+  call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p3(ptr addrspace(3) %{{.*}}, <2 x half> {{.*}}, <2 x half> %{{.*}}, <2 x half> %{{.*}}, <2 x half> %{{.*}}, i32 %{{.*}})
+// CHECK-LABEL: @gpu_wmma_mma_op
+  call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
+// CHECK-LABEL: @nvvm_wmma_load_tf32
+  call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0(ptr %{{.*}}, i32 %{{.*}})
+// CHECK-LABEL: @nvvm_wmma_mma
+  { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})
+// CHECK-LABEL: @cp_async
+call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
+call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
+call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
+call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
+call void @llvm.nvvm.cp.async.commit.group()
+call void @llvm.nvvm.cp.async.wait.group(i32 0)
+// CHECK-LABEL: @ld_matrix
+  call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.b16.p3(ptr addrspace(3) %{{.*}})
+  call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.b16.p3(ptr addrspace(3) %{{.*}})
+  call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %{{.*}})
+  call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.trans.b16.p3(ptr addrspace(3) %{{.*}})
+  call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.trans.b16.p3(ptr addrspace(3) %{{.*}})
+  call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) %{{.*}})
+!nvvm.annotations =
+// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
+{ptr @kernel_func, !"kernel", i32 1}
+!nvvm.annotations =
+// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
+{ptr @kernel_func, !"kernel", i32 1}
+{ptr @kernel_func, !"maxntidx", i32 1}
+{ptr @kernel_func, !"maxntidy", i32 23}
+{ptr @kernel_func, !"maxntidz", i32 32}
+!nvvm.annotations =
+// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
+{ptr @kernel_func, !"kernel", i32 1}
+{ptr @kernel_func, !"reqntidx", i32 1}
+{ptr @kernel_func, !"reqntidy", i32 23}
+{ptr @kernel_func, !"reqntidz", i32 32}
+!nvvm.annotations =
+// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
+{ptr @kernel_func, !"kernel", i32 1}
+{ptr @kernel_func, !"minctasm", i32 16}
+!nvvm.annotations =
+// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
+{ptr @kernel_func, !"kernel", i32 1}
+{ptr @kernel_func, !"maxnreg", i32 16}
+!nvvm.annotations =
+// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
+{ptr @kernel_func, !"kernel", i32 1}
+{ptr @kernel_func, !"maxnreg", i32 32}
+{ptr @kernel_func, !"maxntidx", i32 1}
+{ptr @kernel_func, !"maxntidy", i32 23}
+{ptr @kernel_func, !"maxntidz", i32 32}
+{ptr @kernel_func, !"minctasm", i32 16}

>From 4803bdf09533fcc5dab0ae049db01ae8e50393a3 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <ivanov.i.aa at m.titech.ac.jp>
Date: Mon, 11 Dec 2023 14:25:33 +0900
Subject: [PATCH 07/11] remove stray file

---
 mlir/test/Target/LLVMIR/Import/nvvmir.ll.bak | 130 -------------------
 1 file changed, 130 deletions(-)
 delete mode 100644 mlir/test/Target/LLVMIR/Import/nvvmir.ll.bak

diff --git a/mlir/test/Target/LLVMIR/Import/nvvmir.ll.bak b/mlir/test/Target/LLVMIR/Import/nvvmir.ll.bak
deleted file mode 100644
index 34bb6793ee29b..0000000000000
--- a/mlir/test/Target/LLVMIR/Import/nvvmir.ll.bak
+++ /dev/null
@@ -1,130 +0,0 @@
-// CHECK-LABEL: @nvvm_special_regs
-  %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-  call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
-  call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-  call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-  call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-  call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-  call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
-  call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-  call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
-  call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
-  call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  call i32 @llvm.nvvm.read.ptx.sreg.laneid()
-  call i32 @llvm.nvvm.read.ptx.sreg.clusterid.x
-  call i32 @llvm.nvvm.read.ptx.sreg.clusterid.y
-  call i32 @llvm.nvvm.read.ptx.sreg.clusterid.z
-  call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x
-  call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y
-  call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z
-  call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid
-  call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid
-  call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid
-  call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid
-  call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid
-  call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid
-  call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank
-  call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank
-// CHECK-LABEL: @nvvm_rcp
-  call float @llvm.nvvm.rcp.approx.ftz.f
-// CHECK-LABEL: @llvm_nvvm_barrier0
-  call void @llvm.nvvm.barrier0()
-// CHECK-LABEL: @nvvm_shfl
-  call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call float @llvm.nvvm.shfl.sync.bfly.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call i32 @llvm.nvvm.shfl.sync.up.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call float @llvm.nvvm.shfl.sync.up.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call float @llvm.nvvm.shfl.sync.down.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call float @llvm.nvvm.shfl.sync.idx.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-// CHECK-LABEL: @nvvm_shfl_pred
-  call { i32, i1 } @llvm.nvvm.shfl.sync.bfly.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call { i32, i1 } @llvm.nvvm.shfl.sync.up.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call { float, i1 } @llvm.nvvm.shfl.sync.up.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call { i32, i1 } @llvm.nvvm.shfl.sync.down.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call { float, i1 } @llvm.nvvm.shfl.sync.down.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call { i32, i1 } @llvm.nvvm.shfl.sync.idx.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-  call { float, i1 } @llvm.nvvm.shfl.sync.idx.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
-// CHECK-LABEL: @nvvm_vote
-  call i32 @llvm.nvvm.vote.ballot.sync(i32 %{{.*}}, i1 %{{.*}})
-// CHECK-LABEL: @nvvm_mma_mn8n8k4_row_col_f32_f32
-  call { float, float, float, float, float, float, float, float } @llvm.nvvm.mma.m8n8k4.row.col.f32.f32
-// CHECK-LABEL: @nvvm_mma_m16n8k16_f16_f16
-  call { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f16
-// CHECK-LABEL: @nvvm_mma_m16n8k16_f32_f16
-  call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f16
-// CHECK-LABEL: @nvvm_mma_m16n8k16_f16_f32
-  call { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f32
-// CHECK-LABEL: @nvvm_mma_m16n8k16_f32_f32
-  call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f32
-// CHECK-LABEL: @nvvm_mma_m16n8k16_s8_s8
-  call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.s8
-// CHECK-LABEL: @nvvm_mma_m16n8k16_s8_u8
-  call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.satfinite.s8.u8
-// CHECK-LABEL: @nvvm_mma_m16n8k128_b1_b1
-  call { i32, i32, i32, i32 } @llvm.nvvm.mma.xor.popc.m16n8k128.row.col.b1
-// CHECK-LABEL: @nvvm_mma_m16n8k32_s4_s4
-  call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k32.row.col.satfinite.s4
-// CHECK-LABEL: @nvvm_mma_m8n8k4_f64_f64
-  call { double, double } @llvm.nvvm.mma.m8n8k4.row.col.f64
-// CHECK-LABEL: @nvvm_mma_m16n8k4_tf32_f32
-  call { float, float, float, float } @llvm.nvvm.mma.m16n8k4.row.col.tf32
-// CHECK-LABEL: @gpu_wmma_load_op
-  call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p3(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
-// CHECK-LABEL: @gpu_wmma_store_op
-  call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p3(ptr addrspace(3) %{{.*}}, <2 x half> {{.*}}, <2 x half> %{{.*}}, <2 x half> %{{.*}}, <2 x half> %{{.*}}, i32 %{{.*}})
-// CHECK-LABEL: @gpu_wmma_mma_op
-  call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
-// CHECK-LABEL: @nvvm_wmma_load_tf32
-  call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0(ptr %{{.*}}, i32 %{{.*}})
-// CHECK-LABEL: @nvvm_wmma_mma
-  { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})
-// CHECK-LABEL: @cp_async
-call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
-call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
-call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
-call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
-call void @llvm.nvvm.cp.async.commit.group()
-call void @llvm.nvvm.cp.async.wait.group(i32 0)
-// CHECK-LABEL: @ld_matrix
-  call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.b16.p3(ptr addrspace(3) %{{.*}})
-  call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.b16.p3(ptr addrspace(3) %{{.*}})
-  call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %{{.*}})
-  call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.trans.b16.p3(ptr addrspace(3) %{{.*}})
-  call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.trans.b16.p3(ptr addrspace(3) %{{.*}})
-  call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) %{{.*}})
-!nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-{ptr @kernel_func, !"kernel", i32 1}
-!nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-{ptr @kernel_func, !"kernel", i32 1}
-{ptr @kernel_func, !"maxntidx", i32 1}
-{ptr @kernel_func, !"maxntidy", i32 23}
-{ptr @kernel_func, !"maxntidz", i32 32}
-!nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-{ptr @kernel_func, !"kernel", i32 1}
-{ptr @kernel_func, !"reqntidx", i32 1}
-{ptr @kernel_func, !"reqntidy", i32 23}
-{ptr @kernel_func, !"reqntidz", i32 32}
-!nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-{ptr @kernel_func, !"kernel", i32 1}
-{ptr @kernel_func, !"minctasm", i32 16}
-!nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-{ptr @kernel_func, !"kernel", i32 1}
-{ptr @kernel_func, !"maxnreg", i32 16}
-!nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-{ptr @kernel_func, !"kernel", i32 1}
-{ptr @kernel_func, !"maxnreg", i32 32}
-{ptr @kernel_func, !"maxntidx", i32 1}
-{ptr @kernel_func, !"maxntidy", i32 23}
-{ptr @kernel_func, !"maxntidz", i32 32}
-{ptr @kernel_func, !"minctasm", i32 16}

>From 16cc248c52c919771b44c4767cf044ce2b839504 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <ivanov.i.aa at m.titech.ac.jp>
Date: Mon, 11 Dec 2023 14:25:50 +0900
Subject: [PATCH 08/11] Test intrinsic import

---
 mlir/test/Target/LLVMIR/Import/nvvmir.ll | 299 +++++++++++++----------
 1 file changed, 166 insertions(+), 133 deletions(-)

diff --git a/mlir/test/Target/LLVMIR/Import/nvvmir.ll b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
index cfe4d3862a898..6b29cf05e3a92 100644
--- a/mlir/test/Target/LLVMIR/Import/nvvmir.ll
+++ b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
@@ -1,175 +1,208 @@
 ; RUN: mlir-translate -import-llvm %s | FileCheck %s
 
+; CHECK-LABEL: @nvvm_special_regs
 define i32 @nvvm_special_regs() {
+  ; CHECK: = nvvm.read.ptx.sreg.tid.x : i32
   %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  ; CHECK: = nvvm.read.ptx.sreg.tid.y : i32
   %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  ; CHECK: = nvvm.read.ptx.sreg.tid.z : i32
   %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+  ; CHECK: = nvvm.read.ptx.sreg.ntid.x : i32
   %4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  ; CHECK: = nvvm.read.ptx.sreg.ntid.y : i32
   %5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+  ; CHECK: = nvvm.read.ptx.sreg.ntid.z : i32
   %6 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+  ; CHECK: = nvvm.read.ptx.sreg.ctaid.x : i32
   %7 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  ; CHECK: = nvvm.read.ptx.sreg.ctaid.y : i32
   %8 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+  ; CHECK: = nvvm.read.ptx.sreg.ctaid.z : i32
   %9 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+  ; CHECK: = nvvm.read.ptx.sreg.nctaid.x : i32
   %10 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+  ; CHECK: = nvvm.read.ptx.sreg.nctaid.y : i32
   %11 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+  ; CHECK: = nvvm.read.ptx.sreg.nctaid.z : i32
   %12 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+  ; CHECK: = nvvm.read.ptx.sreg.warpsize : i32
   %13 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  ; CHECK: = nvvm.read.ptx.sreg.laneid : i32
   %14 = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
+  ; CHECK: = nvvm.read.ptx.sreg.clusterid.x : i32
   %15 = call i32 @llvm.nvvm.read.ptx.sreg.clusterid.x()
+  ; CHECK: = nvvm.read.ptx.sreg.clusterid.y : i32
   %16 = call i32 @llvm.nvvm.read.ptx.sreg.clusterid.y()
+  ; CHECK: = nvvm.read.ptx.sreg.clusterid.z : i32
   %17 = call i32 @llvm.nvvm.read.ptx.sreg.clusterid.z()
+  ; CHECK: = nvvm.read.ptx.sreg.nclusterid.x : i32
   %18 = call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x()
+  ; CHECK: = nvvm.read.ptx.sreg.nclusterid.y : i32
   %19 = call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y()
+  ; CHECK: = nvvm.read.ptx.sreg.nclusterid.z : i32
   %20 = call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z()
+  ; CHECK: = nvvm.read.ptx.sreg.cluster.ctaid.x : i32
   %21 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
+  ; CHECK: = nvvm.read.ptx.sreg.cluster.ctaid.y : i32
   %22 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
+  ; CHECK: = nvvm.read.ptx.sreg.cluster.ctaid.z : i32
   %23 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
+  ; CHECK: = nvvm.read.ptx.sreg.cluster.nctaid.x : i32
   %24 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
+  ; CHECK: = nvvm.read.ptx.sreg.cluster.nctaid.y : i32
   %25 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
+  ; CHECK: = nvvm.read.ptx.sreg.cluster.nctaid.z : i32
   %26 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
+  ; CHECK: = nvvm.read.ptx.sreg.cluster.ctarank : i32
   %27 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank()
+  ; CHECK: = nvvm.read.ptx.sreg.cluster.nctarank : i32
   %28 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank()
   ret i32 %1
 }
 
+; CHECK-LABEL: @nvvm_rcp
 define float @nvvm_rcp(float %0) {
+  ; CHECK: = nvvm.rcp.approx.ftz.f %{{.*}} : f32
   %2 = call float @llvm.nvvm.rcp.approx.ftz.f(float %0)
   ret float %2
 }
 
-define void @llvm_nvvm_barrier0() {
-  call void @llvm.nvvm.barrier0()
-  ret void
-}
-
-define i32 @nvvm_shfl(i32 %0, i32 %1, i32 %2, i32 %3, float %4) {
-  %6 = call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 %0, i32 %3, i32 %1, i32 %2)
-  %7 = call float @llvm.nvvm.shfl.sync.bfly.f32(i32 %0, float %4, i32 %1, i32 %2)
-  %8 = call i32 @llvm.nvvm.shfl.sync.up.i32(i32 %0, i32 %3, i32 %1, i32 %2)
-  %9 = call float @llvm.nvvm.shfl.sync.up.f32(i32 %0, float %4, i32 %1, i32 %2)
-  %10 = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %0, i32 %3, i32 %1, i32 %2)
-  %11 = call float @llvm.nvvm.shfl.sync.down.f32(i32 %0, float %4, i32 %1, i32 %2)
-  %12 = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %0, i32 %3, i32 %1, i32 %2)
-  %13 = call float @llvm.nvvm.shfl.sync.idx.f32(i32 %0, float %4, i32 %1, i32 %2)
-  ret i32 %6
-}
-
-define { i32, i1 } @nvvm_shfl_pred(i32 %0, i32 %1, i32 %2, i32 %3, float %4) {
-  %6 = call { i32, i1 } @llvm.nvvm.shfl.sync.bfly.i32p(i32 %0, i32 %3, i32 %1, i32 %2)
-  %7 = call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 %0, float %4, i32 %1, i32 %2)
-  %8 = call { i32, i1 } @llvm.nvvm.shfl.sync.up.i32p(i32 %0, i32 %3, i32 %1, i32 %2)
-  %9 = call { float, i1 } @llvm.nvvm.shfl.sync.up.f32p(i32 %0, float %4, i32 %1, i32 %2)
-  %10 = call { i32, i1 } @llvm.nvvm.shfl.sync.down.i32p(i32 %0, i32 %3, i32 %1, i32 %2)
-  %11 = call { float, i1 } @llvm.nvvm.shfl.sync.down.f32p(i32 %0, float %4, i32 %1, i32 %2)
-  %12 = call { i32, i1 } @llvm.nvvm.shfl.sync.idx.i32p(i32 %0, i32 %3, i32 %1, i32 %2)
-  %13 = call { float, i1 } @llvm.nvvm.shfl.sync.idx.f32p(i32 %0, float %4, i32 %1, i32 %2)
-  ret { i32, i1 } %6
-}
-
-define i32 @nvvm_vote(i32 %0, i1 %1) {
-  %3 = call i32 @llvm.nvvm.vote.ballot.sync(i32 %0, i1 %1)
-  ret i32 %3
-}
-
-define { float, float, float, float, float, float, float, float } @nvvm_mma_mn8n8k4_row_col_f32_f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11) {
-  %13 = call { float, float, float, float, float, float, float, float } @llvm.nvvm.mma.m8n8k4.row.col.f32.f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11)
-  ret { float, float, float, float, float, float, float, float } %13
-}
-
-define { <2 x half>, <2 x half> } @nvvm_mma_m16n8k16_f16_f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7) {
-  %9 = call { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7)
-  ret { <2 x half>, <2 x half> } %9
-}
-
-define { float, float, float, float } @nvvm_mma_m16n8k16_f32_f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7) {
-  %9 = call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7)
-  ret { float, float, float, float } %9
-}
-
-define { <2 x half>, <2 x half> } @nvvm_mma_m16n8k16_f16_f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, float %6, float %7, float %8, float %9) {
-  %11 = call { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, float %6, float %7, float %8, float %9)
-  ret { <2 x half>, <2 x half> } %11
-}
-
-define { float, float, float, float } @nvvm_mma_m16n8k16_f32_f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, float %6, float %7, float %8, float %9) {
-  %11 = call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, float %6, float %7, float %8, float %9)
-  ret { float, float, float, float } %11
-}
-
-define { i32, i32, i32, i32 } @nvvm_mma_m16n8k16_s8_s8(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
-  %8 = call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.s8(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6)
-  ret { i32, i32, i32, i32 } %8
-}
-
-define { i32, i32, i32, i32 } @nvvm_mma_m16n8k16_s8_u8(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
-  %8 = call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.satfinite.s8.u8(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6)
-  ret { i32, i32, i32, i32 } %8
-}
-
-define { i32, i32, i32, i32 } @nvvm_mma_m16n8k128_b1_b1(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
-  %8 = call { i32, i32, i32, i32 } @llvm.nvvm.mma.xor.popc.m16n8k128.row.col.b1(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6)
-  ret { i32, i32, i32, i32 } %8
-}
-
-define { i32, i32, i32, i32 } @nvvm_mma_m16n8k32_s4_s4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
-  %8 = call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k32.row.col.satfinite.s4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6)
-  ret { i32, i32, i32, i32 } %8
-}
-
-define { double, double } @nvvm_mma_m8n8k4_f64_f64(double %0, double %1, double %2, double %3) {
-  %5 = call { double, double } @llvm.nvvm.mma.m8n8k4.row.col.f64(double %0, double %1, double %2, double %3)
-  ret { double, double } %5
-}
-
-define { float, float, float, float } @nvvm_mma_m16n8k4_tf32_f32(i32 %0, i32 %1, i32 %2, float %3, float %4, float %5, float %6) {
-  %8 = call { float, float, float, float } @llvm.nvvm.mma.m16n8k4.row.col.tf32(i32 %0, i32 %1, i32 %2, float %3, float %4, float %5, float %6)
-  ret { float, float, float, float } %8
-}
-
-define void @gpu_wmma_load_op(ptr addrspace(3) %0, i32 %1) {
-  %3 = call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p3(ptr addrspace(3) %0, i32 %1)
-  ret void
-}
-
-define void @gpu_wmma_store_op(ptr addrspace(3) %0, i32 %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5) {
-  call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p3(ptr addrspace(3) %0, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, i32 %1)
-  ret void
-}
-
-define void @gpu_wmma_mma_op(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7, <2 x half> %8, <2 x half> %9, <2 x half> %10, <2 x half> %11, <2 x half> %12, <2 x half> %13, <2 x half> %14, <2 x half> %15, <2 x half> %16, <2 x half> %17, <2 x half> %18, <2 x half> %19) {
-  %21 = call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7, <2 x half> %8, <2 x half> %9, <2 x half> %10, <2 x half> %11, <2 x half> %12, <2 x half> %13, <2 x half> %14, <2 x half> %15, <2 x half> %16, <2 x half> %17, <2 x half> %18, <2 x half> %19)
-  ret void
-}
-
-define void @nvvm_wmma_load_tf32(ptr %0, i32 %1) {
-  %3 = call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0(ptr %0, i32 %1)
-  ret void
-}
-
-define void @nvvm_wmma_mma(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, float %8, float %9, float %10, float %11, float %12, float %13, float %14, float %15) {
-  %17 = call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, float %8, float %9, float %10, float %11, float %12, float %13, float %14, float %15)
-  ret void
-}
-
-define void @cp_async(ptr addrspace(3) %0, ptr addrspace(1) %1) {
-  call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %0, ptr addrspace(1) %1)
-  call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %0, ptr addrspace(1) %1)
-  call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %0, ptr addrspace(1) %1)
-  call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %0, ptr addrspace(1) %1)
-  call void @llvm.nvvm.cp.async.commit.group()
-  call void @llvm.nvvm.cp.async.wait.group(i32 0)
-  ret void
-}
-
-define void @ld_matrix(ptr addrspace(3) %0) {
-  %2 = call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.b16.p3(ptr addrspace(3) %0)
-  %3 = call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.b16.p3(ptr addrspace(3) %0)
-  %4 = call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %0)
-  %5 = call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.trans.b16.p3(ptr addrspace(3) %0)
-  %6 = call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.trans.b16.p3(ptr addrspace(3) %0)
-  %7 = call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) %0)
-  ret void
-}
+; TODO Below intrinsics not yet supported
+
+; define void @llvm_nvvm_barrier0() {
+;   call void @llvm.nvvm.barrier0()
+;   ret void
+; }
+;
+; define i32 @nvvm_shfl(i32 %0, i32 %1, i32 %2, i32 %3, float %4) {
+;   %6 = call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 %0, i32 %3, i32 %1, i32 %2)
+;   %7 = call float @llvm.nvvm.shfl.sync.bfly.f32(i32 %0, float %4, i32 %1, i32 %2)
+;   %8 = call i32 @llvm.nvvm.shfl.sync.up.i32(i32 %0, i32 %3, i32 %1, i32 %2)
+;   %9 = call float @llvm.nvvm.shfl.sync.up.f32(i32 %0, float %4, i32 %1, i32 %2)
+;   %10 = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %0, i32 %3, i32 %1, i32 %2)
+;   %11 = call float @llvm.nvvm.shfl.sync.down.f32(i32 %0, float %4, i32 %1, i32 %2)
+;   %12 = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %0, i32 %3, i32 %1, i32 %2)
+;   %13 = call float @llvm.nvvm.shfl.sync.idx.f32(i32 %0, float %4, i32 %1, i32 %2)
+;   ret i32 %6
+; }
+;
+; define { i32, i1 } @nvvm_shfl_pred(i32 %0, i32 %1, i32 %2, i32 %3, float %4) {
+;   %6 = call { i32, i1 } @llvm.nvvm.shfl.sync.bfly.i32p(i32 %0, i32 %3, i32 %1, i32 %2)
+;   %7 = call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 %0, float %4, i32 %1, i32 %2)
+;   %8 = call { i32, i1 } @llvm.nvvm.shfl.sync.up.i32p(i32 %0, i32 %3, i32 %1, i32 %2)
+;   %9 = call { float, i1 } @llvm.nvvm.shfl.sync.up.f32p(i32 %0, float %4, i32 %1, i32 %2)
+;   %10 = call { i32, i1 } @llvm.nvvm.shfl.sync.down.i32p(i32 %0, i32 %3, i32 %1, i32 %2)
+;   %11 = call { float, i1 } @llvm.nvvm.shfl.sync.down.f32p(i32 %0, float %4, i32 %1, i32 %2)
+;   %12 = call { i32, i1 } @llvm.nvvm.shfl.sync.idx.i32p(i32 %0, i32 %3, i32 %1, i32 %2)
+;   %13 = call { float, i1 } @llvm.nvvm.shfl.sync.idx.f32p(i32 %0, float %4, i32 %1, i32 %2)
+;   ret { i32, i1 } %6
+; }
+;
+; define i32 @nvvm_vote(i32 %0, i1 %1) {
+;   %3 = call i32 @llvm.nvvm.vote.ballot.sync(i32 %0, i1 %1)
+;   ret i32 %3
+; }
+;
+; define { float, float, float, float, float, float, float, float } @nvvm_mma_mn8n8k4_row_col_f32_f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11) {
+;   %13 = call { float, float, float, float, float, float, float, float } @llvm.nvvm.mma.m8n8k4.row.col.f32.f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11)
+;   ret { float, float, float, float, float, float, float, float } %13
+; }
+;
+; define { <2 x half>, <2 x half> } @nvvm_mma_m16n8k16_f16_f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7) {
+;   %9 = call { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7)
+;   ret { <2 x half>, <2 x half> } %9
+; }
+;
+; define { float, float, float, float } @nvvm_mma_m16n8k16_f32_f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7) {
+;   %9 = call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7)
+;   ret { float, float, float, float } %9
+; }
+;
+; define { <2 x half>, <2 x half> } @nvvm_mma_m16n8k16_f16_f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, float %6, float %7, float %8, float %9) {
+;   %11 = call { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, float %6, float %7, float %8, float %9)
+;   ret { <2 x half>, <2 x half> } %11
+; }
+;
+; define { float, float, float, float } @nvvm_mma_m16n8k16_f32_f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, float %6, float %7, float %8, float %9) {
+;   %11 = call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f32(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, float %6, float %7, float %8, float %9)
+;   ret { float, float, float, float } %11
+; }
+;
+; define { i32, i32, i32, i32 } @nvvm_mma_m16n8k16_s8_s8(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
+;   %8 = call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.s8(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6)
+;   ret { i32, i32, i32, i32 } %8
+; }
+;
+; define { i32, i32, i32, i32 } @nvvm_mma_m16n8k16_s8_u8(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
+;   %8 = call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.satfinite.s8.u8(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6)
+;   ret { i32, i32, i32, i32 } %8
+; }
+;
+; define { i32, i32, i32, i32 } @nvvm_mma_m16n8k128_b1_b1(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
+;   %8 = call { i32, i32, i32, i32 } @llvm.nvvm.mma.xor.popc.m16n8k128.row.col.b1(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6)
+;   ret { i32, i32, i32, i32 } %8
+; }
+;
+; define { i32, i32, i32, i32 } @nvvm_mma_m16n8k32_s4_s4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
+;   %8 = call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k32.row.col.satfinite.s4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6)
+;   ret { i32, i32, i32, i32 } %8
+; }
+;
+; define { double, double } @nvvm_mma_m8n8k4_f64_f64(double %0, double %1, double %2, double %3) {
+;   %5 = call { double, double } @llvm.nvvm.mma.m8n8k4.row.col.f64(double %0, double %1, double %2, double %3)
+;   ret { double, double } %5
+; }
+;
+; define { float, float, float, float } @nvvm_mma_m16n8k4_tf32_f32(i32 %0, i32 %1, i32 %2, float %3, float %4, float %5, float %6) {
+;   %8 = call { float, float, float, float } @llvm.nvvm.mma.m16n8k4.row.col.tf32(i32 %0, i32 %1, i32 %2, float %3, float %4, float %5, float %6)
+;   ret { float, float, float, float } %8
+; }
+;
+; define void @gpu_wmma_load_op(ptr addrspace(3) %0, i32 %1) {
+;   %3 = call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p3(ptr addrspace(3) %0, i32 %1)
+;   ret void
+; }
+;
+; define void @gpu_wmma_store_op(ptr addrspace(3) %0, i32 %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5) {
+;   call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p3(ptr addrspace(3) %0, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, i32 %1)
+;   ret void
+; }
+;
+; define void @gpu_wmma_mma_op(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7, <2 x half> %8, <2 x half> %9, <2 x half> %10, <2 x half> %11, <2 x half> %12, <2 x half> %13, <2 x half> %14, <2 x half> %15, <2 x half> %16, <2 x half> %17, <2 x half> %18, <2 x half> %19) {
+;   %21 = call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16(<2 x half> %0, <2 x half> %1, <2 x half> %2, <2 x half> %3, <2 x half> %4, <2 x half> %5, <2 x half> %6, <2 x half> %7, <2 x half> %8, <2 x half> %9, <2 x half> %10, <2 x half> %11, <2 x half> %12, <2 x half> %13, <2 x half> %14, <2 x half> %15, <2 x half> %16, <2 x half> %17, <2 x half> %18, <2 x half> %19)
+;   ret void
+; }
+;
+; define void @nvvm_wmma_load_tf32(ptr %0, i32 %1) {
+;   %3 = call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0(ptr %0, i32 %1)
+;   ret void
+; }
+;
+; define void @nvvm_wmma_mma(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, float %8, float %9, float %10, float %11, float %12, float %13, float %14, float %15) {
+;   %17 = call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, float %8, float %9, float %10, float %11, float %12, float %13, float %14, float %15)
+;   ret void
+; }
+;
+; define void @cp_async(ptr addrspace(3) %0, ptr addrspace(1) %1) {
+;   call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %0, ptr addrspace(1) %1)
+;   call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %0, ptr addrspace(1) %1)
+;   call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %0, ptr addrspace(1) %1)
+;   call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %0, ptr addrspace(1) %1)
+;   call void @llvm.nvvm.cp.async.commit.group()
+;   call void @llvm.nvvm.cp.async.wait.group(i32 0)
+;   ret void
+; }
+;
+; define void @ld_matrix(ptr addrspace(3) %0) {
+;   %2 = call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.b16.p3(ptr addrspace(3) %0)
+;   %3 = call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.b16.p3(ptr addrspace(3) %0)
+;   %4 = call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %0)
+;   %5 = call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.trans.b16.p3(ptr addrspace(3) %0)
+;   %6 = call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.trans.b16.p3(ptr addrspace(3) %0)
+;   %7 = call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) %0)
+;   ret void
+; }
 
 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
 declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

>From 91851d0cea2c725892297af4dd7b3d051d6db5d5 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <ivanov.i.aa at m.titech.ac.jp>
Date: Mon, 11 Dec 2023 14:28:59 +0900
Subject: [PATCH 09/11] Remove nvvm from llvm intrisic test

---
 mlir/test/Target/LLVMIR/Import/intrinsic.ll | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
index 3f7705ea78979..c8dcde11d93e6 100644
--- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll
+++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
@@ -878,14 +878,6 @@ define float @ssa_copy(float %0) {
   ret float %2
 }
 
-; CHECK-LABEL: llvm.func @nvvm
-define void @nvvm() {
-  ; CHECK: %{{.*}} = nvvm.read.ptx.sreg.ntid.x : i32
-  %1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  ret void
-}
-
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 declare float @llvm.fmuladd.f32(float, float, float)
 declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>)
 declare float @llvm.fma.f32(float, float, float)

>From 6ec007c54bf6615898cd6cb57bad9c7f4e72c085 Mon Sep 17 00:00:00 2001
From: "Ivan R. Ivanov" <ivanov.i.aa at m.titech.ac.jp>
Date: Mon, 11 Dec 2023 17:05:30 +0900
Subject: [PATCH 10/11] Update mlir/test/Target/LLVMIR/Import/nvvmir.ll

Co-authored-by: Tobias Gysi <tobias.gysi at nextsilicon.com>
---
 mlir/test/Target/LLVMIR/Import/nvvmir.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/test/Target/LLVMIR/Import/nvvmir.ll b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
index 6b29cf05e3a92..1f8255ddffa65 100644
--- a/mlir/test/Target/LLVMIR/Import/nvvmir.ll
+++ b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
@@ -68,7 +68,7 @@ define float @nvvm_rcp(float %0) {
   ret float %2
 }
 
-; TODO Below intrinsics not yet supported
+; TODO: Support the intrinsics below once they derive from NVVM_IntrOp rather than from NVVM_Op.
 
 ; define void @llvm_nvvm_barrier0() {
 ;   call void @llvm.nvvm.barrier0()

>From 02a3f6f9d430216224bc53f0cc740c8fc1818572 Mon Sep 17 00:00:00 2001
From: "Ivan R. Ivanov" <ivanov.i.aa at m.titech.ac.jp>
Date: Mon, 11 Dec 2023 17:07:07 +0900
Subject: [PATCH 11/11] Update mlir/test/Target/LLVMIR/Import/nvvmir.ll

Co-authored-by: Christian Ulmann <christianulmann at gmail.com>
---
 mlir/test/Target/LLVMIR/Import/nvvmir.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/test/Target/LLVMIR/Import/nvvmir.ll b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
index 1f8255ddffa65..6369e2dd3b517 100644
--- a/mlir/test/Target/LLVMIR/Import/nvvmir.ll
+++ b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
@@ -205,7 +205,7 @@ define float @nvvm_rcp(float %0) {
 ; }
 
 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 
 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
 declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.y() #0