r359248 - [CUDA] Implemented _[bi]mma* builtins.

Artem Belevich via cfe-commits cfe-commits@lists.llvm.org
Thu Apr 25 15:28:09 PDT 2019


Author: tra
Date: Thu Apr 25 15:28:09 2019
New Revision: 359248

URL: http://llvm.org/viewvc/llvm-project?rev=359248&view=rev
Log:
[CUDA] Implemented _[bi]mma* builtins.

These builtins provide access to the new integer and
sub-integer variants of MMA (matrix multiply-accumulate) instructions
introduced in CUDA-10.x for sm_75 (a.k.a. Turing) GPUs.
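
For reference, the new builtins compose as follows. This is an
illustrative sketch only: the function and argument names are
placeholders, while the fragment sizes (2/2/8/8 ints for the A/B/C/D
fragments of the m16n16k16 s8 variant) and the constraints (sm_72 or
newer with PTX 6.3 or newer) follow BuiltinsNVPTX.def and
CGBuiltin.cpp below:

  __device__ void imma_s8_sketch(int *d, const int *a, const int *b,
                                 const int *c, int ldm) {
    // Per-thread fragments; sizes per getNVPTXMmaLdstInfo() below.
    int fragA[2], fragB[2], fragC[8], fragD[8];
    __imma_m16n16k16_ld_a_s8(fragA, a, ldm, 0);  // 0 = row-major,
    __imma_m16n16k16_ld_b_s8(fragB, b, ldm, 0);  // 1 = col-major
    __imma_m16n16k16_ld_c(fragC, c, ldm, 0);
    __imma_m16n16k16_mma_s8(fragD, fragA, fragB, fragC,
                            0 /*row.row layout*/, 0 /*satf off*/);
    __imma_m16n16k16_st_c_i32(d, fragD, ldm, 0);
  }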

Also added a feature for PTX 6.4. While Clang/LLVM does not generate
any PTX instructions that need it, we still need to pass it through to
ptxas so that it can compile code that uses the new 'mma' instruction
as inline assembly (e.g., as used by NVIDIA's CUTLASS library:
https://github.com/NVIDIA/cutlass/blob/master/cutlass/arch/mma.h#L101)
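
The inline assembly in question looks roughly like the sketch below
(one fp16 variant of mma.sync; names are placeholders, and the
authoritative operand shapes are in the PTX ISA documentation and the
CUTLASS header linked above):

  __device__ void mma_ptx_sketch(float d[8], unsigned const A[2],
                                 unsigned const B[2], float const C[8]) {
    // ptxas only accepts the 'mma' mnemonic when the module is built
    // with PTX 6.4, i.e. the new +ptx64 target feature.
    asm volatile(
        "mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
        "{%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
        "{%12,%13,%14,%15,%16,%17,%18,%19};"
        : "=f"(d[0]), "=f"(d[1]), "=f"(d[2]), "=f"(d[3]),
          "=f"(d[4]), "=f"(d[5]), "=f"(d[6]), "=f"(d[7])
        : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]),
          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
          "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7]));
  }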

Differential Revision: https://reviews.llvm.org/D60279

Added:
    cfe/trunk/test/CodeGen/builtins-nvptx-mma.cu
    cfe/trunk/test/CodeGen/builtins-nvptx-mma.py
Modified:
    cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
    cfe/trunk/lib/Basic/Targets/NVPTX.cpp
    cfe/trunk/lib/CodeGen/CGBuiltin.cpp
    cfe/trunk/lib/Driver/ToolChains/Cuda.cpp

Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def?rev=359248&r1=359247&r2=359248&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def (original)
+++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Thu Apr 25 15:28:09 2019
@@ -18,13 +18,22 @@
 #endif
 
 #pragma push_macro("SM_70")
-#define SM_70 "sm_70|sm_71"
+#pragma push_macro("SM_72")
+#pragma push_macro("SM_75")
+#define SM_75 "sm_75"
+#define SM_72 "sm_72|" SM_75
+#define SM_70 "sm_70|" SM_72
+
 #pragma push_macro("SM_60")
 #define SM_60 "sm_60|sm_61|sm_62|" SM_70
 
-#pragma push_macro("PTX61")
-#define PTX61 "ptx61"
 #pragma push_macro("PTX60")
+#pragma push_macro("PTX61")
+#pragma push_macro("PTX63")
+#pragma push_macro("PTX64")
+#define PTX64 "ptx64"
+#define PTX63 "ptx63|" PTX64
+#define PTX61 "ptx61|" PTX63
 #define PTX60 "ptx60|" PTX61
 
 #pragma push_macro("AND")
@@ -666,10 +675,53 @@ TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f1
 TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", AND(SM_70,PTX61))
 TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", AND(SM_70,PTX61))
 
+// Builtins to support integer and sub-integer WMMA instructions on sm_72/sm_75
+TARGET_BUILTIN(__bmma_m8n8k128_ld_a_b1, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__bmma_m8n8k128_ld_b_b1, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__bmma_m8n8k128_ld_c, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__bmma_m8n8k128_mma_xor_popc_b1, "vi*iC*iC*iC*Ii", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__bmma_m8n8k128_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_st_c_i32, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_st_c_i32, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_st_c_i32, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_a_s4, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_a_u4, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_b_s4, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_b_u4, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_c, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_mma_s4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_mma_u4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
 #pragma pop_macro("AND")
 #pragma pop_macro("SM_60")
 #pragma pop_macro("SM_70")
+#pragma pop_macro("SM_72")
+#pragma pop_macro("SM_75")
 #pragma pop_macro("PTX60")
 #pragma pop_macro("PTX61")
+#pragma pop_macro("PTX63")
+#pragma pop_macro("PTX64")

Modified: cfe/trunk/lib/Basic/Targets/NVPTX.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/NVPTX.cpp?rev=359248&r1=359247&r2=359248&view=diff
==============================================================================
--- cfe/trunk/lib/Basic/Targets/NVPTX.cpp (original)
+++ cfe/trunk/lib/Basic/Targets/NVPTX.cpp Thu Apr 25 15:28:09 2019
@@ -44,6 +44,8 @@ NVPTXTargetInfo::NVPTXTargetInfo(const l
     if (!Feature.startswith("+ptx"))
       continue;
     PTXVersion = llvm::StringSwitch<unsigned>(Feature)
+                     .Case("+ptx64", 64)
+                     .Case("+ptx63", 63)
                      .Case("+ptx61", 61)
                      .Case("+ptx60", 60)
                      .Case("+ptx50", 50)

Modified: cfe/trunk/lib/CodeGen/CGBuiltin.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGBuiltin.cpp?rev=359248&r1=359247&r2=359248&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGBuiltin.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp Thu Apr 25 15:28:09 2019
@@ -12925,8 +12925,252 @@ Value *CodeGenFunction::EmitSystemZBuilt
   }
 }
 
-Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
-                                             const CallExpr *E) {
+namespace {
+// Helper classes for mapping MMA builtins to the particular LLVM intrinsic variant.
+struct NVPTXMmaLdstInfo {
+  unsigned NumResults;  // Number of elements to load/store
+  // Intrinsic IDs for row/col variants; 0 if the layout is unsupported.
+  unsigned IID_col;
+  unsigned IID_row;
+};
+
+#define MMA_INTR(geom_op_type, layout) \
+  Intrinsic::nvvm_wmma_##geom_op_type##_##layout##_stride
+#define MMA_LDST(n, geom_op_type)                                              \
+  { n, MMA_INTR(geom_op_type, col), MMA_INTR(geom_op_type, row) }
+
+static NVPTXMmaLdstInfo getNVPTXMmaLdstInfo(unsigned BuiltinID) {
+  switch (BuiltinID) {
+  // FP MMA loads
+  case NVPTX::BI__hmma_m16n16k16_ld_a:
+    return MMA_LDST(8, m16n16k16_load_a_f16);
+  case NVPTX::BI__hmma_m16n16k16_ld_b:
+    return MMA_LDST(8, m16n16k16_load_b_f16);
+  case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
+    return MMA_LDST(4, m16n16k16_load_c_f16);
+  case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
+    return MMA_LDST(8, m16n16k16_load_c_f32);
+  case NVPTX::BI__hmma_m32n8k16_ld_a:
+    return MMA_LDST(8, m32n8k16_load_a_f16);
+  case NVPTX::BI__hmma_m32n8k16_ld_b:
+    return MMA_LDST(8, m32n8k16_load_b_f16);
+  case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
+    return MMA_LDST(4, m32n8k16_load_c_f16);
+  case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
+    return MMA_LDST(8, m32n8k16_load_c_f32);
+  case NVPTX::BI__hmma_m8n32k16_ld_a:
+    return MMA_LDST(8, m8n32k16_load_a_f16);
+  case NVPTX::BI__hmma_m8n32k16_ld_b:
+    return MMA_LDST(8, m8n32k16_load_b_f16);
+  case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
+    return MMA_LDST(4, m8n32k16_load_c_f16);
+  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
+    return MMA_LDST(8, m8n32k16_load_c_f32);
+
+  // Integer MMA loads
+  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
+    return MMA_LDST(2, m16n16k16_load_a_s8);
+  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
+    return MMA_LDST(2, m16n16k16_load_a_u8);
+  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
+    return MMA_LDST(2, m16n16k16_load_b_s8);
+  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
+    return MMA_LDST(2, m16n16k16_load_b_u8);
+  case NVPTX::BI__imma_m16n16k16_ld_c:
+    return MMA_LDST(8, m16n16k16_load_c_s32);
+  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
+    return MMA_LDST(4, m32n8k16_load_a_s8);
+  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
+    return MMA_LDST(4, m32n8k16_load_a_u8);
+  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
+    return MMA_LDST(1, m32n8k16_load_b_s8);
+  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
+    return MMA_LDST(1, m32n8k16_load_b_u8);
+  case NVPTX::BI__imma_m32n8k16_ld_c:
+    return MMA_LDST(8, m32n8k16_load_c_s32);
+  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
+    return MMA_LDST(1, m8n32k16_load_a_s8);
+  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
+    return MMA_LDST(1, m8n32k16_load_a_u8);
+  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
+    return MMA_LDST(4, m8n32k16_load_b_s8);
+  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
+    return MMA_LDST(4, m8n32k16_load_b_u8);
+  case NVPTX::BI__imma_m8n32k16_ld_c:
+    return MMA_LDST(8, m8n32k16_load_c_s32);
+
+  // Sub-integer MMA loads.
+  // Only row/col layout is supported by A/B fragments.
+  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
+    return {1, 0, MMA_INTR(m8n8k32_load_a_s4, row)};
+  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
+    return {1, 0, MMA_INTR(m8n8k32_load_a_u4, row)};
+  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
+    return {1, MMA_INTR(m8n8k32_load_b_s4, col), 0};
+  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
+    return {1, MMA_INTR(m8n8k32_load_b_u4, col), 0};
+  case NVPTX::BI__imma_m8n8k32_ld_c:
+    return MMA_LDST(2, m8n8k32_load_c_s32);
+  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
+    return {1, 0, MMA_INTR(m8n8k128_load_a_b1, row)};
+  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
+    return {1, MMA_INTR(m8n8k128_load_b_b1, col), 0};
+  case NVPTX::BI__bmma_m8n8k128_ld_c:
+    return MMA_LDST(2, m8n8k128_load_c_s32);
+
+  // NOTE: We need to follow the inconsistent naming scheme used by NVCC.
+  // Unlike PTX and LLVM IR, where stores always use fragment D, NVCC
+  // builtins always use fragment C for both loads and stores.
+  // FP MMA stores.
+  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
+    return MMA_LDST(4, m16n16k16_store_d_f16);
+  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
+    return MMA_LDST(8, m16n16k16_store_d_f32);
+  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
+    return MMA_LDST(4, m32n8k16_store_d_f16);
+  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
+    return MMA_LDST(8, m32n8k16_store_d_f32);
+  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
+    return MMA_LDST(4, m8n32k16_store_d_f16);
+  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
+    return MMA_LDST(8, m8n32k16_store_d_f32);
+
+  // Integer and sub-integer MMA stores.
+  // Another naming quirk. Unlike other MMA builtins that use PTX types in the
+  // name, integer loads/stores use LLVM's i32.
+  case NVPTX::BI__imma_m16n16k16_st_c_i32:
+    return MMA_LDST(8, m16n16k16_store_d_s32);
+  case NVPTX::BI__imma_m32n8k16_st_c_i32:
+    return MMA_LDST(8, m32n8k16_store_d_s32);
+  case NVPTX::BI__imma_m8n32k16_st_c_i32:
+    return MMA_LDST(8, m8n32k16_store_d_s32);
+  case NVPTX::BI__imma_m8n8k32_st_c_i32:
+    return MMA_LDST(2, m8n8k32_store_d_s32);
+  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
+    return MMA_LDST(2, m8n8k128_store_d_s32);
+
+  default:
+    llvm_unreachable("Unknown MMA builtin");
+  }
+}
+#undef MMA_LDST
+#undef MMA_INTR
+
+
+struct NVPTXMmaInfo {
+  unsigned NumEltsA;
+  unsigned NumEltsB;
+  unsigned NumEltsC;
+  unsigned NumEltsD;
+  std::array<unsigned, 8> Variants;
+
+  unsigned getMMAIntrinsic(int Layout, bool Satf) {
+    unsigned Index = Layout * 2 + Satf;
+    if (Index >= Variants.size())
+      return 0;
+    return Variants[Index];
+  }
+};
+
+// Returns an intrinsic that matches Layout and Satf for valid combinations
+// of Layout and Satf, 0 otherwise.
+static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
+  // clang-format off
+#define MMA_VARIANTS(geom, type) {{                                 \
+      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type,             \
+      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
+      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
+      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
+      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type,             \
+      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
+      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type,             \
+      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite  \
+    }}
+// Sub-integer MMA only supports row.col layout.
+#define MMA_VARIANTS_I4(geom, type) {{ \
+      0, \
+      0, \
+      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
+      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
+      0, \
+      0, \
+      0, \
+      0  \
+    }}
+// b1 MMA does not support .satfinite.
+#define MMA_VARIANTS_B1(geom, type) {{ \
+      0, \
+      0, \
+      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
+      0, \
+      0, \
+      0, \
+      0, \
+      0  \
+    }}
+    // clang-format on
+    switch (BuiltinID) {
+    // FP MMA
+    // Note that 'type' argument of MMA_VARIANT uses D_C notation, while
+    // NumEltsN of return value are ordered as A,B,C,D.
+    case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
+      return {8, 8, 4, 4, MMA_VARIANTS(m16n16k16, f16_f16)};
+    case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
+      return {8, 8, 4, 8, MMA_VARIANTS(m16n16k16, f32_f16)};
+    case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
+      return {8, 8, 8, 4, MMA_VARIANTS(m16n16k16, f16_f32)};
+    case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
+      return {8, 8, 8, 8, MMA_VARIANTS(m16n16k16, f32_f32)};
+    case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
+      return {8, 8, 4, 4, MMA_VARIANTS(m32n8k16, f16_f16)};
+    case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
+      return {8, 8, 4, 8, MMA_VARIANTS(m32n8k16, f32_f16)};
+    case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
+      return {8, 8, 8, 4, MMA_VARIANTS(m32n8k16, f16_f32)};
+    case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
+      return {8, 8, 8, 8, MMA_VARIANTS(m32n8k16, f32_f32)};
+    case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
+      return {8, 8, 4, 4, MMA_VARIANTS(m8n32k16, f16_f16)};
+    case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
+      return {8, 8, 4, 8, MMA_VARIANTS(m8n32k16, f32_f16)};
+    case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
+      return {8, 8, 8, 4, MMA_VARIANTS(m8n32k16, f16_f32)};
+    case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
+      return {8, 8, 8, 8, MMA_VARIANTS(m8n32k16, f32_f32)};
+
+    // Integer MMA
+    case NVPTX::BI__imma_m16n16k16_mma_s8:
+      return {2, 2, 8, 8, MMA_VARIANTS(m16n16k16, s8)};
+    case NVPTX::BI__imma_m16n16k16_mma_u8:
+      return {2, 2, 8, 8, MMA_VARIANTS(m16n16k16, u8)};
+    case NVPTX::BI__imma_m32n8k16_mma_s8:
+      return {4, 1, 8, 8, MMA_VARIANTS(m32n8k16, s8)};
+    case NVPTX::BI__imma_m32n8k16_mma_u8:
+      return {4, 1, 8, 8, MMA_VARIANTS(m32n8k16, u8)};
+    case NVPTX::BI__imma_m8n32k16_mma_s8:
+      return {1, 4, 8, 8, MMA_VARIANTS(m8n32k16, s8)};
+    case NVPTX::BI__imma_m8n32k16_mma_u8:
+      return {1, 4, 8, 8, MMA_VARIANTS(m8n32k16, u8)};
+
+    // Sub-integer MMA
+    case NVPTX::BI__imma_m8n8k32_mma_s4:
+      return {1, 1, 2, 2, MMA_VARIANTS_I4(m8n8k32, s4)};
+    case NVPTX::BI__imma_m8n8k32_mma_u4:
+      return {1, 1, 2, 2, MMA_VARIANTS_I4(m8n8k32, u4)};
+    case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
+      return {1, 1, 2, 2, MMA_VARIANTS_B1(m8n8k128, b1)};
+    default:
+      llvm_unreachable("Unexpected builtin ID.");
+    }
+#undef MMA_VARIANTS
+#undef MMA_VARIANTS_I4
+#undef MMA_VARIANTS_B1
+}
+
+} // namespace
+
+Value *
+CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E) {
   auto MakeLdg = [&](unsigned IntrinsicID) {
     Value *Ptr = EmitScalarExpr(E->getArg(0));
     clang::CharUnits Align =
@@ -13189,6 +13433,8 @@ Value *CodeGenFunction::EmitNVPTXBuiltin
     Builder.CreateStore(Pred, PredOutPtr);
     return Builder.CreateExtractValue(ResultPair, 0);
   }
+
+  // FP MMA loads
   case NVPTX::BI__hmma_m16n16k16_ld_a:
   case NVPTX::BI__hmma_m16n16k16_ld_b:
   case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
@@ -13200,7 +13446,33 @@ Value *CodeGenFunction::EmitNVPTXBuiltin
   case NVPTX::BI__hmma_m8n32k16_ld_a:
   case NVPTX::BI__hmma_m8n32k16_ld_b:
   case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
-  case NVPTX::BI__hmma_m8n32k16_ld_c_f32: {
+  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
+  // Integer MMA loads.
+  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
+  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
+  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
+  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
+  case NVPTX::BI__imma_m16n16k16_ld_c:
+  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
+  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
+  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
+  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
+  case NVPTX::BI__imma_m32n8k16_ld_c:
+  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
+  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
+  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
+  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
+  case NVPTX::BI__imma_m8n32k16_ld_c:
+  // Sub-integer MMA loads.
+  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
+  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
+  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
+  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
+  case NVPTX::BI__imma_m8n8k32_ld_c:
+  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
+  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
+  case NVPTX::BI__bmma_m8n8k128_ld_c:
+  {
     Address Dst = EmitPointerWithAlignment(E->getArg(0));
     Value *Src = EmitScalarExpr(E->getArg(1));
     Value *Ldm = EmitScalarExpr(E->getArg(2));
@@ -13208,82 +13480,28 @@ Value *CodeGenFunction::EmitNVPTXBuiltin
     if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext()))
       return nullptr;
     bool isColMajor = isColMajorArg.getSExtValue();
-    unsigned IID;
-    unsigned NumResults;
-    switch (BuiltinID) {
-    case NVPTX::BI__hmma_m16n16k16_ld_a:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride
-                       : Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride;
-      NumResults = 8;
-      break;
-    case NVPTX::BI__hmma_m16n16k16_ld_b:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride
-                       : Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride;
-      NumResults = 8;
-      break;
-    case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride
-                       : Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride;
-      NumResults = 4;
-      break;
-    case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride
-                       : Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride;
-      NumResults = 8;
-      break;
-    case NVPTX::BI__hmma_m32n8k16_ld_a:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride
-                       : Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride;
-      NumResults = 8;
-      break;
-    case NVPTX::BI__hmma_m32n8k16_ld_b:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride
-                       : Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride;
-      NumResults = 8;
-      break;
-    case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride
-                       : Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride;
-      NumResults = 4;
-      break;
-    case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride
-                       : Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride;
-      NumResults = 8;
-      break;
-    case NVPTX::BI__hmma_m8n32k16_ld_a:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride
-                       : Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride;
-      NumResults = 8;
-      break;
-    case NVPTX::BI__hmma_m8n32k16_ld_b:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride
-                       : Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride;
-      NumResults = 8;
-      break;
-    case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride
-                       : Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride;
-      NumResults = 4;
-      break;
-    case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride
-                       : Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride;
-      NumResults = 8;
-      break;
-    default:
-      llvm_unreachable("Unexpected builtin ID.");
-    }
+    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
+    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
+    if (IID == 0)
+      return nullptr;
+
     Value *Result =
         Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});
 
     // Save returned values.
-    for (unsigned i = 0; i < NumResults; ++i) {
-      Builder.CreateAlignedStore(
-          Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
-                                Dst.getElementType()),
-          Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)),
-          CharUnits::fromQuantity(4));
+    assert(II.NumResults);
+    if (II.NumResults == 1) {
+      Builder.CreateAlignedStore(Result, Dst.getPointer(),
+                                 CharUnits::fromQuantity(4));
+    } else {
+      for (unsigned i = 0; i < II.NumResults; ++i) {
+        Builder.CreateAlignedStore(
+            Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
+                                  Dst.getElementType()),
+            Builder.CreateGEP(Dst.getPointer(),
+                              llvm::ConstantInt::get(IntTy, i)),
+            CharUnits::fromQuantity(4));
+      }
     }
     return Result;
   }
@@ -13293,7 +13511,12 @@ Value *CodeGenFunction::EmitNVPTXBuiltin
   case NVPTX::BI__hmma_m32n8k16_st_c_f16:
   case NVPTX::BI__hmma_m32n8k16_st_c_f32:
   case NVPTX::BI__hmma_m8n32k16_st_c_f16:
-  case NVPTX::BI__hmma_m8n32k16_st_c_f32: {
+  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
+  case NVPTX::BI__imma_m16n16k16_st_c_i32:
+  case NVPTX::BI__imma_m32n8k16_st_c_i32:
+  case NVPTX::BI__imma_m8n32k16_st_c_i32:
+  case NVPTX::BI__imma_m8n8k32_st_c_i32:
+  case NVPTX::BI__bmma_m8n8k128_st_c_i32: {
     Value *Dst = EmitScalarExpr(E->getArg(0));
     Address Src = EmitPointerWithAlignment(E->getArg(1));
     Value *Ldm = EmitScalarExpr(E->getArg(2));
@@ -13301,45 +13524,15 @@ Value *CodeGenFunction::EmitNVPTXBuiltin
     if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext()))
       return nullptr;
     bool isColMajor = isColMajorArg.getSExtValue();
-    unsigned IID;
-    unsigned NumResults = 8;
-    // PTX Instructions (and LLVM intrinsics) are defined for slice _d_, yet
-    // for some reason nvcc builtins use _c_.
-    switch (BuiltinID) {
-    case NVPTX::BI__hmma_m16n16k16_st_c_f16:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride
-                       : Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride;
-      NumResults = 4;
-      break;
-    case NVPTX::BI__hmma_m16n16k16_st_c_f32:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride
-                       : Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride;
-      break;
-    case NVPTX::BI__hmma_m32n8k16_st_c_f16:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride
-                       : Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride;
-      NumResults = 4;
-      break;
-    case NVPTX::BI__hmma_m32n8k16_st_c_f32:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride
-                       : Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride;
-      break;
-    case NVPTX::BI__hmma_m8n32k16_st_c_f16:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride
-                       : Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride;
-      NumResults = 4;
-      break;
-    case NVPTX::BI__hmma_m8n32k16_st_c_f32:
-      IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride
-                       : Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride;
-      break;
-    default:
-      llvm_unreachable("Unexpected builtin ID.");
-    }
-    Function *Intrinsic = CGM.getIntrinsic(IID, Dst->getType());
+    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
+    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
+    if (IID == 0)
+      return nullptr;
+    Function *Intrinsic =
+        CGM.getIntrinsic(IID, Dst->getType());
     llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
     SmallVector<Value *, 10> Values = {Dst};
-    for (unsigned i = 0; i < NumResults; ++i) {
+    for (unsigned i = 0; i < II.NumResults; ++i) {
       Value *V = Builder.CreateAlignedLoad(
           Builder.CreateGEP(Src.getPointer(), llvm::ConstantInt::get(IntTy, i)),
           CharUnits::fromQuantity(4));
@@ -13363,7 +13556,16 @@ Value *CodeGenFunction::EmitNVPTXBuiltin
   case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
   case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
   case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
-  case NVPTX::BI__hmma_m8n32k16_mma_f16f32: {
+  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
+  case NVPTX::BI__imma_m16n16k16_mma_s8:
+  case NVPTX::BI__imma_m16n16k16_mma_u8:
+  case NVPTX::BI__imma_m32n8k16_mma_s8:
+  case NVPTX::BI__imma_m32n8k16_mma_u8:
+  case NVPTX::BI__imma_m8n32k16_mma_s8:
+  case NVPTX::BI__imma_m8n32k16_mma_u8:
+  case NVPTX::BI__imma_m8n8k32_mma_s4:
+  case NVPTX::BI__imma_m8n8k32_mma_u4:
+  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1: {
     Address Dst = EmitPointerWithAlignment(E->getArg(0));
     Address SrcA = EmitPointerWithAlignment(E->getArg(1));
     Address SrcB = EmitPointerWithAlignment(E->getArg(2));
@@ -13375,119 +13577,40 @@ Value *CodeGenFunction::EmitNVPTXBuiltin
     if (Layout < 0 || Layout > 3)
       return nullptr;
     llvm::APSInt SatfArg;
-    if (!E->getArg(5)->isIntegerConstantExpr(SatfArg, getContext()))
+    if (BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1)
+      SatfArg = 0;  // .b1 does not have satf argument.
+    else if (!E->getArg(5)->isIntegerConstantExpr(SatfArg, getContext()))
       return nullptr;
     bool Satf = SatfArg.getSExtValue();
-
-    // clang-format off
-#define MMA_VARIANTS(geom, type) {{                                 \
-      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type,             \
-      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
-      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
-      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
-      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type,             \
-      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
-      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type,             \
-      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite  \
-    }}
-    // clang-format on
-
-    auto getMMAIntrinsic = [Layout, Satf](std::array<unsigned, 8> Variants) {
-      unsigned Index = Layout * 2 + Satf;
-      assert(Index < 8);
-      return Variants[Index];
-    };
-    unsigned IID;
-    unsigned NumEltsC;
-    unsigned NumEltsD;
-    switch (BuiltinID) {
-    case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
-      IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f16));
-      NumEltsC = 4;
-      NumEltsD = 4;
-      break;
-    case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
-      IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f16));
-      NumEltsC = 4;
-      NumEltsD = 8;
-      break;
-    case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
-      IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f32));
-      NumEltsC = 8;
-      NumEltsD = 4;
-      break;
-    case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
-      IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f32));
-      NumEltsC = 8;
-      NumEltsD = 8;
-      break;
-    case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
-      IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f16));
-      NumEltsC = 4;
-      NumEltsD = 4;
-      break;
-    case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
-      IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f16));
-      NumEltsC = 4;
-      NumEltsD = 8;
-      break;
-    case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
-      IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f32));
-      NumEltsC = 8;
-      NumEltsD = 4;
-      break;
-    case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
-      IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f32));
-      NumEltsC = 8;
-      NumEltsD = 8;
-      break;
-    case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
-      IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f16));
-      NumEltsC = 4;
-      NumEltsD = 4;
-      break;
-    case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
-      IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f16));
-      NumEltsC = 4;
-      NumEltsD = 8;
-      break;
-    case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
-      IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f32));
-      NumEltsC = 8;
-      NumEltsD = 4;
-      break;
-    case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
-      IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f32));
-      NumEltsC = 8;
-      NumEltsD = 8;
-      break;
-    default:
-      llvm_unreachable("Unexpected builtin ID.");
-    }
-#undef MMA_VARIANTS
+    NVPTXMmaInfo MI = getNVPTXMmaInfo(BuiltinID);
+    unsigned IID = MI.getMMAIntrinsic(Layout, Satf);
+    if (IID == 0)  // Unsupported combination of Layout/Satf.
+      return nullptr;
 
     SmallVector<Value *, 24> Values;
     Function *Intrinsic = CGM.getIntrinsic(IID);
-    llvm::Type *ABType = Intrinsic->getFunctionType()->getParamType(0);
+    llvm::Type *AType = Intrinsic->getFunctionType()->getParamType(0);
     // Load A
-    for (unsigned i = 0; i < 8; ++i) {
+    for (unsigned i = 0; i < MI.NumEltsA; ++i) {
       Value *V = Builder.CreateAlignedLoad(
           Builder.CreateGEP(SrcA.getPointer(),
                             llvm::ConstantInt::get(IntTy, i)),
           CharUnits::fromQuantity(4));
-      Values.push_back(Builder.CreateBitCast(V, ABType));
+      Values.push_back(Builder.CreateBitCast(V, AType));
     }
     // Load B
-    for (unsigned i = 0; i < 8; ++i) {
+    llvm::Type *BType = Intrinsic->getFunctionType()->getParamType(MI.NumEltsA);
+    for (unsigned i = 0; i < MI.NumEltsB; ++i) {
       Value *V = Builder.CreateAlignedLoad(
           Builder.CreateGEP(SrcB.getPointer(),
                             llvm::ConstantInt::get(IntTy, i)),
           CharUnits::fromQuantity(4));
-      Values.push_back(Builder.CreateBitCast(V, ABType));
+      Values.push_back(Builder.CreateBitCast(V, BType));
     }
     // Load C
-    llvm::Type *CType = Intrinsic->getFunctionType()->getParamType(16);
-    for (unsigned i = 0; i < NumEltsC; ++i) {
+    llvm::Type *CType =
+        Intrinsic->getFunctionType()->getParamType(MI.NumEltsA + MI.NumEltsB);
+    for (unsigned i = 0; i < MI.NumEltsC; ++i) {
       Value *V = Builder.CreateAlignedLoad(
           Builder.CreateGEP(SrcC.getPointer(),
                             llvm::ConstantInt::get(IntTy, i)),
@@ -13496,7 +13619,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltin
     }
     Value *Result = Builder.CreateCall(Intrinsic, Values);
     llvm::Type *DType = Dst.getElementType();
-    for (unsigned i = 0; i < NumEltsD; ++i)
+    for (unsigned i = 0; i < MI.NumEltsD; ++i)
       Builder.CreateAlignedStore(
           Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
           Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)),

Modified: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/Cuda.cpp?rev=359248&r1=359247&r2=359248&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/ToolChains/Cuda.cpp (original)
+++ cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Thu Apr 25 15:28:09 2019
@@ -644,19 +644,25 @@ void CudaToolChain::addClangTargetOption
   CC1Args.push_back("-mlink-builtin-bitcode");
   CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
-  // Libdevice in CUDA-7.0 requires PTX version that's more recent than LLVM
-  // defaults to. Use PTX4.2 by default, which is the PTX version that came with
-  // CUDA-7.0.
-  const char *PtxFeature = "+ptx42";
-  // TODO(tra): CUDA-10+ needs PTX 6.3 to support new features. However that
-  // requires fair amount of work on LLVM side. We'll keep using PTX 6.1 until
-  // all prerequisites are in place.
-  if (CudaInstallation.version() >= CudaVersion::CUDA_91) {
-    // CUDA-9.1 uses new instructions that are only available in PTX6.1+
-    PtxFeature = "+ptx61";
-  } else if (CudaInstallation.version() >= CudaVersion::CUDA_90) {
-    // CUDA-9.0 uses new instructions that are only available in PTX6.0+
-    PtxFeature = "+ptx60";
+  // New CUDA versions often introduce new instructions that are only supported
+  // by new PTX version, so we need to raise PTX level to enable them in NVPTX
+  // back-end.
+  const char *PtxFeature = nullptr;
+  switch (CudaInstallation.version()) {
+    case CudaVersion::CUDA_101:
+      PtxFeature = "+ptx64";
+      break;
+    case CudaVersion::CUDA_100:
+      PtxFeature = "+ptx63";
+      break;
+    case CudaVersion::CUDA_91:
+      PtxFeature = "+ptx61";
+      break;
+    case CudaVersion::CUDA_90:
+      PtxFeature = "+ptx60";
+      break;
+    default:
+      PtxFeature = "+ptx42";
   }
   CC1Args.append({"-target-feature", PtxFeature});
   if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,

Added: cfe/trunk/test/CodeGen/builtins-nvptx-mma.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/builtins-nvptx-mma.cu?rev=359248&view=auto
==============================================================================
--- cfe/trunk/test/CodeGen/builtins-nvptx-mma.cu (added)
+++ cfe/trunk/test/CodeGen/builtins-nvptx-mma.cu Thu Apr 25 15:28:09 2019
@@ -0,0 +1,755 @@
+
+//
+// *** DO NOT EDIT ***
+//
+//  This test has been automatically generated by
+//  builtins-nvptx-mma.py --ptx=63 --gpu-arch=75
+//
+// Make sure we can handle all builtins available on sm_75 with PTX63
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_75 \
+// RUN:            -fcuda-is-device -target-feature +ptx63 \
+// RUN:            -DPTX=63 -DSM=75 \
+// RUN:            -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefixes=CHECK_PTX61_SM70,CHECK_PTX63_SM75,CHECK_PTX63_SM72,CHECK_PTX60_SM70 %s
+// Verify that all builtins have correct constraints.
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown \
+// RUN:   -target-cpu sm_60 -target-feature +ptx42 \
+// RUN:   -DPTX=63 -DSM=75 -fcuda-is-device -S -o /dev/null -x cuda \
+// RUN:   -verify %s
+
+
+#if !defined(CUDA_VERSION)
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+typedef unsigned long long uint64_t;
+#endif
+
+// CHECK-LABEL: test_wmma_builtins
+__device__ void test_wmma_builtins(int *src, int *dst,
+                                  float *fsrc, float *fdst, int ldm) {
+
+
+#if (PTX >= 60) && (SM >= 70)
+
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_ld_a(dst, src, ldm, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_ld_a(dst, src, ldm, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_ld_b(dst, src, ldm, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_ld_b(dst, src, ldm, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_ld_c_f16(dst, src, ldm, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_ld_c_f16(dst, src, ldm, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32
+  // expected-error-re@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32
+  // expected-error-re@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_st_c_f16(dst, src, ldm, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_st_c_f16(dst, src, ldm, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32
+  // expected-error-re@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32
+  // expected-error-re@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 1);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 0);
+  // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+  __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 1);
+#endif // (PTX >= 60) && (SM >= 70)
+
+#if (PTX >= 61) && (SM >= 70)
+
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_ld_a(dst, src, ldm, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_ld_a(dst, src, ldm, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_ld_b(dst, src, ldm, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_ld_b(dst, src, ldm, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32
+  // expected-error-re@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32
+  // expected-error-re@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_st_c_f16(dst, src, ldm, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_st_c_f16(dst, src, ldm, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32
+  // expected-error-re@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32
+  // expected-error-re@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16
+  // expected-error-re@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_ld_a(dst, src, ldm, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16
+  // expected-error-re@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_ld_a(dst, src, ldm, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16
+  // expected-error-re@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_ld_b(dst, src, ldm, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16
+  // expected-error-re@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_ld_b(dst, src, ldm, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f16
+  // expected-error-re@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f16
+  // expected-error-re@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32
+  // expected-error-re@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32
+  // expected-error-re@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f16
+  // expected-error-re@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_st_c_f16(dst, src, ldm, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f16
+  // expected-error-re@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_st_c_f16(dst, src, ldm, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32
+  // expected-error-re@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32
+  // expected-error-re@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 0, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 0, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32.satfinite
+  // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32
+  // expected-error-re at +1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32.satfinite
+  // expected-error-re at +1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32
+  // expected-error-re at +1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32.satfinite
+  // expected-error-re at +1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 1);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 0);
+  // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32.satfinite
+  // expected-error-re at +1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+  __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 1);
+#endif // (PTX >= 61) && (SM >= 70) 
+
+#if (PTX >= 63) && (SM >= 72)
+
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.s8
+  // expected-error-re@+1 {{'__imma_m16n16k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_ld_a_s8(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.s8
+  // expected-error-re@+1 {{'__imma_m16n16k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_ld_a_s8(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.u8
+  // expected-error-re@+1 {{'__imma_m16n16k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_ld_a_u8(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.u8
+  // expected-error-re@+1 {{'__imma_m16n16k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_ld_a_u8(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.s8
+  // expected-error-re@+1 {{'__imma_m16n16k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_ld_b_s8(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.s8
+  // expected-error-re@+1 {{'__imma_m16n16k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_ld_b_s8(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.u8
+  // expected-error-re@+1 {{'__imma_m16n16k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_ld_b_u8(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.u8
+  // expected-error-re@+1 {{'__imma_m16n16k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_ld_b_u8(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.s32
+  // expected-error-re@+1 {{'__imma_m16n16k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_ld_c(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.s32
+  // expected-error-re@+1 {{'__imma_m16n16k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_ld_c(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.s32
+  // expected-error-re@+1 {{'__imma_m16n16k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_st_c_i32(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.s32
+  // expected-error-re@+1 {{'__imma_m16n16k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_st_c_i32(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.s8
+  // expected-error-re@+1 {{'__imma_m32n8k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_ld_a_s8(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.s8
+  // expected-error-re@+1 {{'__imma_m32n8k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_ld_a_s8(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.u8
+  // expected-error-re@+1 {{'__imma_m32n8k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_ld_a_u8(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.u8
+  // expected-error-re@+1 {{'__imma_m32n8k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_ld_a_u8(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.s8
+  // expected-error-re@+1 {{'__imma_m32n8k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_ld_b_s8(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.s8
+  // expected-error-re@+1 {{'__imma_m32n8k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_ld_b_s8(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.u8
+  // expected-error-re@+1 {{'__imma_m32n8k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_ld_b_u8(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.u8
+  // expected-error-re@+1 {{'__imma_m32n8k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_ld_b_u8(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.s32
+  // expected-error-re@+1 {{'__imma_m32n8k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_ld_c(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.s32
+  // expected-error-re@+1 {{'__imma_m32n8k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_ld_c(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.s32
+  // expected-error-re@+1 {{'__imma_m32n8k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_st_c_i32(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.s32
+  // expected-error-re@+1 {{'__imma_m32n8k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_st_c_i32(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.s8
+  // expected-error-re@+1 {{'__imma_m8n32k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_ld_a_s8(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.s8
+  // expected-error-re@+1 {{'__imma_m8n32k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_ld_a_s8(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.u8
+  // expected-error-re@+1 {{'__imma_m8n32k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_ld_a_u8(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.u8
+  // expected-error-re@+1 {{'__imma_m8n32k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_ld_a_u8(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.s8
+  // expected-error-re@+1 {{'__imma_m8n32k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_ld_b_s8(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.s8
+  // expected-error-re@+1 {{'__imma_m8n32k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_ld_b_s8(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.u8
+  // expected-error-re@+1 {{'__imma_m8n32k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_ld_b_u8(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.u8
+  // expected-error-re@+1 {{'__imma_m8n32k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_ld_b_u8(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.s32
+  // expected-error-re@+1 {{'__imma_m8n32k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_ld_c(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.s32
+  // expected-error-re@+1 {{'__imma_m8n32k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_ld_c(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.s32
+  // expected-error-re@+1 {{'__imma_m8n32k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_st_c_i32(dst, src, ldm, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.s32
+  // expected-error-re@+1 {{'__imma_m8n32k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_st_c_i32(dst, src, ldm, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.s8
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_s8(dst, src, src, src, 3, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.s8.satfinite
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_s8(dst, src, src, src, 3, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.s8
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_s8(dst, src, src, src, 2, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.s8.satfinite
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_s8(dst, src, src, src, 2, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.s8
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_s8(dst, src, src, src, 1, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.s8.satfinite
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_s8(dst, src, src, src, 1, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.s8
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_s8(dst, src, src, src, 0, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.s8.satfinite
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_s8(dst, src, src, src, 0, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.u8
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_u8(dst, src, src, src, 3, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.u8.satfinite
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_u8(dst, src, src, src, 3, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.u8
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_u8(dst, src, src, src, 2, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.u8.satfinite
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_u8(dst, src, src, src, 2, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.u8
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_u8(dst, src, src, src, 1, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.u8.satfinite
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_u8(dst, src, src, src, 1, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.u8
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_u8(dst, src, src, src, 0, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.u8.satfinite
+  // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m16n16k16_mma_u8(dst, src, src, src, 0, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.s8
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_s8(dst, src, src, src, 3, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.s8.satfinite
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_s8(dst, src, src, src, 3, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.s8
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_s8(dst, src, src, src, 2, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.s8.satfinite
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_s8(dst, src, src, src, 2, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.s8
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_s8(dst, src, src, src, 1, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.s8.satfinite
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_s8(dst, src, src, src, 1, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.s8
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_s8(dst, src, src, src, 0, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.s8.satfinite
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_s8(dst, src, src, src, 0, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.u8
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_u8(dst, src, src, src, 3, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.u8.satfinite
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_u8(dst, src, src, src, 3, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.u8
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_u8(dst, src, src, src, 2, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.u8.satfinite
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_u8(dst, src, src, src, 2, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.u8
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_u8(dst, src, src, src, 1, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.u8.satfinite
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_u8(dst, src, src, src, 1, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.u8
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_u8(dst, src, src, src, 0, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.u8.satfinite
+  // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m32n8k16_mma_u8(dst, src, src, src, 0, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.s8
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_s8(dst, src, src, src, 3, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.s8.satfinite
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_s8(dst, src, src, src, 3, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.s8
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_s8(dst, src, src, src, 2, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.s8.satfinite
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_s8(dst, src, src, src, 2, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.s8
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_s8(dst, src, src, src, 1, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.s8.satfinite
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_s8(dst, src, src, src, 1, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.s8
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_s8(dst, src, src, src, 0, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.s8.satfinite
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_s8(dst, src, src, src, 0, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.u8
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_u8(dst, src, src, src, 3, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.u8.satfinite
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_u8(dst, src, src, src, 3, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.u8
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_u8(dst, src, src, src, 2, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.u8.satfinite
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_u8(dst, src, src, src, 2, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.u8
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_u8(dst, src, src, src, 1, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.u8.satfinite
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_u8(dst, src, src, src, 1, 1);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.u8
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_u8(dst, src, src, src, 0, 0);
+  // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.u8.satfinite
+  // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+  __imma_m8n32k16_mma_u8(dst, src, src, src, 0, 1);
+#endif // (PTX >= 63) && (SM >= 72)
+
+#if (PTX >= 63) && (SM >= 75)
+
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.load.a.row.stride.b1
+  // expected-error-re@+1 {{'__bmma_m8n8k128_ld_a_b1' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __bmma_m8n8k128_ld_a_b1(dst, src, ldm, 0);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.load.b.col.stride.b1
+  // expected-error-re@+1 {{'__bmma_m8n8k128_ld_b_b1' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __bmma_m8n8k128_ld_b_b1(dst, src, ldm, 1);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.load.c.col.stride.s32
+  // expected-error-re@+1 {{'__bmma_m8n8k128_ld_c' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __bmma_m8n8k128_ld_c(dst, src, ldm, 1);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.load.c.row.stride.s32
+  // expected-error-re@+1 {{'__bmma_m8n8k128_ld_c' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __bmma_m8n8k128_ld_c(dst, src, ldm, 0);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.store.d.col.stride.s32
+  // expected-error-re@+1 {{'__bmma_m8n8k128_st_c_i32' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __bmma_m8n8k128_st_c_i32(dst, src, ldm, 1);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.store.d.row.stride.s32
+  // expected-error-re@+1 {{'__bmma_m8n8k128_st_c_i32' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __bmma_m8n8k128_st_c_i32(dst, src, ldm, 0);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.a.row.stride.s4
+  // expected-error-re@+1 {{'__imma_m8n8k32_ld_a_s4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __imma_m8n8k32_ld_a_s4(dst, src, ldm, 0);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.a.row.stride.u4
+  // expected-error-re@+1 {{'__imma_m8n8k32_ld_a_u4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __imma_m8n8k32_ld_a_u4(dst, src, ldm, 0);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.b.col.stride.s4
+  // expected-error-re@+1 {{'__imma_m8n8k32_ld_b_s4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __imma_m8n8k32_ld_b_s4(dst, src, ldm, 1);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.b.col.stride.u4
+  // expected-error-re@+1 {{'__imma_m8n8k32_ld_b_u4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __imma_m8n8k32_ld_b_u4(dst, src, ldm, 1);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.c.col.stride.s32
+  // expected-error-re@+1 {{'__imma_m8n8k32_ld_c' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __imma_m8n8k32_ld_c(dst, src, ldm, 1);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.c.row.stride.s32
+  // expected-error-re@+1 {{'__imma_m8n8k32_ld_c' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __imma_m8n8k32_ld_c(dst, src, ldm, 0);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.store.d.col.stride.s32
+  // expected-error-re@+1 {{'__imma_m8n8k32_st_c_i32' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __imma_m8n8k32_st_c_i32(dst, src, ldm, 1);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.store.d.row.stride.s32
+  // expected-error-re@+1 {{'__imma_m8n8k32_st_c_i32' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __imma_m8n8k32_st_c_i32(dst, src, ldm, 0);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.mma.row.col.b1
+  // expected-error-re@+1 {{'__bmma_m8n8k128_mma_xor_popc_b1' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __bmma_m8n8k128_mma_xor_popc_b1(dst, src, src, src, 1);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.mma.row.col.s4
+  // expected-error-re@+1 {{'__imma_m8n8k32_mma_s4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __imma_m8n8k32_mma_s4(dst, src, src, src, 1, 0);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.mma.row.col.s4.satfinite
+  // expected-error-re@+1 {{'__imma_m8n8k32_mma_s4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __imma_m8n8k32_mma_s4(dst, src, src, src, 1, 1);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.mma.row.col.u4
+  // expected-error-re@+1 {{'__imma_m8n8k32_mma_u4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __imma_m8n8k32_mma_u4(dst, src, src, src, 1, 0);
+  // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.mma.row.col.u4.satfinite
+  // expected-error-re@+1 {{'__imma_m8n8k32_mma_u4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+  __imma_m8n8k32_mma_u4(dst, src, src, src, 1, 1);
+#endif // (PTX >= 63) && (SM >= 75)
+}

Added: cfe/trunk/test/CodeGen/builtins-nvptx-mma.py
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/builtins-nvptx-mma.py?rev=359248&view=auto
==============================================================================
--- cfe/trunk/test/CodeGen/builtins-nvptx-mma.py (added)
+++ cfe/trunk/test/CodeGen/builtins-nvptx-mma.py Thu Apr 25 15:28:09 2019
@@ -0,0 +1,343 @@
+# This script generates all variants of wmma builtins, verifies that clang calls
+# the correct LLVM intrinsics, and checks that the availability of specific
+# builtins is constrained by the correct PTX version and target GPU variant.
+
+# Dummy test run to avoid lit warnings.
+# RUN: echo "This is not a real test. It's a generator for builtins-nvpts-mma.cu" >/dev/null
+
+from __future__ import print_function
+
+import argparse
+from collections import defaultdict
+from itertools import product
+from string import Template
+
+class MMAFrag:
+  def __init__(self, geom, frag, ptx_elt_type):
+    self.geom = geom
+    self.frag = frag
+    self.ptx_type = ptx_elt_type
+
+  def __repr__(self):
+    return "%s:%s:%s" % (self.geom, self.frag, self.ptx_type)
+
+class MMAOp:
+  def __init__(self, a, b, c, d):
+    self.a = a
+    self.b = b
+    self.c = c
+    self.d = d
+
+  def __repr__(self):
+    return ("{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d ))
+
+def make_mma_ops(geoms, types_a, types_b, types_c, types_d):
+  ops = []
+  for geom, type_a, type_c in product(geoms, types_a, types_c):
+    for type_b, type_d in product(types_b if types_b else [type_a],
+                                  types_d if types_d else [type_c]):
+      ops.append(MMAOp(MMAFrag(geom, "a", type_a),
+                       MMAFrag(geom, "b", type_b),
+                       MMAFrag(geom, "c", type_c),
+                       MMAFrag(geom, "d", type_d)))
+  return ops
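+# For example, make_mma_ops(["m8n32k16"], ["f16"], [], ["f16", "f32"],
+# ["f16", "f32"]) yields four ops, one per C/D accumulator-type combination,
+# corresponding to the __hmma_m8n32k16_mma_{f16f16,f32f16,f16f32,f32f32}
+# builtins exercised above.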
+
+def make_ldst_ops(geoms, frags, types):
+  return [MMAFrag(geom, frag, ptx_type) for (geom, frag, ptx_type)
+          in product(geoms, frags, types)]
+
+def get_mma_ops():
+  return (make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
+          make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["s8", "u8"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k32"],
+                       ["s4", "u4"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k128"],
+                       ["b1"], [], ["s32"], []))
+
+def get_ldst_ops():
+  return (make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["a", "b"], ["f16", "u8", "s8"]) +
+          make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["c", "d"], ["f16", "f32", "s32"]) +
+          make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4","u4"]) +
+          make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"]) +
+          make_ldst_ops(["m8n8k32", "m8n8k128"],  ["c", "d"], ["s32"]))
+
+def is_geom_supported(geom):
+  # geometries for FP and ints.
+  if geom in ["m8n32k16", "m32n8k16"]:
+    return ptx_version >= 61
+  # geometries for sub-ints.
+  if geom in ["m8n8k32", "m8n8k128"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  if geom == "m16n16k16":
+    return ptx_version >= 60
+  assert False, "Unexpected geometry."
+
+def is_type_supported(ptx_type):
+  if ptx_type in ["s8", "u8", "s32"]:
+    return ptx_version >= 63 and gpu_arch >= 72
+  if ptx_type in ["s4", "u4", "b1"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  return ptx_version >= 60 and gpu_arch >= 70
+
+def is_mma_variant_supported(op, layout_a, layout_b, satf):
+  if not (is_type_supported(op.a.ptx_type)
+          and is_geom_supported(op.a.geom)):
+    return False
+  # Sub-integer ops require row/col layout; b1 additionally does not support satf.
+  if op.a.ptx_type in ["s4", "u4", "b1"]:
+    if op.a.ptx_type == "b1" and satf:
+      return False
+    return layout_a == "row" and layout_b == "col"
+  return True
+
+def is_ldst_variant_supported(frag, layout):
+  if not (is_type_supported(frag.ptx_type)
+          and is_geom_supported(frag.geom)):
+    return False
+  if frag.ptx_type in ["s4", "u4", "b1"]:
+    # Sub-integer types require sm_75 and ptx63; 'a' fragments must use row
+    # layout and 'b' fragments col layout.
+    return ((frag.frag == "a" and layout == "row")
+            or (frag.frag == "b" and layout == "col")
+            or frag.frag in ["c", "d"])
+  return True
+
+def get_builtin_prefix(frag):
+  prefix = None
+  if frag.geom in ["m16n16k16", "m32n8k16", "m8n32k16"]:
+    if frag.ptx_type in ["f16", "f32"]:
+      prefix = "__hmma"
+    else:
+      prefix = "__imma"
+  elif frag.geom == "m8n8k32":
+    prefix = "__imma" # sub-integers
+  elif frag.geom == "m8n8k128":
+    prefix = "__bmma"
+  assert prefix
+  return prefix
+
+def get_ldst_builtin_name(frag):
+  prefix = get_builtin_prefix(frag)
+
+  if prefix == "__hmma":
+    suffix = "" if frag.frag in ["a","b"] else frag.ptx_type
+  elif prefix in ["__imma", "__bmma"]:
+    suffix = "" if frag.frag in ["c"] else frag.ptx_type
+    if suffix == "s32":
+      suffix = "i32"
+  if frag.frag == "d":
+    ifrag = "c"
+    op = "st"
+  else:
+    ifrag = frag.frag
+    op = "ld"
+
+  name = "%s_%s_%s_%s%s" % (prefix, frag.geom, op, ifrag,
+                             "_" + suffix if suffix else "")
+  return name
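+# Worked example for get_ldst_builtin_name: the "d" fragment of an s32
+# m16n16k16 op maps to __imma_m16n16k16_st_c_i32, i.e. stores are expressed
+# via the "c" fragment name with an "st" op.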
+
+def get_mma_builtin_name(op):
+  prefix = get_builtin_prefix(op.a)
+
+  if prefix == "__hmma":
+    suffix = op.d.ptx_type + op.c.ptx_type
+  else:
+    suffix = op.a.ptx_type
+
+  name = "%s_%s_mma%s_%s" % (prefix, op.a.geom,
+                             "_xor_popc" if op.a.ptx_type == "b1" else "",
+                             suffix)
+  return name
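+# Worked example for get_mma_builtin_name: a b1 m8n8k128 op yields
+# __bmma_m8n8k128_mma_xor_popc_b1, while an f16-input m32n8k16 op with D=f32,
+# C=f16 yields __hmma_m32n8k16_mma_f32f16.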
+
+
+def get_required_sm(frag):
+  if frag.ptx_type in ["u4", "s4", "b1"]:
+    return 75
+  if frag.ptx_type in ["s8", "u8"]:
+    return 72
+  if frag.ptx_type == "s32":
+    if frag.geom in ["m8n8k32", "m8n8k128"]: # s4/u4/b1
+      return 75
+    else:                       # s8/u8
+      return 72
+  if frag.ptx_type in ["f16", "f32"]:
+    return 70
+  assert False, "Unexpected PTX type."
+
+def get_required_ptx(frag):
+  if frag.ptx_type in ["f16", "f32"]:
+    return 60 if frag.geom == "m16n16k16" else 61
+  return 63
+
+def gen_wmma_ldst_tests(results):
+  load_template = """
+  // CHECK${check_suffix}: call {{.*}} @${intrinsic}
+  // expected-error-re@+1 {{'${builtin}' needs target feature sm_${min_sm}{{.*}},ptx${min_ptx}{{.*}}}}
+  ${builtin}(${dst}, ${src}, ldm, ${blayout});
+""".rstrip()
+  intrinsic_template = "llvm.nvvm.wmma.${geom}.${op}.${frag}.${ilayout}.stride.${itype}"
+
+  for frag, layout in sorted(product(get_ldst_ops(), ["row","col"]), key=str):
+
+    if not is_ldst_variant_supported(frag, layout):
+      continue
+
+    is_fp = frag.ptx_type  == "f32"
+    min_sm = get_required_sm(frag)
+    min_ptx = get_required_ptx(frag)
+    params = {
+        "check_suffix" : "_PTX%d_SM%d" % (min_ptx, min_sm),
+        "builtin" : get_ldst_builtin_name(frag),
+        "min_ptx" : min_ptx,
+        "min_sm" : min_sm,
+        "dst": "fdst" if is_fp else "dst",
+        "src": "fsrc" if is_fp else "src",
+        "blayout" : 0 if layout == "row" else 1,
+        "intrinsic" : Template(intrinsic_template).substitute({
+            "frag" : frag.frag,
+            "geom"   : frag.geom,
+            "ilayout" : layout,
+            "itype" : frag.ptx_type,
+            "op" : "store" if frag.frag == "d" else "load",
+        })
+    }
+    results[(min_ptx,min_sm)] += Template(load_template).substitute(params)
+
+  return results
+
+def mma_signature(op):
+  if op.a.ptx_type in ["s8", "u8", "s4", "u4", "b1"]:
+    # int and sub-int ops are identified by input type.
+    return op.a.ptx_type
+  else:
+    # the rest are FP ops identified by accumulator & result type.
+    return "%s.%s" % (op.d.ptx_type, op.c.ptx_type)
+
+# Get the numeric value for the rowcol parameter of the builtin.
+# AFAICT it uses the encoding accepted by NVVM intrinsics:
+# https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-matrix-mma
+def get_ilayout(a, b):
+  return {
+      "row.row" : 0,
+      "row.col" : 1,
+      "col.row" : 2,
+      "col.col" : 3
+  }[a + "." + b]
+
+def gen_wmma_mma_tests(results):
+  mma_template = """
+  // CHECK${check_suffix}: call {{.*}} @${intrinsic}
+  // expected-error-re@+1 {{'${builtin}' needs target feature sm_${min_sm}{{.*}},ptx${min_ptx}{{.*}}}}
+  ${builtin}(${dst}, ${asrc}, ${asrc}, ${csrc}, ${ilayout}${maybe_isatf});
+""".rstrip()
+  intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}"
+
+  for op, alayout, blayout, satf in sorted(product( get_mma_ops(),
+                                                    ["row","col"],
+                                                    ["row","col"],
+                                                    [".satfinite", ""]),
+                                           key=str):
+
+    if not is_mma_variant_supported(op, alayout, blayout, satf):
+      continue
+
+    a_is_fp = op.a.ptx_type == "f32"
+    c_is_fp = op.c.ptx_type == "f32"
+    d_is_fp = op.d.ptx_type == "f32"
+    min_sm = get_required_sm(op.a)
+    min_ptx = get_required_ptx(op.a)
+    if op.a.ptx_type == "b1": # .b1 MMA has no satf argument.
+       isatf_arg = ""
+    else:
+       isatf_arg = ", 1" if satf else ", 0"
+    params = {
+        "check_suffix" : "_PTX%d_SM%d" % (min_ptx, min_sm),
+        "builtin" : get_mma_builtin_name(op),
+        "min_ptx" : min_ptx,
+        "min_sm" : min_sm,
+        "dst": "fdst" if d_is_fp else "dst",
+        "asrc": "fsrc" if a_is_fp else "src",
+        "csrc": "fsrc" if c_is_fp else "src",
+        "ilayout" : get_ilayout(alayout, blayout),
+        "maybe_isatf" : isatf_arg,
+        "intrinsic" : Template(intrinsic_template).substitute({
+            "geom"  : op.a.geom,
+            "alayout" : alayout,
+            "blayout" : blayout,
+            "intrinsic_signature" : mma_signature(op),
+            "satf"  : satf,
+        })
+    }
+    results[(min_ptx, min_sm)] += Template(mma_template).substitute(params)
+
+  return results
+
+def gen_tests():
+  results = gen_wmma_ldst_tests(defaultdict(str))
+  results = gen_wmma_mma_tests(results)
+
+  run_template = r"""
+//
+// *** DO NOT EDIT ***
+//
+//  This test has been automatically generated by
+//  builtins-nvptx-mma.py --ptx=${ptx} --gpu-arch=${sm}
+//
+// Make sure we can handle all builtins available on sm_${sm} with PTX${ptx}
+// ${run}: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_${sm} \
+// ${run}:            -fcuda-is-device -target-feature +ptx${ptx} \
+// ${run}:            -DPTX=${ptx} -DSM=${sm} \
+// ${run}:            -S -emit-llvm -o - -x cuda %s \
+// ${run}:   | FileCheck -check-prefixes=${check_labels} %s
+// Verify that all builtins have correct constraints.
+// ${run}: %clang_cc1 -triple nvptx-unknown-unknown \
+// ${run}:   -target-cpu sm_60 -target-feature +ptx42 \
+// ${run}:   -DPTX=${ptx} -DSM=${sm} -fcuda-is-device -S -o /dev/null -x cuda \
+// ${run}:   -verify %s
+"""
+  def supported_variants(ptx, sm, results):
+    return [(ptx_, sm_) for ptx_, sm_ in results if ptx_ <= ptx and sm_ <= sm]
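+  # For example, with --ptx=63 --gpu-arch=75 this keeps every (min_ptx, min_sm)
+  # key at or below those limits, so the (61, 70), (63, 72) and (63, 75)
+  # sections are all checked in that run.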
+
+  print(Template(run_template).substitute({
+      "run" : "RUN", # To avoid lit misinterpreting the template
+      "ptx" : ptx_version,
+      "sm" : gpu_arch,
+      "check_labels" : ",".join(["CHECK_PTX%d_SM%d" % (ptx_, sm_)
+                                 for ptx_, sm_
+                                 in supported_variants(ptx_version, gpu_arch,
+                                                       results)])
+  }))
+
+  print("""
+#if !defined(CUDA_VERSION)
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+typedef unsigned long long uint64_t;
+#endif
+
+// CHECK-LABEL: test_wmma_builtins
+__device__ void test_wmma_builtins(int *src, int *dst,
+                                  float *fsrc, float *fdst, int ldm) {
+""");
+
+  for (ptx, sm), tests in sorted(results.items()):
+    print()
+    print("#if (PTX >= %d) && (SM >= %d)" % (ptx, sm))
+    print(tests)
+    print("#endif // (PTX >= %d) && (SM >= %d) "% (ptx, sm))
+
+  print("}")
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--ptx", type=int, default=60)
+parser.add_argument("--gpu-arch", type=int, default=70)
+args = parser.parse_args()
+ptx_version = args.ptx
+gpu_arch = args.gpu_arch
+
+gen_tests()



