[clang] 27439a7 - [AMDGPU] New gfx940 mfma instructions
Stanislav Mekhanoshin via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 24 12:13:04 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-03-24T12:12:52-07:00
New Revision: 27439a764230e5eb54568b2fc053a20c9005970f
URL: https://github.com/llvm/llvm-project/commit/27439a764230e5eb54568b2fc053a20c9005970f
DIFF: https://github.com/llvm/llvm-project/commit/27439a764230e5eb54568b2fc053a20c9005970f.diff
LOG: [AMDGPU] New gfx940 mfma instructions
Differential Revision: https://reviews.llvm.org/D122044
Added:
clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
Modified:
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/SISchedule.td
llvm/lib/Target/AMDGPU/VOP3PInstructions.td
llvm/test/MC/AMDGPU/mai-gfx940.s
llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index d2e60f85b9feb..3870b1cca6caa 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -305,5 +305,10 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x16bf16_1k, "V4fV4sV4sV4fIiIiIi",
TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_16x16x4f64, "V4dddV4dIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_4x4x4f64, "ddddIiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x32_i8, "V4iWiWiV4iIiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x16_i8, "V16iWiWiV16iIiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8_xf32, "V4fV2fV2fV4fIiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4_xf32, "V16fV2fV2fV16fIiIiIi", "nc", "mai-insts")
+
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index 19ac40fe41605..fc29faf9ad1c5 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -1,9 +1,11 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
#pragma OPENCL EXTENSION cl_khr_fp64:enable
+typedef float v2f __attribute__((ext_vector_type(2)));
typedef float v4f __attribute__((ext_vector_type(4)));
typedef float v16f __attribute__((ext_vector_type(16)));
typedef float v32f __attribute__((ext_vector_type(32)));
@@ -216,3 +218,33 @@ void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c)
}
#endif // MFMA_GFX90A_TESTS
+
+#ifdef MFMA_GFX940_TESTS
+// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
+// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
+void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
+{
+ *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8
+// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
+void test_mfma_i32_32x32x16_i8(global v16i* out, long a, long b, v16i c)
+{
+ *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x8_xf32(global v4f* out, v2f a, v2f b, v4f c)
+{
+ *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32
+// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c)
+{
+ *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0);
+}
+#endif // MFMA_GFX940_TESTS
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
new file mode 100644
index 0000000000000..9e50a11b6af2d
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx940 -verify -S -o - %s
+
+typedef float v2f __attribute__((ext_vector_type(2)));
+typedef float v4f __attribute__((ext_vector_type(4)));
+typedef float v16f __attribute__((ext_vector_type(16)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef int v16i __attribute__((ext_vector_type(16)));
+
+void test_mfma_i32_16x16x32i8(global v4i* out, long a, long b, v4i c, int d)
+{
+ *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x32_i8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x32_i8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x32_i8' must be a constant integer}}
+}
+
+void test_mfma_i32_32x32x16i8(global v16i* out, long a, long b, v16i c, int d)
+{
+ *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x16_i8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x16_i8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x16_i8' must be a constant integer}}
+}
+
+void test_mfma_f32_16x16x8xf32(global v4f* out, v2f a, v2f b, v4f c, int d)
+{
+ *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x8_xf32' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x8_xf32' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x8_xf32' must be a constant integer}}
+}
+
+void test_mfma_f32_32x32x4xf32(global v16f* out, v2f a, v2f b, v16f c, int d)
+{
+ *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4_xf32' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4_xf32' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4_xf32' must be a constant integer}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 9ff3d96383086..2701c0edf6f0d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1999,6 +1999,11 @@ def int_amdgcn_ds_fadd_v2bf16 : Intrinsic<
[IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>]>,
GCCBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
+def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
+def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
+def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
+def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;
+
//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 50132c43a210b..f9890a3bef69c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4243,7 +4243,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
- case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
+ case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
+ case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
+ case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
+ case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index caec50c8462cc..354fea5a52fa0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -295,6 +295,10 @@ def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>;
def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>;
// The dummy boolean output is divergent from the IR's perspective,
// but the mask results are uniform. These produce a divergent and
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 30c67d92b4cde..5298659c39d34 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2407,6 +2407,11 @@ def VOP_V4F32_V4I16_V4I16_V4F32 : VOPProfile <[v4f32, v4i16, v4i16, v4f32]>;
def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, v4i16, v16f32]>;
def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>;
+def VOP_V4I32_I64_I64_V4I32 : VOPProfile <[v4i32, i64, i64, v4i32]>;
+def VOP_V16I32_I64_I64_V16I32 : VOPProfile <[v16i32, i64, i64, v16i32]>;
+def VOP_V4F32_V2F32_V2F32_V4F32 : VOPProfile <[v4f32, v2f32, v2f32, v4f32]>;
+def VOP_V16F32_V2F32_V2F32_V16F32 : VOPProfile <[v16f32, v2f32, v2f32, v16f32]>;
+
class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
bit IsOrig = isOrig;
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index 1ab03ddde72a8..371b14db53fff 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -264,10 +264,14 @@ def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>;
+def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X8X")>;
def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X16")>;
+def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X32")>;
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X[14][FBI]")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X4XF")>;
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X8")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X16")>;
def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X[124][FBI]")>;
def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 2f87a75c9496b..014de26cbd3be 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -412,6 +412,10 @@ def VOPProfileMAI_F32_V4I16_X16 : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, A
def VOPProfileMAI_F32_V4I16_X32 : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, AISrc_256_f64, ADst_256, AVSrc_64>;
def VOPProfileMAI_F64_4X4X4F64 : VOPProfileMAI<VOP_F64_F64_F64_F64, AISrc_64_f64, ADst_64, AVSrc_64>;
+def VOPProfileMAI_I32_I64_X16 : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, AISrc_128_b32, ADst_128, AVSrc_64>;
+def VOPProfileMAI_I32_I64_X32 : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, AISrc_512_b32, ADst_512, AVSrc_64>;
+def VOPProfileMAI_F32_V2F32_X16 : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
+def VOPProfileMAI_F32_V2F32_X32 : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, VISrc_128_f32, VDst_128>;
def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, VISrc_512_f32, VDst_512>;
@@ -430,6 +434,10 @@ def VOPProfileMAI_F32_V4I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F
def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, VISrc_256_f64, VDst_256, AVSrc_64>;
def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI<VOP_F64_F64_F64_F64, VISrc_64_f64, VDst_64, AVSrc_64>;
+def VOPProfileMAI_I32_I64_X16_VCD : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, VISrc_128_b32, VDst_128, AVSrc_64>;
+def VOPProfileMAI_I32_I64_X32_VCD : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, VISrc_512_b32, VDst_512, AVSrc_64>;
+def VOPProfileMAI_F32_V2F32_X16_VCD : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
+def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
class MFMATable <bit is_mac, string Name> {
bit IsMac = is_mac;
@@ -527,6 +535,13 @@ let Predicates = [isGFX90APlus] in {
defm V_MFMA_F64_4X4X4F64 : MAIInst<"v_mfma_f64_4x4x4f64", "F64_4X4X4F64", int_amdgcn_mfma_f64_4x4x4f64>;
} // End Predicates = [isGFX90APlus]
+let Predicates = [isGFX940Plus] in {
+ defm V_MFMA_I32_32X32X16I8 : MAIInst<"v_mfma_i32_32x32x16i8", "I32_I64_X32", int_amdgcn_mfma_i32_32x32x16_i8>;
+ defm V_MFMA_I32_16X16X32I8 : MAIInst<"v_mfma_i32_16x16x32i8", "I32_I64_X16", int_amdgcn_mfma_i32_16x16x32_i8>;
+ defm V_MFMA_F32_16X16X8XF32 : MAIInst<"v_mfma_f32_16x16x8xf32", "F32_V2F32_X16", int_amdgcn_mfma_f32_16x16x8_xf32>;
+ defm V_MFMA_F32_32X32X4XF32 : MAIInst<"v_mfma_f32_32x32x4xf32", "F32_V2F32_X32", int_amdgcn_mfma_f32_32x32x4_xf32>;
+} // End Predicates = [isGFX940Plus]
+
let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in {
defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
@@ -727,6 +742,11 @@ defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>;
defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>;
defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>;
+defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
+defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
+defm V_MFMA_F32_16X16X8XF32 : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">;
+defm V_MFMA_F32_32X32X4XF32 : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">;
+
defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5d, "v_mfma_f32_32x32x4_2b_bf16">;
defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5e, "v_mfma_f32_16x16x4_4b_bf16">;
defm V_MFMA_F32_4X4X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5f, "v_mfma_f32_4x4x4_16b_bf16">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir
new file mode 100644
index 0000000000000..a1eb2b9c1680a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir
@@ -0,0 +1,119 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx940 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST
+# RUN: llc -march=amdgcn -mcpu=gfx940 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY
+
+---
+name: mfma_i32_16x16x32_i8_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr1_vgpr2, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3
+
+ ; FAST-LABEL: name: mfma_i32_16x16x32_i8_vva
+ ; FAST: liveins: $vgpr1_vgpr2, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3
+ ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+ ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+ ; FAST: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+ ; FAST: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x32.i8), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0
+ ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+ ; GREEDY-LABEL: name: mfma_i32_16x16x32_i8_vva
+ ; GREEDY: liveins: $vgpr1_vgpr2, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3
+ ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+ ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+ ; GREEDY: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x32.i8), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0
+ ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+ %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x32.i8), %0, %1, %2, 0, 0, 0
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
+...
+
+---
+name: mfma_i32_32x32x16_i8_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+
+ ; FAST-LABEL: name: mfma_i32_32x32x16_i8_vva
+ ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+ ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+ ; FAST: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; FAST: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x16.i8), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<16 x s32>), 0, 0, 0
+ ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>)
+ ; GREEDY-LABEL: name: mfma_i32_32x32x16_i8_vva
+ ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+ ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GREEDY: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x16.i8), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<16 x s32>), 0, 0, 0
+ ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>)
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x16.i8), %0, %1, %2, 0, 0, 0
+ $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3
+...
+
+---
+name: mfma_f32_16x16x8_xf32_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr1_vgpr2, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3
+
+ ; FAST-LABEL: name: mfma_f32_16x16x8_xf32_vva
+ ; FAST: liveins: $vgpr1_vgpr2, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3
+ ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+ ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+ ; FAST: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+ ; FAST: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x8.xf32), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0
+ ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+ ; GREEDY-LABEL: name: mfma_f32_16x16x8_xf32_vva
+ ; GREEDY: liveins: $vgpr1_vgpr2, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3
+ ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+ ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+ ; GREEDY: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x8.xf32), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0
+ ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>)
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3
+ %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x8.xf32), %0, %1, %2, 0, 0, 0
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
+...
+
+---
+name: mfma_f32_32x32x4_xf32_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+
+ ; FAST-LABEL: name: mfma_f32_32x32x4_xf32_vva
+ ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+ ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+ ; FAST: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; FAST: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4.xf32), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<16 x s32>), 0, 0, 0
+ ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>)
+ ; GREEDY-LABEL: name: mfma_f32_32x32x4_xf32_vva
+ ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+ ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+ ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GREEDY: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4.xf32), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<16 x s32>), 0, 0, 0
+ ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>)
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4.xf32), %0, %1, %2, 0, 0, 0
+ $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
new file mode 100644
index 0000000000000..629147725f085
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 -stress-regalloc=10 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 -stress-regalloc=10 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+
+declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32)
+declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
+
+; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_i32_16x16x32_i8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_i32_16x16x32_i8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_i32_16x16x32i8(<4 x i32> addrspace(1)* %arg) #0 {
+bb:
+ %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
+ %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 1, i32 2, i32 3)
+ store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_i32_32x32x16i8:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
+; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_i32_32x32x16_i8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_i32_32x32x16_i8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_i32_32x32x16i8(<16 x i32> addrspace(1)* %arg) #0 {
+bb:
+ %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
+ %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 4294967298, i64 12884901892, <16 x i32> %in.1, i32 1, i32 2, i32 3)
+ store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:[[TWO]]], v{{\[}}[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(<4 x float> addrspace(1)* %arg) #0 {
+bb:
+ %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3)
+ store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:[[TWO]]], v{{\[}}[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(<16 x float> addrspace(1)* %arg) #0 {
+bb:
+ %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3)
+ store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
new file mode 100644
index 0000000000000..652ab09179648
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+
+declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32)
+declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
+
+; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8:
+; GCN: v_mfma_i32_16x16x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @test_mfma_i32_16x16x32i8(<4 x i32> addrspace(1)* %arg) { ; kernel: load accumulator, run the 16x16x32 i8 MFMA, store back; the check above expects v[...] for all four operands. NOTE(review): no function attribute here, unlike the sibling gfx940 test - presumably what permits the VGPR form; confirm
+bb:
+ %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg ; accumulator (third intrinsic operand) comes from the addrspace(1) buffer
+ %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 0, i32 0, i32 0) ; A = 0x0000000100000002, B = 0x0000000300000004 (presumably packed i8 lanes - confirm); the three trailing immediates are all zero
+ store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg ; result overwrites the input buffer
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_i32_32x32x16i8:
+; GCN: v_mfma_i32_32x32x16_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @test_mfma_i32_32x32x16i8(<16 x i32> addrspace(1)* %arg) { ; kernel: load accumulator, run the 32x32x16 i8 MFMA, store back; the check above expects v[...] for all four operands. NOTE(review): no function attribute here, unlike the sibling gfx940 test - presumably what permits the VGPR form; confirm
+bb:
+ %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg ; accumulator (third intrinsic operand) comes from the addrspace(1) buffer
+ %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 4294967298, i64 12884901892, <16 x i32> %in.1, i32 0, i32 0, i32 0) ; A = 0x0000000100000002, B = 0x0000000300000004 (presumably packed i8 lanes - confirm); the three trailing immediates are all zero
+ store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg ; result overwrites the input buffer
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32:
+; GCN: v_mfma_f32_16x16x8_xf32 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(<4 x float> addrspace(1)* %arg) { ; kernel: load accumulator, run the 16x16x8 xf32 MFMA, store back; the check above expects v[...] for all four operands. NOTE(review): no function attribute here, unlike the sibling gfx940 test - presumably what permits the VGPR form; confirm
+bb:
+ %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg ; accumulator (third intrinsic operand) comes from the addrspace(1) buffer
+ %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 0, i32 0, i32 0) ; A = <1.0, 2.0>, B = <3.0, 4.0>; the three trailing immediates are all zero
+ store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg ; result overwrites the input buffer
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32:
+; GCN: v_mfma_f32_32x32x4_xf32 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(<16 x float> addrspace(1)* %arg) { ; kernel: load accumulator, run the 32x32x4 xf32 MFMA, store back; the check above expects v[...] for all four operands. NOTE(review): no function attribute here, unlike the sibling gfx940 test - presumably what permits the VGPR form; confirm
+bb:
+ %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg ; accumulator (third intrinsic operand) comes from the addrspace(1) buffer
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 0, i32 0, i32 0) ; A = <1.0, 2.0>, B = <3.0, 4.0>; the three trailing immediates are all zero
+ store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg ; result overwrites the input buffer
+ ret void
+}
diff --git a/llvm/test/MC/AMDGPU/mai-gfx940.s b/llvm/test/MC/AMDGPU/mai-gfx940.s
index 6570b6229bc24..4c3efbde1ebb5 100644
--- a/llvm/test/MC/AMDGPU/mai-gfx940.s
+++ b/llvm/test/MC/AMDGPU/mai-gfx940.s
@@ -262,6 +262,54 @@ v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[34:65] blgp:7
v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[34:65] blgp:7
// GFX940: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, v[34:65] blgp:7 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x8a,0xe4]
+v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15]
+// GFX940: v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xd6,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15]
+// GFX940: v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15]
+// GFX940: v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xd6,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15]
+// GFX940: v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5
+// GFX940: v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_i32_32x32x16i8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5
+// GFX940: v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_i32_32x32x16i8 v[0:15], v[2:3], v[4:5], v[0:15] blgp:5
+// GFX940: v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15] blgp:5 ; encoding: [0x00,0x00,0xd6,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_i32_16x16x32_i8 v[0:3], v[2:3], v[4:5], v[0:3]
+// GFX940: v_mfma_i32_16x16x32_i8 v[0:3], v[2:3], v[4:5], v[0:3] ; encoding: [0x00,0x00,0xd7,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3]
+// GFX940: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] ; encoding: [0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5
+// GFX940: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_i32_16x16x32i8 v[0:3], v[2:3], v[4:5], v[0:3] blgp:5
+// GFX940: v_mfma_i32_16x16x32_i8 v[0:3], v[2:3], v[4:5], v[0:3] blgp:5 ; encoding: [0x00,0x00,0xd7,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_i32_16x16x32i8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5
+// GFX940: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[2:3], v[4:5], v[34:65]
// GFX940: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[2:3], v[4:5], v[34:65] ; encoding: [0x00,0x00,0xdd,0xd3,0x02,0x09,0x8a,0x04]
// GFX90A: error: instruction not supported on this GPU
@@ -379,3 +427,35 @@ v_mfma_f32_16x16x16bf16_1k v[0:3], v[2:3], v[4:5], v[2:5]
v_mfma_f32_16x16x16bf16_1k a[0:3], v[2:3], v[4:5], a[2:5]
// GFX940: v_mfma_f32_16x16x16_bf16 a[0:3], v[2:3], v[4:5], a[2:5] ; encoding: [0x00,0x80,0xe1,0xd3,0x02,0x09,0x0a,0x04]
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5]
+// GFX940: v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5] ; encoding: [0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], v[2:3], v[4:5], v[2:5]
+// GFX940: v_mfma_f32_16x16x8_xf32 v[0:3], v[2:3], v[4:5], v[2:5] ; encoding: [0x00,0x00,0xbe,0xd3,0x02,0x09,0x0a,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8xf32 a[0:3], v[2:3], v[4:5], a[2:5]
+// GFX940: v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5] ; encoding: [0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8xf32 v[0:3], v[2:3], v[4:5], v[2:5]
+// GFX940: v_mfma_f32_16x16x8_xf32 v[0:3], v[2:3], v[4:5], v[2:5] ; encoding: [0x00,0x00,0xbe,0xd3,0x02,0x09,0x0a,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32 v[0:15], v[2:3], v[4:5], v[18:33]
+// GFX940: v_mfma_f32_32x32x4_xf32 v[0:15], v[2:3], v[4:5], v[18:33] ; encoding: [0x00,0x00,0xbf,0xd3,0x02,0x09,0x4a,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[18:33]
+// GFX940: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[18:33] ; encoding: [0x00,0x80,0xbf,0xd3,0x02,0x09,0x4a,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4xf32 v[0:15], v[2:3], v[4:5], v[18:33]
+// GFX940: v_mfma_f32_32x32x4_xf32 v[0:15], v[2:3], v[4:5], v[18:33] ; encoding: [0x00,0x00,0xbf,0xd3,0x02,0x09,0x4a,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4xf32 a[0:15], v[2:3], v[4:5], a[18:33]
+// GFX940: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[18:33] ; encoding: [0x00,0x80,0xbf,0xd3,0x02,0x09,0x4a,0x04]
+// GFX90A: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt b/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt
index 9c0ff56743f87..a131a5d2aefbf 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt
@@ -3,6 +3,24 @@
# GFX940: v_accvgpr_write_b32 a10, s20 ; encoding: [0x0a,0x40,0xd9,0xd3,0x14,0x00,0x00,0x18]
0x0a,0x40,0xd9,0xd3,0x14,0x00,0x00,0x18
+# GFX940: v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xd6,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x00,0xd6,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0xa4]
+0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0xa4
+
+# GFX940: v_mfma_i32_16x16x32_i8 v[0:3], v[2:3], v[4:5], v[0:3] ; encoding: [0x00,0x00,0xd7,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x00,0xd7,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] ; encoding: [0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0xa4]
+0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0xa4
+
# GFX940: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[2:3], v[4:5], v[2:33] ; encoding: [0x00,0x00,0xdd,0xd3,0x02,0x09,0x0a,0x04]
0x00,0x00,0xdd,0xd3,0x02,0x09,0x0a,0x04
@@ -32,3 +50,15 @@
# GFX940: v_mfma_f32_16x16x16_bf16 a[0:3], v[2:3], v[4:5], a[2:5] ; encoding: [0x00,0x80,0xe1,0xd3,0x02,0x09,0x0a,0x04]
0x00,0x80,0xe1,0xd3,0x02,0x09,0x0a,0x04
+
+# GFX940: v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5] ; encoding: [0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04]
+0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04
+
+# GFX940: v_mfma_f32_16x16x8_xf32 v[0:3], v[2:3], v[4:5], v[2:5] ; encoding: [0x00,0x00,0xbe,0xd3,0x02,0x09,0x0a,0x04]
+0x00,0x00,0xbe,0xd3,0x02,0x09,0x0a,0x04
+
+# GFX940: v_mfma_f32_32x32x4_xf32 v[0:15], v[2:3], v[4:5], v[2:17] ; encoding: [0x00,0x00,0xbf,0xd3,0x02,0x09,0x0a,0x04]
+0x00,0x00,0xbf,0xd3,0x02,0x09,0x0a,0x04
+
+# GFX940: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[2:17] ; encoding: [0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04]
+0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04
More information about the cfe-commits mailing list