[clang] 2695f0a - [AMDGPU] Support for gfx940 fp8 mfma
Stanislav Mekhanoshin via cfe-commits
cfe-commits at lists.llvm.org
Mon Jul 18 12:11:40 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-07-18T11:49:56-07:00
New Revision: 2695f0a688e9d26fcb0f3a4b686a2783f2eb145c
URL: https://github.com/llvm/llvm-project/commit/2695f0a688e9d26fcb0f3a4b686a2783f2eb145c
DIFF: https://github.com/llvm/llvm-project/commit/2695f0a688e9d26fcb0f3a4b686a2783f2eb145c.diff
LOG: [AMDGPU] Support for gfx940 fp8 mfma
Differential Revision: https://reviews.llvm.org/D129906
Added:
Modified:
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/VOP3PInstructions.td
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
llvm/test/MC/AMDGPU/mai-gfx940.s
llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e9f25d783e596..e992e22ca527a 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -339,6 +339,14 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x32_i8, "V4iWiWiV4iIiIiIi", "nc",
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x16_i8, "V16iWiWiV16iIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8_xf32, "V4fV2fV2fV4fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4_xf32, "V16fV2fV2fV16fIiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8, "V4fWiWiV4fIiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8, "V4fWiWiV4fIiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8, "V4fWiWiV4fIiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8, "V4fWiWiV4fIiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8, "V16fWiWiV16fIiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8, "V16fWiWiV16fIiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8, "V16fWiWiV16fIiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8, "V16fWiWiV16fIiIiIi", "nc", "fp8-insts")
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x32_f16, "V4fV4hV8hV4fiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_f16, "V16fV4hV8hV16fiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x32_bf16, "V4fV4sV8sV4fiIiIi", "nc", "mai-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index 8e3cc7e382e90..192bb1062381d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -251,6 +251,62 @@ void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c)
*out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0);
}
+// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_bf8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x32_bf8_bf8(global v4f* out, long a, long b, v4f c)
+{
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_fp8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x32_bf8_fp8(global v4f* out, long a, long b, v4f c)
+{
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_bf8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x32_fp8_bf8(global v4f* out, long a, long b, v4f c)
+{
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_fp8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x32_fp8_fp8(global v4f* out, long a, long b, v4f c)
+{
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_bf8
+// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x16_bf8_bf8(global v16f* out, long a, long b, v16f c)
+{
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_fp8
+// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x16_bf8_fp8(global v16f* out, long a, long b, v16f c)
+{
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_bf8
+// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x16_fp8_bf8(global v16f* out, long a, long b, v16f c)
+{
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_fp8
+// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x16_fp8_fp8(global v16f* out, long a, long b, v16f c)
+{
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, b, c, 0, 0, 0);
+}
+
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_f16
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x32_f16(global v4f* out, v4h a, v8h b, v4f c, int idx)
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
index 243d279b8c792..093642218b480 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
@@ -39,6 +39,62 @@ void test_mfma_f32_32x32x4xf32(global v16f* out, v2f a, v2f b, v16f c, int d)
*out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4_xf32' must be a constant integer}}
}
+void test_mfma_f32_16x16x32_bf8_bf8(global v4f* out, long a, long b, v4f c, int d)
+{
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8' must be a constant integer}}
+}
+
+void test_mfma_f32_16x16x32_bf8_fp8(global v4f* out, long a, long b, v4f c, int d)
+{
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8' must be a constant integer}}
+}
+
+void test_mfma_f32_16x16x32_fp8_bf8(global v4f* out, long a, long b, v4f c, int d)
+{
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8' must be a constant integer}}
+}
+
+void test_mfma_f32_16x16x32_fp8_fp8(global v4f* out, long a, long b, v4f c, int d)
+{
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8' must be a constant integer}}
+}
+
+void test_mfma_f32_32x32x16_bf8_bf8(global v16f* out, long a, long b, v16f c, int d)
+{
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8' must be a constant integer}}
+}
+
+void test_mfma_f32_32x32x16_bf8_fp8(global v16f* out, long a, long b, v16f c, int d)
+{
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8' must be a constant integer}}
+}
+
+void test_mfma_f32_32x32x16_fp8_bf8(global v16f* out, long a, long b, v16f c, int d)
+{
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8' must be a constant integer}}
+}
+
+void test_mfma_f32_32x32x16_fp8_fp8(global v16f* out, long a, long b, v16f c, int d)
+{
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8' must be a constant integer}}
+}
+
void test_smfmac_f32_16x16x32_f16(global v4f* out, v4h a, v8h b, v4f c, int idx, int d)
{
*out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x32_f16' must be a constant integer}}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 2936e495bb2df..8261ca2326e91 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2304,6 +2304,17 @@ def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, ll
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;
+class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
+ AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;
+
+multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
+ foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
+ def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
+}
+
+defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
+defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;
+
// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
ClangBuiltin<!subst("int", "__builtin", NAME)>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 0830cbd919a0c..6785a40a412b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4426,7 +4426,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
- case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: {
+ case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 8297635d7bb2a..d260e005da4dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -340,6 +340,14 @@ def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_fp8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_f16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_f16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index ffbd08514deba..ad8908442b475 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2671,6 +2671,8 @@ def VOP_V4I32_I64_I64_V4I32 : VOPProfile <[v4i32, i64, i64, v4i32]>;
def VOP_V16I32_I64_I64_V16I32 : VOPProfile <[v16i32, i64, i64, v16i32]>;
def VOP_V4F32_V2F32_V2F32_V4F32 : VOPProfile <[v4f32, v2f32, v2f32, v4f32]>;
def VOP_V16F32_V2F32_V2F32_V16F32 : VOPProfile <[v16f32, v2f32, v2f32, v16f32]>;
+def VOP_V4F32_I64_I64_V4F32 : VOPProfile <[v4f32, i64, i64, v4f32]>;
+def VOP_V16F32_I64_I64_V16F32 : VOPProfile <[v16f32, i64, i64, v16f32]>;
def VOP_V4F32_V4F16_V8F16_I32 : VOPProfile <[v4f32, v4f16, v8f16, i32]>;
def VOP_V16F32_V4F16_V8F16_I32 : VOPProfile <[v16f32, v4f16, v8f16, i32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 59ce532af59b4..666ccccb719af 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -493,6 +493,8 @@ def VOPProfileMAI_I32_I64_X16 : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, A
def VOPProfileMAI_I32_I64_X32 : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X16 : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X32 : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
+def VOPProfileMAI_F32_I64_X32 : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
+def VOPProfileMAI_F32_I64_X16 : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, VISrc_128_f32, VDst_128>;
def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, VISrc_512_f32, VDst_512>;
@@ -515,6 +517,8 @@ def VOPProfileMAI_I32_I64_X16_VCD : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32,
def VOPProfileMAI_I32_I64_X32_VCD : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, VISrc_512_b32, VDst_512, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X16_VCD : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
+def VOPProfileMAI_F32_I64_X32_VCD : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
+def VOPProfileMAI_F32_I64_X16_VCD : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
@@ -638,6 +642,14 @@ let Predicates = [isGFX940Plus], is_gfx940_xdl = 1 in {
defm V_MFMA_I32_16X16X32I8 : MAIInst<"v_mfma_i32_16x16x32i8", "I32_I64_X16", int_amdgcn_mfma_i32_16x16x32_i8>;
defm V_MFMA_F32_16X16X8XF32 : MAIInst<"v_mfma_f32_16x16x8xf32", "F32_V2F32_X16", int_amdgcn_mfma_f32_16x16x8_xf32>;
defm V_MFMA_F32_32X32X4XF32 : MAIInst<"v_mfma_f32_32x32x4xf32", "F32_V2F32_X32", int_amdgcn_mfma_f32_32x32x4_xf32>;
+ defm V_MFMA_F32_16X16X32_BF8_BF8 : MAIInst<"v_mfma_f32_16x16x32_bf8_bf8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
+ defm V_MFMA_F32_16X16X32_BF8_FP8 : MAIInst<"v_mfma_f32_16x16x32_bf8_fp8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
+ defm V_MFMA_F32_16X16X32_FP8_BF8 : MAIInst<"v_mfma_f32_16x16x32_fp8_bf8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
+ defm V_MFMA_F32_16X16X32_FP8_FP8 : MAIInst<"v_mfma_f32_16x16x32_fp8_fp8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_fp8_fp8>;
+ defm V_MFMA_F32_32X32X16_BF8_BF8 : MAIInst<"v_mfma_f32_32x32x16_bf8_bf8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_bf8_bf8>;
+ defm V_MFMA_F32_32X32X16_BF8_FP8 : MAIInst<"v_mfma_f32_32x32x16_bf8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
+ defm V_MFMA_F32_32X32X16_FP8_BF8 : MAIInst<"v_mfma_f32_32x32x16_fp8_bf8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
+ defm V_MFMA_F32_32X32X16_FP8_FP8 : MAIInst<"v_mfma_f32_32x32x16_fp8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
} // End Predicates = [isGFX940Plus], is_gfx940_xdl = 1
multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> {
@@ -1121,6 +1133,14 @@ defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x
defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
defm V_MFMA_F32_16X16X8XF32 : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">;
defm V_MFMA_F32_32X32X4XF32 : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">;
+defm V_MFMA_F32_16X16X32_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x70>;
+defm V_MFMA_F32_16X16X32_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x71>;
+defm V_MFMA_F32_16X16X32_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x72>;
+defm V_MFMA_F32_16X16X32_FP8_FP8 : VOP3P_Real_MFMA_gfx940 <0x73>;
+defm V_MFMA_F32_32X32X16_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x74>;
+defm V_MFMA_F32_32X32X16_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x75>;
+defm V_MFMA_F32_32X32X16_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x76>;
+defm V_MFMA_F32_32X32X16_FP8_FP8 : VOP3P_Real_MFMA_gfx940 <0x77>;
defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5d, "v_mfma_f32_32x32x4_2b_bf16">;
defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5e, "v_mfma_f32_16x16x4_4b_bf16">;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
index 23f6f2cb54dd7..3474cace10c21 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
@@ -7,6 +7,14 @@ declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i3
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64, i64, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64, i64, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64, i64, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64, i64, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64, i64, <16 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64, i64, <16 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64, i64, <16 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64, i64, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half>, <8 x half>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half>, <8 x half>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16>, <8 x i16>, <4 x float>, i32, i32, i32)
@@ -86,6 +94,150 @@ bb:
ret void
}
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_bf8:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_f32_16x16x32_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_f32_16x16x32_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(<4 x float> addrspace(1)* %arg) #0 {
+bb:
+ %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
+ store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_fp8:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_f32_16x16x32_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_f32_16x16x32_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(<4 x float> addrspace(1)* %arg) #0 {
+bb:
+ %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
+ store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_bf8:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_f32_16x16x32_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_f32_16x16x32_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(<4 x float> addrspace(1)* %arg) #0 {
+bb:
+ %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
+ store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_fp8:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_f32_16x16x32_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_f32_16x16x32_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(<4 x float> addrspace(1)* %arg) #0 {
+bb:
+ %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
+ store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_bf8:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_f32_32x32x16_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_f32_32x32x16_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(<16 x float> addrspace(1)* %arg) #0 {
+bb:
+ %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
+ store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_fp8:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_f32_32x32x16_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_f32_32x32x16_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(<16 x float> addrspace(1)* %arg) #0 {
+bb:
+ %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
+ store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_bf8:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_f32_32x32x16_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_f32_32x32x16_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(<16 x float> addrspace(1)* %arg) #0 {
+bb:
+ %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
+ store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_fp8:
+; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
+; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
+; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
+; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
+; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
+; GFX940: v_mfma_f32_32x32x16_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GISEL: v_mfma_f32_32x32x16_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
+; GCN-NOT: v_accvgpr_read_b32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
+define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(<16 x float> addrspace(1)* %arg) #0 {
+bb:
+ %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
+ store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ ret void
+}
+
; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_f16:
; GCN: s_load_dwordx4 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
index d47a4c38c0956..a1f1572b48a62 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
@@ -5,6 +5,14 @@ declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i3
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64, i64, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64, i64, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64, i64, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64, i64, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64, i64, <16 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64, i64, <16 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64, i64, <16 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64, i64, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half>, <8 x half>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half>, <8 x half>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16>, <8 x i16>, <4 x float>, i32, i32, i32)
@@ -52,6 +60,86 @@ bb:
ret void
}
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_bf8:
+; GCN: v_mfma_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(<4 x float> addrspace(1)* %arg) {
+bb:
+ %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0)
+ store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_fp8:
+; GCN: v_mfma_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(<4 x float> addrspace(1)* %arg) {
+bb:
+ %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0)
+ store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_bf8:
+; GCN: v_mfma_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(<4 x float> addrspace(1)* %arg) {
+bb:
+ %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0)
+ store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_fp8:
+; GCN: v_mfma_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(<4 x float> addrspace(1)* %arg) {
+bb:
+ %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0)
+ store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_bf8:
+; GCN: v_mfma_f32_32x32x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(<16 x float> addrspace(1)* %arg) {
+bb:
+ %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0)
+ store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_fp8:
+; GCN: v_mfma_f32_32x32x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(<16 x float> addrspace(1)* %arg) {
+bb:
+ %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0)
+ store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_bf8:
+; GCN: v_mfma_f32_32x32x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(<16 x float> addrspace(1)* %arg) {
+bb:
+ %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0)
+ store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_fp8:
+; GCN: v_mfma_f32_32x32x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(<16 x float> addrspace(1)* %arg) {
+bb:
+ %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0)
+ store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ ret void
+}
+
; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_f16:
; GCN: v_smfmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(<4 x float> addrspace(1)* %arg, <4 x half> %a, <8 x half> %b, i32 %idx) {
diff --git a/llvm/test/MC/AMDGPU/mai-gfx940.s b/llvm/test/MC/AMDGPU/mai-gfx940.s
index 52b035fda971b..15c974d78ea29 100644
--- a/llvm/test/MC/AMDGPU/mai-gfx940.s
+++ b/llvm/test/MC/AMDGPU/mai-gfx940.s
@@ -468,6 +468,102 @@ v_mfma_f32_32x32x4xf32 a[0:15], v[2:3], v[4:5], a[18:33]
// GFX940: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[18:33] ; encoding: [0x00,0x80,0xbf,0xd3,0x02,0x09,0x4a,0x04]
// GFX90A: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[2:3], v[4:5], v[0:3]
+// GFX940: v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[2:3], v[4:5], v[0:3] ; encoding: [0x00,0x00,0xf0,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3]
+// GFX940: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] ; encoding: [0x00,0x80,0xf0,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5
+// GFX940: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xf0,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[2:3], v[4:5], v[0:3]
+// GFX940: v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[2:3], v[4:5], v[0:3] ; encoding: [0x00,0x00,0xf1,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3]
+// GFX940: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] ; encoding: [0x00,0x80,0xf1,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5
+// GFX940: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xf1,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[2:3], v[4:5], v[0:3]
+// GFX940: v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[2:3], v[4:5], v[0:3] ; encoding: [0x00,0x00,0xf2,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3]
+// GFX940: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] ; encoding: [0x00,0x80,0xf2,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5
+// GFX940: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xf2,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[2:3], v[4:5], v[0:3]
+// GFX940: v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[2:3], v[4:5], v[0:3] ; encoding: [0x00,0x00,0xf3,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3]
+// GFX940: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] ; encoding: [0x00,0x80,0xf3,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5
+// GFX940: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xf3,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[2:3], v[4:5], v[0:15]
+// GFX940: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xf4,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[2:3], v[4:5], a[0:15]
+// GFX940: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xf4,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5
+// GFX940: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xf4,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[2:3], v[4:5], v[0:15]
+// GFX940: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xf5,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[2:3], v[4:5], a[0:15]
+// GFX940: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xf5,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5
+// GFX940: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xf5,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[2:3], v[4:5], v[0:15]
+// GFX940: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xf6,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[2:3], v[4:5], a[0:15]
+// GFX940: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xf6,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5
+// GFX940: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xf6,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[2:3], v[4:5], v[0:15]
+// GFX940: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xf7,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[2:3], v[4:5], a[0:15]
+// GFX940: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xf7,0xd3,0x02,0x09,0x02,0x04]
+// GFX90A: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5
+// GFX940: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xf7,0xd3,0x02,0x09,0x02,0xa4]
+// GFX90A: error: instruction not supported on this GPU
+
//===----------------------------------------------------------------------===//
// SMFMAC opcodes.
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt b/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt
index bd3a28042027b..24904215b4008 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt
@@ -63,6 +63,78 @@
# GFX940: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[2:17] ; encoding: [0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04]
0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04
+# GFX940: v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[2:3], v[4:5], v[0:3] ; encoding: [0x00,0x00,0xf0,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x00,0xf0,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] ; encoding: [0x00,0x80,0xf0,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x80,0xf0,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xf0,0xd3,0x02,0x09,0x02,0xa4]
+0x00,0x80,0xf0,0xd3,0x02,0x09,0x02,0xa4
+
+# GFX940: v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[2:3], v[4:5], v[0:3] ; encoding: [0x00,0x00,0xf1,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x00,0xf1,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] ; encoding: [0x00,0x80,0xf1,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x80,0xf1,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xf1,0xd3,0x02,0x09,0x02,0xa4]
+0x00,0x80,0xf1,0xd3,0x02,0x09,0x02,0xa4
+
+# GFX940: v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[2:3], v[4:5], v[0:3] ; encoding: [0x00,0x00,0xf2,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x00,0xf2,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] ; encoding: [0x00,0x80,0xf2,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x80,0xf2,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xf2,0xd3,0x02,0x09,0x02,0xa4]
+0x00,0x80,0xf2,0xd3,0x02,0x09,0x02,0xa4
+
+# GFX940: v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[2:3], v[4:5], v[0:3] ; encoding: [0x00,0x00,0xf3,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x00,0xf3,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] ; encoding: [0x00,0x80,0xf3,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x80,0xf3,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xf3,0xd3,0x02,0x09,0x02,0xa4]
+0x00,0x80,0xf3,0xd3,0x02,0x09,0x02,0xa4
+
+# GFX940: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xf4,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x00,0xf4,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xf4,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x80,0xf4,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xf4,0xd3,0x02,0x09,0x02,0xa4]
+0x00,0x80,0xf4,0xd3,0x02,0x09,0x02,0xa4
+
+# GFX940: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xf5,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x00,0xf5,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xf5,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x80,0xf5,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xf5,0xd3,0x02,0x09,0x02,0xa4]
+0x00,0x80,0xf5,0xd3,0x02,0x09,0x02,0xa4
+
+# GFX940: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xf6,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x00,0xf6,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xf6,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x80,0xf6,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xf6,0xd3,0x02,0x09,0x02,0xa4]
+0x00,0x80,0xf6,0xd3,0x02,0x09,0x02,0xa4
+
+# GFX940: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xf7,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x00,0xf7,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xf7,0xd3,0x02,0x09,0x02,0x04]
+0x00,0x80,0xf7,0xd3,0x02,0x09,0x02,0x04
+
+# GFX940: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xf7,0xd3,0x02,0x09,0x02,0xa4]
+0x00,0x80,0xf7,0xd3,0x02,0x09,0x02,0xa4
+
# GFX940: v_smfmac_f32_16x16x32_f16 v[10:13], a[2:3], v[4:7], v0 cbsz:3 abid:1 ; encoding: [0x0a,0x0b,0xe2,0xd3,0x02,0x09,0x02,0x0c]
0x0a,0x0b,0xe2,0xd3,0x02,0x09,0x02,0x0c
More information about the cfe-commits
mailing list