[llvm] AMDGPU: Use defset to cleanup marking MFMA intrinsics as divergent (PR #85915)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 20 03:10:24 PDT 2024
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/85915
(No description was provided for this pull request.)
>From 122bee59ebe6c2ed17356cc956d1b0ca9f6306ae Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 20 Mar 2024 15:19:29 +0530
Subject: [PATCH] AMDGPU: Use defset to cleanup marking MFMA intrinsics as
divergent
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 86 ++++++++++--------
.../Target/AMDGPU/AMDGPUSearchableTables.td | 87 +++----------------
2 files changed, 59 insertions(+), 114 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 051e603c0819d2..b24dab4ba72b93 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2653,6 +2653,8 @@ class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
// The content of the other 16-bit half is preserved from the input.
+
+defset list<Intrinsic> AMDGPUMFMAIntrinsicsGFX11 = {
def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
@@ -2668,6 +2670,7 @@ def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, l
// GFX12: The op_sel bit must be 0.
def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
+}
//===----------------------------------------------------------------------===//
// GFX12 Intrinsics
@@ -2687,20 +2690,6 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
-
-// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
-//
-// These operations perform a matrix multiplication and accumulation of
-// the form: D = A * B + C .
-
-// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
-def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
-// A and B are <16 x iu4>.
-def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
-
// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics
//
// These operations perform a sparse matrix multiplication and accumulation of
@@ -2734,6 +2723,20 @@ class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType I
[IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
>;
+defset list<Intrinsic> AMDGPUMFMAIntrinsicsGFX12 = {
+// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
+//
+// These operations perform a matrix multiplication and accumulation of
+// the form: D = A * B + C .
+
+// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
+def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+// A and B are <16 x iu4>.
+def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
+
def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
@@ -2745,6 +2748,7 @@ def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyin
def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+}
def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;
@@ -3012,6 +3016,7 @@ class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
[IntrConvergent, IntrNoMem,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
+defset list<Intrinsic> AMDGPUMFMAIntrinsics908 = {
def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>;
@@ -3032,6 +3037,7 @@ def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v
def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
+}
//===----------------------------------------------------------------------===//
// gfx90a intrinsics
@@ -3043,6 +3049,7 @@ def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
+defset list<Intrinsic> AMDGPUMFMAIntrinsics90A = {
def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;
@@ -3054,25 +3061,12 @@ def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, ll
// source operand.
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;
+}
//===----------------------------------------------------------------------===//
// gfx940 intrinsics
// ===----------------------------------------------------------------------===//
-// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
-def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
- [llvm_v2i16_ty],
- [LLVMQualPointerType<3>, llvm_v2i16_ty],
- [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
- ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
-
-def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
-def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
-def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
-def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;
-
class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;
@@ -3081,9 +3075,6 @@ multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
}
-defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
-defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;
-
// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
ClangBuiltin<!subst("int", "__builtin", NAME)>,
@@ -3093,13 +3084,6 @@ class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
[IntrConvergent, IntrNoMem,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
-def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
-def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
-def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
-def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
-def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
-def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
-
class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;
@@ -3108,8 +3092,34 @@ multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
}
+// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
+def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
+def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
+def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
+ [llvm_v2i16_ty],
+ [LLVMQualPointerType<3>, llvm_v2i16_ty],
+ [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
+ ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
+
+defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
+def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
+def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
+def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
+def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;
+
+defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
+defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;
+
+def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
+def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
+def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
+def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
+def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
+def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
+
defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;
+}
// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
// byte_sel selects byte from srcA.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index bb1c6b73372999..40d432c3b17a9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -354,82 +354,17 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;
def : SourceOfDivergence<int_amdgcn_writelane>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_4x4x4i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x2bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x1f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x4i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x16i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x2bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x1f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>;
-def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_f16>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_f16>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_bf16>;
-def : SourceOfDivergence<int_amdgcn_smfmac_i32_16x16x64_i8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_i32_32x32x32_i8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_f16>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf16>;
-def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>;
-def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>;
-def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>;
-def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x32_iu4>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_f16>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf16>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f16_16x16x32_f16>;
-def : SourceOfDivergence<int_amdgcn_swmmac_bf16_16x16x32_bf16>;
-def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu8>;
-def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu4>;
-def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x64_iu4>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_bf8>;
+foreach intr = AMDGPUMFMAIntrinsics908 in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUMFMAIntrinsics90A in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUMFMAIntrinsics940 in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUMFMAIntrinsicsGFX11 in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUMFMAIntrinsicsGFX12 in
+def : SourceOfDivergence<intr>;
+
def : SourceOfDivergence<int_amdgcn_global_load_tr>;
// The dummy boolean output is divergent from the IR's perspective,
More information about the llvm-commits
mailing list