[llvm] AMDGPU: Support packed bf16 instructions on gfx1250 (PR #150283)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 23 11:08:15 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Changpeng Fang (changpeng)
Changes:
---
Patch is 51.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150283.diff
8 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+10)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h (+3)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+6)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+2)
- (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+13)
- (modified) llvm/test/CodeGen/AMDGPU/bf16-math.ll (+363)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s (+301)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt (+225)
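
For context (not part of the patch), a minimal sketch of the kind of IR the new patterns are intended to select, assuming a gfx1250 target (e.g. `llc -mtriple=amdgcn -mcpu=gfx1250`):

```llvm
; Hypothetical sample, not taken from the patch: exercises the new packed bf16 selection.
define <2 x bfloat> @pk_add(<2 x bfloat> %a, <2 x bfloat> %b) {
  ; Expected to select v_pk_add_bf16 per the updated bf16-math.ll checks.
  %add = fadd <2 x bfloat> %a, %b
  ret <2 x bfloat> %add
}

define <2 x bfloat> @pk_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
  ; Expected to select v_pk_fma_bf16.
  %fma = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
  ret <2 x bfloat> %fma
}

declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
```

With the patch applied, these functions are expected to lower to `v_pk_add_bf16` and `v_pk_fma_bf16`, matching the new checks added to bf16-math.ll in the diff below.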
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index e4e7bdce950ac..1c7ee724fef09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -559,6 +559,12 @@ def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts",
"Has bf16 conversion instructions"
>;
+def FeatureBF16PackedInsts : SubtargetFeature<"bf16-pk-insts",
+ "HasBF16PackedInsts",
+ "true",
+ "Has bf16 packed instructions (fma, add, mul, max, min)"
+>;
+
def FeatureVOP3P : SubtargetFeature<"vop3p",
"HasVOP3PInsts",
"true",
@@ -1989,6 +1995,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureTransposeLoadF4F6Insts,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
+ FeatureBF16PackedInsts,
FeatureCvtPkF16F32Inst,
FeatureMinimum3Maximum3PKF16,
FeaturePrngInst,
@@ -2472,6 +2479,9 @@ def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">,
AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>;
+def HasBF16PackedInsts : Predicate<"Subtarget->hasBF16PackedInsts()">,
+ AssemblerPredicate<(all_of FeatureBF16PackedInsts)>;
+
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<(all_of FeatureVOP3P)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 1e44be8e47201..6878744496cfe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -61,6 +61,7 @@ class AMDGPUSubtarget {
bool EnableRealTrue16Insts = false;
bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
+ bool HasBF16PackedInsts = false;
bool HasMadMixInsts = false;
bool HasMadMacF32Insts = false;
bool HasDsSrc2Insts = false;
@@ -209,6 +210,8 @@ class AMDGPUSubtarget {
return HasBF16ConversionInsts;
}
+ bool hasBF16PackedInsts() const { return HasBF16PackedInsts; }
+
bool hasMadMixInsts() const {
return HasMadMixInsts;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d65c3ae76566b..441034b508c10 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -944,6 +944,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
}
+ if (Subtarget->hasBF16PackedInsts()) {
+ setOperationAction(
+ {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
+ MVT::v2bf16, Legal);
+ }
+
if (Subtarget->hasBF16TransInsts()) {
setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index bd4995b3c6e6f..b8537513ce986 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2873,10 +2873,12 @@ def VOP_I16_I32 : VOPProfile <[i16, i32, untyped, untyped]>;
def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>;
+def VOP_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, untyped]>;
def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>;
def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>;
def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>;
+def VOP_V2BF16_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, v2bf16]>;
def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>;
def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index ed8e547419ceb..43f9bc946e6fe 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1196,6 +1196,14 @@ let isCommutable = 1, isReMaterializable = 1 in {
let SubtargetPredicate = HasPkMovB32, isAsCheapAsAMove = 1 in
defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
+
+ let SubtargetPredicate = HasBF16PackedInsts in {
+ defm V_PK_ADD_BF16 : VOP3PInst<"v_pk_add_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fadd>;
+ defm V_PK_MUL_BF16 : VOP3PInst<"v_pk_mul_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fmul>;
+ defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>;
+ defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>;
+ defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>;
+ }
} // End isCommutable = 1, isReMaterializable = 1
def : AMDGPUMnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
@@ -2222,6 +2230,11 @@ defm V_PK_MAX3_I16 : VOP3P_Real_gfx1250<0x2f>;
defm V_PK_MAX3_U16 : VOP3P_Real_gfx1250<0x30>;
defm V_PK_MIN3_I16 : VOP3P_Real_gfx1250<0x31>;
defm V_PK_MIN3_U16 : VOP3P_Real_gfx1250<0x32>;
+defm V_PK_FMA_BF16 : VOP3P_Real_gfx1250<0x11>;
+defm V_PK_ADD_BF16 : VOP3P_Real_gfx1250<0x23>;
+defm V_PK_MUL_BF16 : VOP3P_Real_gfx1250<0x2a>;
+defm V_PK_MIN_NUM_BF16 : VOP3P_Real_gfx1250<0x2b>;
+defm V_PK_MAX_NUM_BF16 : VOP3P_Real_gfx1250<0x2c>;
defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
index b49614d05700a..1ef8d10ffb8fd 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
@@ -24,6 +24,364 @@ define amdgpu_ps void @llvm_sqrt_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
ret void
}
+define amdgpu_ps void @v_test_add_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_test_add_v2bf16_vv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, v3
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fadd <2 x bfloat> %a, %b
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_add_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_add_v2bf16_vs:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fadd <2 x bfloat> %a, %b
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_add_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_add_v2bf16_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, s0, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fadd <2 x bfloat> %a, %b
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_add_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_add_v2bf16_vc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, 2.0 op_sel_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fadd <2 x bfloat> %a, <bfloat 2.0, bfloat 2.0>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_add_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_add_v2bf16_vl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, 0x42c83f80, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fadd <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_test_sub_v2bf16_vv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> %a, %b
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_sub_v2bf16_vs:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, s0 neg_lo:[0,1] neg_hi:[0,1]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> %a, %b
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_sub_v2bf16_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, s0, s1 neg_lo:[0,1] neg_hi:[0,1]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> %a, %b
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_sub_v2bf16_vc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, -2.0 op_sel_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> %a, <bfloat 2.0, bfloat 2.0>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_sub_v2bf16_vl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, 0xc2c8bf80, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_lv(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_sub_v2bf16_lv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, 0x42c83f80, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> <bfloat 1.0, bfloat 100.0>, %a
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_sub_v2bf16_iv(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_sub_v2bf16_iv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_bf16 v2, v2, 1.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %add = fsub <2 x bfloat> <bfloat 1.0, bfloat 1.0>, %a
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_test_mul_v2bf16_vv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul <2 x bfloat> %a, %b
+ store <2 x bfloat> %mul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_mul_v2bf16_vs:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul <2 x bfloat> %a, %b
+ store <2 x bfloat> %mul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_mul_v2bf16_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul <2 x bfloat> %a, %b
+ store <2 x bfloat> %mul, ptr addrspace(1) %out
+ ret void
+}
+
+; FIXME: We can do better folding inline constant instead of a literal.
+
+define amdgpu_ps void @v_test_mul_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_mul_v2bf16_vc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul <2 x bfloat> %a, <bfloat 0.5, bfloat 0.5>
+ store <2 x bfloat> %mul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_mul_v2bf16_vl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
+ store <2 x bfloat> %mul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_min_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_test_min_v2bf16_vv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_min_num_bf16 v2, v2, v3
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %min = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ store <2 x bfloat> %min, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_min_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_min_v2bf16_vs:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_min_num_bf16 v2, v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %min = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ store <2 x bfloat> %min, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_min_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_min_v2bf16_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_min_num_bf16 v2, s0, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %min = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ store <2 x bfloat> %min, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_min_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_min_v2bf16_vc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_min_num_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %min = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 0.5, bfloat 0.5>)
+ store <2 x bfloat> %min, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_min_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_min_v2bf16_vl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_min_num_bf16 v2, 0x42c83f80, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %min = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 1.0, bfloat 100.0>)
+ store <2 x bfloat> %min, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_max_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_test_max_v2bf16_vv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v2, v2, v3
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ store <2 x bfloat> %max, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_max_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_max_v2bf16_vs:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v2, v2, s0
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ store <2 x bfloat> %max, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_max_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_max_v2bf16_ss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v2, s0, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ store <2 x bfloat> %max, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_max_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_max_v2bf16_vc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 0.5, bfloat 0.5>)
+ store <2 x bfloat> %max, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_max_v2bf16_vl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v2, 0x42c83f80, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 1.0, bfloat 100.0>)
+ store <2 x bfloat> %max, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+; GCN-LABEL: v_test_fma_v2bf16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, v3, v4
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %fma = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+ store <2 x bfloat> %fma, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_fma_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
+; GCN-LABEL: v_test_fma_v2bf16_vss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %fma = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+ store <2 x bfloat> %fma, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_fma_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
+; GCN-LABEL: v_test_fma_v2bf16_sss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_fma_bf16 v2, s0, s1, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %fma = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+ store <2 x bfloat> %fma, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_fma_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_fma_v2bf16_vsc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %fma = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> <bfloat 0.5, bfloat 0.5>)
+ store <2 x bfloat> %fma, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_fma_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_fma_v2bf16_vll:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s0, 0x42c83f80
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0x43484000
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %fma = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 1.0, bfloat 100.0>, <2 x bfloat> <bfloat 2.0, bfloat 200.0>)
+ store <2 x bfloat> %fma, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) {
; GCN-LABEL: llvm_log2_bf16_v:
; GCN: ; %bb.0:
@@ -68,6 +426,11 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
ret void
}
+
+declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
declare bfloat @llvm.sqrt.bf16(bfloat)
declare bfloat @llvm.log2.bf16(bfloat)
declare bfloat @llvm.exp2.bf16(bfloat)
+
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vo...
[truncated]
``````````
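
One detail worth noting from the new bf16-math.ll checks: there is no dedicated packed bf16 subtract opcode, so `fsub` on `<2 x bfloat>` is expected to fold into `v_pk_add_bf16` with source-negation modifiers (`neg_lo`/`neg_hi`), as the `v_test_sub_v2bf16_*` tests show. A minimal sketch (not from the patch):

```llvm
; Hypothetical sample: expected to select v_pk_add_bf16 with neg_lo:[0,1] neg_hi:[0,1]
; rather than a separate sub instruction (see v_test_sub_v2bf16_vv above).
define <2 x bfloat> @pk_sub(<2 x bfloat> %a, <2 x bfloat> %b) {
  %sub = fsub <2 x bfloat> %a, %b
  ret <2 x bfloat> %sub
}
```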
https://github.com/llvm/llvm-project/pull/150283