[llvm] [NVPTX] Add patterns for fma.relu.{f16|bf16} (PR #114977)
Hugh Delaney via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 03:00:50 PST 2024
https://github.com/hdelan updated https://github.com/llvm/llvm-project/pull/114977
>From 4759e15870beee08b63fd978147cda9327716c56 Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney at codeplay.com>
Date: Tue, 5 Nov 2024 12:25:41 +0000
Subject: [PATCH] Add patterns for fma.relu.{f16|bf16}
Add patterns to lower `fma(a, b, c) > 0 ? fma(a, b, c) : 0` to the PTX
fma.rn.relu instruction for the f16 and bf16 types.
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 37 ++
llvm/test/CodeGen/NVPTX/fma-relu.ll | 499 ++++++++++++++++++++++++
2 files changed, 536 insertions(+)
create mode 100644 llvm/test/CodeGen/NVPTX/fma-relu.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 5f6cba397c5352..7da0f6d875ce4c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3917,3 +3917,40 @@ def atomic_thread_fence_seq_cst_cta :
def atomic_thread_fence_acq_rel_cta :
NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
Requires<[hasPTX<60>, hasSM<70>]>;
+
+// Floating-point immediate leaf (any FP type) that matches only positive zero;
+// negative zero is rejected.
+def fpimm0 : FPImmLeaf<fAny, [{
+  return Imm.isPosZero();
+}]>;
+
+// Three-operand f16 instruction printed as fma.rn.relu.f16; gated on
+// useFP16Math, hasPTX<70> and hasSM<80>. It carries no selection pattern of its own.
+let Predicates = [useFP16Math, hasPTX<70>, hasSM<80>] in
+def FMARELU_F16 :
+  NVPTXInst<(outs Int16Regs:$dst),
+            (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+            "fma.rn.relu.f16 \t$dst, $a, $b, $c;", []>;
+// Three-operand bf16 instruction printed as fma.rn.relu.bf16; gated on
+// hasBF16Math, hasPTX<70> and hasSM<80>. It carries no selection pattern of its own.
+let Predicates = [hasBF16Math, hasPTX<70>, hasSM<80>] in
+def FMARELU_BF16 :
+  NVPTXInst<(outs Int16Regs:$dst),
+            (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+            "fma.rn.relu.bf16 \t$dst, $a, $b, $c;", []>;
+// Flush-to-zero form of the f16 fma+relu instruction; the asm string below is
+// printed verbatim.
+// NOTE(review): the PTX ISA spells this instruction fma.rnd{.ftz}.relu.f16,
+// i.e. with .ftz before .relu -- confirm that ptxas accepts the modifier order
+// used here.
+def FMARELU_F16_FTZ :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+ "fma.rn.relu.ftz.f16 \t$dst, $a, $b, $c;", []>,
+ Requires<[useFP16Math, hasPTX<70>, hasSM<80>]>;
+// Flush-to-zero form of the bf16 fma+relu instruction; the asm string below is
+// printed verbatim.
+// NOTE(review): the PTX ISA lists only fma.rnd{.relu}.bf16 for bf16 (no .ftz
+// modifier) -- confirm this instruction string is valid PTX before relying on it.
+def FMARELU_BF16_FTZ :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+ "fma.rn.relu.ftz.bf16 \t$dst, $a, $b, $c;", []>,
+ Requires<[hasBF16Math, hasPTX<70>, hasSM<80>]>;
+
+
+// Select the fma.rn.relu instructions for fmaxnum(fma(a, b, c), +0.0).
+// Each pattern repeats the hasSM<80> requirement and the per-type math
+// predicate of the instruction it selects, so the gating is explicit on the
+// pattern instead of depending on what earlier lowering leaves in the DAG.
+//
+// FTZ variants (listed first so they are tried before the non-FTZ forms).
+def : Pat<(f16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+          (FMARELU_F16_FTZ Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+      Requires<[useFP16Math, allowFMA, doF32FTZ, allowUnsafeFPMath, hasPTX<70>, hasSM<80>]>;
+def : Pat<(bf16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+          (FMARELU_BF16_FTZ Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+      Requires<[hasBF16Math, allowFMA, doF32FTZ, allowUnsafeFPMath, hasPTX<70>, hasSM<80>]>;
+// No FTZ
+def : Pat<(f16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+          (FMARELU_F16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+      Requires<[useFP16Math, allowFMA, allowUnsafeFPMath, hasPTX<70>, hasSM<80>]>;
+def : Pat<(bf16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+          (FMARELU_BF16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+      Requires<[hasBF16Math, allowFMA, allowUnsafeFPMath, hasPTX<70>, hasSM<80>]>;
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu.ll b/llvm/test/CodeGen/NVPTX/fma-relu.ll
new file mode 100644
index 00000000000000..bdff52415f69fe
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fma-relu.ll
@@ -0,0 +1,499 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Default configuration: sm_80, ptx70, unsafe FP math, contraction allowed.
+; NOTE(review): the ptxas-verify invocations in this file omit
+; --enable-unsafe-fp-math, which every FileCheck invocation passes. Presumably
+; the assembly handed to ptxas therefore differs from the checked output --
+; confirm that the fma.rn.relu forms are actually assembled by ptxas.
+; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | %ptxas-verify -arch=sm_80 %}
+
+; Using FTZ should emit fma.ftz.relu
+; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-FTZ
+; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | %ptxas-verify -arch=sm_80 %}
+
+; With -nvptx-fma-level=0 the fma and the max stay separate instructions.
+; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=0 | FileCheck %s --check-prefixes=CHECK-NO-FMA
+
+; SM < 80 or PTX version < 70 should not emit fma{.ftz}.relu
+; NOTE(review): the PTX60 expectations in this file still contain fma.rn.relu,
+; which contradicts the statement above -- confirm that -mattr=+ptx60 really
+; lowers the PTX version when combined with -mcpu=sm_80.
+; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_70 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-SM70
+; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx60 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-PTX60
+
+; llvm.fma.f16 followed by "x > 0 ? x : 0" on half.
+; Only prefixes enabled by a run line of this file are kept; the NO-ARCH and
+; NO-PTX groups were never enabled and have been dropped.
+define half @fma_f16(half %a, half %b, half %c) {
+; CHECK-LABEL: fma_f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
+; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
+; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT: ret;
+;
+; CHECK-FTZ-LABEL: fma_f16(
+; CHECK-FTZ: {
+; CHECK-FTZ-NEXT: .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT: // %bb.0:
+; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
+; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
+; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
+; CHECK-FTZ-NEXT: fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT: ret;
+;
+; CHECK-NO-FMA-LABEL: fma_f16(
+; CHECK-NO-FMA: {
+; CHECK-NO-FMA-NEXT: .reg .b16 %rs<7>;
+; CHECK-NO-FMA-EMPTY:
+; CHECK-NO-FMA-NEXT: // %bb.0:
+; CHECK-NO-FMA-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
+; CHECK-NO-FMA-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
+; CHECK-NO-FMA-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
+; CHECK-NO-FMA-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-NEXT: mov.b16 %rs5, 0x0000;
+; CHECK-NO-FMA-NEXT: max.f16 %rs6, %rs4, %rs5;
+; CHECK-NO-FMA-NEXT: st.param.b16 [func_retval0], %rs6;
+; CHECK-NO-FMA-NEXT: ret;
+;
+; CHECK-SM70-LABEL: fma_f16(
+; CHECK-SM70: {
+; CHECK-SM70-NEXT: .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT: .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT: // %bb.0:
+; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
+; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
+; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
+; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT: max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT: ret;
+;
+; CHECK-PTX60-LABEL: fma_f16(
+; CHECK-PTX60: {
+; CHECK-PTX60-NEXT: .reg .b16 %rs<5>;
+; CHECK-PTX60-EMPTY:
+; CHECK-PTX60-NEXT: // %bb.0:
+; CHECK-PTX60-NEXT: ld.param.b16 %rs1, [fma_f16_param_0];
+; CHECK-PTX60-NEXT: ld.param.b16 %rs2, [fma_f16_param_1];
+; CHECK-PTX60-NEXT: ld.param.b16 %rs3, [fma_f16_param_2];
+; CHECK-PTX60-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-PTX60-NEXT: st.param.b16 [func_retval0], %rs4;
+; CHECK-PTX60-NEXT: ret;
+ %1 = call half @llvm.fma.f16(half %a, half %b, half %c)
+ %2 = fcmp ogt half %1, 0.0
+ %3 = select i1 %2, half %1, half 0.0
+ ret half %3
+}
+
+; Separate fmul + fadd on half followed by "x > 0 ? x : 0".
+; Only prefixes enabled by a run line of this file are kept; the NO-ARCH and
+; NO-PTX groups were never enabled and have been dropped.
+define half @fma_f16_expanded(half %a, half %b, half %c) {
+; CHECK-LABEL: fma_f16_expanded(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
+; CHECK-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
+; CHECK-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT: ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_expanded(
+; CHECK-FTZ: {
+; CHECK-FTZ-NEXT: .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT: // %bb.0:
+; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
+; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
+; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
+; CHECK-FTZ-NEXT: fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT: ret;
+;
+; CHECK-NO-FMA-LABEL: fma_f16_expanded(
+; CHECK-NO-FMA: {
+; CHECK-NO-FMA-NEXT: .reg .b16 %rs<7>;
+; CHECK-NO-FMA-EMPTY:
+; CHECK-NO-FMA-NEXT: // %bb.0:
+; CHECK-NO-FMA-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
+; CHECK-NO-FMA-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
+; CHECK-NO-FMA-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
+; CHECK-NO-FMA-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-NEXT: mov.b16 %rs5, 0x0000;
+; CHECK-NO-FMA-NEXT: max.f16 %rs6, %rs4, %rs5;
+; CHECK-NO-FMA-NEXT: st.param.b16 [func_retval0], %rs6;
+; CHECK-NO-FMA-NEXT: ret;
+;
+; CHECK-SM70-LABEL: fma_f16_expanded(
+; CHECK-SM70: {
+; CHECK-SM70-NEXT: .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT: .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT: // %bb.0:
+; CHECK-SM70-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
+; CHECK-SM70-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
+; CHECK-SM70-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
+; CHECK-SM70-NEXT: fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT: cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT: max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT: cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT: ret;
+;
+; CHECK-PTX60-LABEL: fma_f16_expanded(
+; CHECK-PTX60: {
+; CHECK-PTX60-NEXT: .reg .b16 %rs<5>;
+; CHECK-PTX60-EMPTY:
+; CHECK-PTX60-NEXT: // %bb.0:
+; CHECK-PTX60-NEXT: ld.param.b16 %rs1, [fma_f16_expanded_param_0];
+; CHECK-PTX60-NEXT: ld.param.b16 %rs2, [fma_f16_expanded_param_1];
+; CHECK-PTX60-NEXT: ld.param.b16 %rs3, [fma_f16_expanded_param_2];
+; CHECK-PTX60-NEXT: fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-PTX60-NEXT: st.param.b16 [func_retval0], %rs4;
+; CHECK-PTX60-NEXT: ret;
+ %1 = fmul half %a, %b
+ %2 = fadd half %1, %c
+ %3 = fcmp ogt half %2, 0.0
+ %4 = select i1 %3, half %2, half 0.0
+ ret half %4
+}
+
+; llvm.fma.bf16 followed by "x > 0 ? x : 0" on bfloat.
+; Only prefixes enabled by a run line of this file are kept; the NO-ARCH and
+; NO-PTX groups were never enabled and have been dropped.
+define bfloat @fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
+; CHECK-LABEL: fma_bf16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_param_1];
+; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_param_2];
+; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT: ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16(
+; CHECK-FTZ: {
+; CHECK-FTZ-NEXT: .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT: // %bb.0:
+; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_param_0];
+; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_param_1];
+; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_param_2];
+; CHECK-FTZ-NEXT: fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT: ret;
+;
+; CHECK-NO-FMA-LABEL: fma_bf16(
+; CHECK-NO-FMA: {
+; CHECK-NO-FMA-NEXT: .reg .b16 %rs<7>;
+; CHECK-NO-FMA-EMPTY:
+; CHECK-NO-FMA-NEXT: // %bb.0:
+; CHECK-NO-FMA-NEXT: ld.param.b16 %rs1, [fma_bf16_param_0];
+; CHECK-NO-FMA-NEXT: ld.param.b16 %rs2, [fma_bf16_param_1];
+; CHECK-NO-FMA-NEXT: ld.param.b16 %rs3, [fma_bf16_param_2];
+; CHECK-NO-FMA-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-NEXT: mov.b16 %rs5, 0x0000;
+; CHECK-NO-FMA-NEXT: max.bf16 %rs6, %rs4, %rs5;
+; CHECK-NO-FMA-NEXT: st.param.b16 [func_retval0], %rs6;
+; CHECK-NO-FMA-NEXT: ret;
+;
+; CHECK-SM70-LABEL: fma_bf16(
+; CHECK-SM70: {
+; CHECK-SM70-NEXT: .reg .pred %p<3>;
+; CHECK-SM70-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT: .reg .b32 %r<20>;
+; CHECK-SM70-NEXT: .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT: // %bb.0:
+; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_param_2];
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT: mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT: max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT: mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT: ret;
+;
+; CHECK-PTX60-LABEL: fma_bf16(
+; CHECK-PTX60: {
+; CHECK-PTX60-NEXT: .reg .b16 %rs<5>;
+; CHECK-PTX60-EMPTY:
+; CHECK-PTX60-NEXT: // %bb.0:
+; CHECK-PTX60-NEXT: ld.param.b16 %rs1, [fma_bf16_param_0];
+; CHECK-PTX60-NEXT: ld.param.b16 %rs2, [fma_bf16_param_1];
+; CHECK-PTX60-NEXT: ld.param.b16 %rs3, [fma_bf16_param_2];
+; CHECK-PTX60-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-PTX60-NEXT: st.param.b16 [func_retval0], %rs4;
+; CHECK-PTX60-NEXT: ret;
+ %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
+ %2 = fcmp ogt bfloat %1, 0.0
+ %3 = select i1 %2, bfloat %1, bfloat 0.0
+ ret bfloat %3
+}
+
+; Separate fmul + fadd on bfloat followed by "x > 0 ? x : 0".
+; Only prefixes enabled by a run line of this file are kept; the NO-ARCH and
+; NO-PTX groups were never enabled and have been dropped.
+define bfloat @fma_bf16_expanded(bfloat %a, bfloat %b, bfloat %c) {
+; CHECK-LABEL: fma_bf16_expanded(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
+; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
+; CHECK-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT: ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_expanded(
+; CHECK-FTZ: {
+; CHECK-FTZ-NEXT: .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT: // %bb.0:
+; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
+; CHECK-FTZ-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
+; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
+; CHECK-FTZ-NEXT: fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT: ret;
+;
+; CHECK-NO-FMA-LABEL: fma_bf16_expanded(
+; CHECK-NO-FMA: {
+; CHECK-NO-FMA-NEXT: .reg .b16 %rs<7>;
+; CHECK-NO-FMA-EMPTY:
+; CHECK-NO-FMA-NEXT: // %bb.0:
+; CHECK-NO-FMA-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
+; CHECK-NO-FMA-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
+; CHECK-NO-FMA-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
+; CHECK-NO-FMA-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-NEXT: mov.b16 %rs5, 0x0000;
+; CHECK-NO-FMA-NEXT: max.bf16 %rs6, %rs4, %rs5;
+; CHECK-NO-FMA-NEXT: st.param.b16 [func_retval0], %rs6;
+; CHECK-NO-FMA-NEXT: ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_expanded(
+; CHECK-SM70: {
+; CHECK-SM70-NEXT: .reg .pred %p<3>;
+; CHECK-SM70-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT: .reg .b32 %r<20>;
+; CHECK-SM70-NEXT: .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT: // %bb.0:
+; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_param_2];
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT: mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT: max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT: mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT: ret;
+;
+; CHECK-PTX60-LABEL: fma_bf16_expanded(
+; CHECK-PTX60: {
+; CHECK-PTX60-NEXT: .reg .b16 %rs<5>;
+; CHECK-PTX60-EMPTY:
+; CHECK-PTX60-NEXT: // %bb.0:
+; CHECK-PTX60-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
+; CHECK-PTX60-NEXT: ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
+; CHECK-PTX60-NEXT: ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
+; CHECK-PTX60-NEXT: fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-PTX60-NEXT: st.param.b16 [func_retval0], %rs4;
+; CHECK-PTX60-NEXT: ret;
+ %1 = fmul bfloat %a, %b
+ %2 = fadd bfloat %1, %c
+ %3 = fcmp ogt bfloat %2, 0.0
+ %4 = select i1 %3, bfloat %2, bfloat 0.0
+ ret bfloat %4
+}
More information about the llvm-commits
mailing list