[llvm] [NVPTX] Add patterns for fma.relu.{f16|bf16} (PR #114977)

Hugh Delaney via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 6 03:00:50 PST 2024


https://github.com/hdelan updated https://github.com/llvm/llvm-project/pull/114977

>From 4759e15870beee08b63fd978147cda9327716c56 Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney at codeplay.com>
Date: Tue, 5 Nov 2024 12:25:41 +0000
Subject: [PATCH] Add patterns for fma.relu.{f16|bf16}

Add patterns to lower fma(a, b, c) > 0 ? fma(a, b, c) : 0 for f16 and
bf16 types.
---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td |  37 ++
 llvm/test/CodeGen/NVPTX/fma-relu.ll     | 499 ++++++++++++++++++++++++
 2 files changed, 536 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/fma-relu.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 5f6cba397c5352..7da0f6d875ce4c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3917,3 +3917,40 @@ def atomic_thread_fence_seq_cst_cta :
 def atomic_thread_fence_acq_rel_cta :
   NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
   Requires<[hasPTX<60>, hasSM<70>]>;
+
+// Floating-point immediate of any FP type that is exactly +0.0 (-0.0 is rejected).
+def fpimm0 :
+  FPImmLeaf<fAny, [{ return Imm.isPosZero(); }]>;
+
+// fma.rn{.ftz}.relu: fused multiply-add with the result clamped at zero from below.
+// PTX spells the modifiers as fma.rnd{.ftz}.relu.f16 and fma.rnd{.relu}.bf16, i.e.
+// .ftz comes before .relu and bf16 has no .ftz form (hence no FMARELU_BF16_FTZ).
+def FMARELU_F16 :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+    "fma.rn.relu.f16 \t$dst, $a, $b, $c;", []>,
+    Requires<[useFP16Math, hasPTX<70>, hasSM<80>]>;
+def FMARELU_F16_FTZ :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+    "fma.rn.ftz.relu.f16 \t$dst, $a, $b, $c;", []>,
+    Requires<[useFP16Math, hasPTX<70>, hasSM<80>]>;
+def FMARELU_BF16 :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+    "fma.rn.relu.bf16 \t$dst, $a, $b, $c;", []>,
+    Requires<[hasBF16Math, hasPTX<70>, hasSM<80>]>;
+
+// Lower fmaxnum(fma(a, b, c), +0.0) to a single fma.rn.relu. The SM/PTX guards are
+// repeated here so selection does not depend on the instruction defs' Requires<>.
+// NOTE(review): regenerate the CHECK-FTZ lines of fma-relu.ll for the new spellings.
+
+// FTZ variant (f16 only)
+def : Pat<(f16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+  (FMARELU_F16_FTZ Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+  Requires<[allowFMA, doF32FTZ, allowUnsafeFPMath, hasPTX<70>, hasSM<80>]>;
+// No FTZ
+def : Pat<(f16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+  (FMARELU_F16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+  Requires<[allowFMA, allowUnsafeFPMath, hasPTX<70>, hasSM<80>]>;
+// bf16 has a single form; this pattern applies whether or not FTZ is enabled.
+def : Pat<(bf16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+  (FMARELU_BF16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+  Requires<[allowFMA, allowUnsafeFPMath, hasPTX<70>, hasSM<80>]>;
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu.ll b/llvm/test/CodeGen/NVPTX/fma-relu.ll
new file mode 100644
index 00000000000000..bdff52415f69fe
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fma-relu.ll
@@ -0,0 +1,499 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | %ptxas-verify -arch=sm_80 %}
+
+; Using FTZ should emit fma.ftz.relu
+; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-FTZ
+; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | %ptxas-verify -arch=sm_80 %}
+
+; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=0 | FileCheck %s --check-prefixes=CHECK-NO-FMA
+
+; SM < 80 should not emit fma{.ftz}.relu. NOTE(review): the +ptx60 run below still emits fma.rn.relu (see CHECK-PTX60), presumably because -mcpu=sm_80 already implies PTX 7.0 -- confirm.
+; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_70 -mattr=+ptx70 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-SM70
+; RUN: llc < %s -march=nvptx64 --enable-unsafe-fp-math -mcpu=sm_80 -mattr=+ptx60 -verify-machineinstrs -fp-contract=fast -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-PTX60
+
+define half @fma_f16(half %a, half %b, half %c) {
+; CHECK-LABEL: fma_f16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-LABEL: fma_f16(
+; CHECK-NO-FMA:       {
+; CHECK-NO-FMA-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NO-FMA-EMPTY:
+; CHECK-NO-FMA-NEXT:  // %bb.0:
+; CHECK-NO-FMA-NEXT:    ld.param.b16 %rs1, [fma_f16_param_0];
+; CHECK-NO-FMA-NEXT:    ld.param.b16 %rs2, [fma_f16_param_1];
+; CHECK-NO-FMA-NEXT:    ld.param.b16 %rs3, [fma_f16_param_2];
+; CHECK-NO-FMA-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-NEXT:    mov.b16 %rs5, 0x0000;
+; CHECK-NO-FMA-NEXT:    max.f16 %rs6, %rs4, %rs5;
+; CHECK-NO-FMA-NEXT:    st.param.b16 [func_retval0], %rs6;
+; CHECK-NO-FMA-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+;
+; CHECK-PTX60-LABEL: fma_f16(
+; CHECK-PTX60:       {
+; CHECK-PTX60-NEXT:    .reg .b16 %rs<5>;
+; CHECK-PTX60-EMPTY:
+; CHECK-PTX60-NEXT:  // %bb.0:
+; CHECK-PTX60-NEXT:    ld.param.b16 %rs1, [fma_f16_param_0];
+; CHECK-PTX60-NEXT:    ld.param.b16 %rs2, [fma_f16_param_1];
+; CHECK-PTX60-NEXT:    ld.param.b16 %rs3, [fma_f16_param_2];
+; CHECK-PTX60-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-PTX60-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-PTX60-NEXT:    ret;
+; CHECK-NO-ARCH-LABEL: fma_f16(
+; CHECK-NO-ARCH:       {
+; CHECK-NO-ARCH-NEXT:    .reg .b16 %rs<6>;
+; CHECK-NO-ARCH-NEXT:    .reg .f32 %f<3>;
+; CHECK-NO-ARCH-EMPTY:
+; CHECK-NO-ARCH-NEXT:  // %bb.0:
+; CHECK-NO-ARCH-NEXT:    ld.param.b16 %rs1, [fma_f16_param_0];
+; CHECK-NO-ARCH-NEXT:    ld.param.b16 %rs2, [fma_f16_param_1];
+; CHECK-NO-ARCH-NEXT:    ld.param.b16 %rs3, [fma_f16_param_2];
+; CHECK-NO-ARCH-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-ARCH-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-NO-ARCH-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-NO-ARCH-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-NO-ARCH-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-NO-ARCH-NEXT:    ret;
+; CHECK-NO-PTX-LABEL: fma_f16(
+; CHECK-NO-PTX:       {
+; CHECK-NO-PTX-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-PTX-EMPTY:
+; CHECK-NO-PTX-NEXT:  // %bb.0:
+; CHECK-NO-PTX-NEXT:    ld.param.b16 %rs1, [fma_f16_param_0];
+; CHECK-NO-PTX-NEXT:    ld.param.b16 %rs2, [fma_f16_param_1];
+; CHECK-NO-PTX-NEXT:    ld.param.b16 %rs3, [fma_f16_param_2];
+; CHECK-NO-PTX-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-PTX-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-PTX-NEXT:    ret;
+  %1 = call half @llvm.fma.f16(half %a, half %b, half %c)
+  %2 = fcmp ogt half %1, 0.0
+  %3 = select i1 %2, half %1, half 0.0
+  ret half %3
+}
+
+define half @fma_f16_expanded(half %a, half %b, half %c) {
+; CHECK-LABEL: fma_f16_expanded(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_expanded(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-LABEL: fma_f16_expanded(
+; CHECK-NO-FMA:       {
+; CHECK-NO-FMA-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NO-FMA-EMPTY:
+; CHECK-NO-FMA-NEXT:  // %bb.0:
+; CHECK-NO-FMA-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_param_0];
+; CHECK-NO-FMA-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_param_1];
+; CHECK-NO-FMA-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_param_2];
+; CHECK-NO-FMA-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-NEXT:    mov.b16 %rs5, 0x0000;
+; CHECK-NO-FMA-NEXT:    max.f16 %rs6, %rs4, %rs5;
+; CHECK-NO-FMA-NEXT:    st.param.b16 [func_retval0], %rs6;
+; CHECK-NO-FMA-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_expanded(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+;
+; CHECK-PTX60-LABEL: fma_f16_expanded(
+; CHECK-PTX60:       {
+; CHECK-PTX60-NEXT:    .reg .b16 %rs<5>;
+; CHECK-PTX60-EMPTY:
+; CHECK-PTX60-NEXT:  // %bb.0:
+; CHECK-PTX60-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_param_0];
+; CHECK-PTX60-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_param_1];
+; CHECK-PTX60-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_param_2];
+; CHECK-PTX60-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-PTX60-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-PTX60-NEXT:    ret;
+; CHECK-NO-ARCH-LABEL: fma_f16_expanded(
+; CHECK-NO-ARCH:       {
+; CHECK-NO-ARCH-NEXT:    .reg .b16 %rs<6>;
+; CHECK-NO-ARCH-NEXT:    .reg .f32 %f<3>;
+; CHECK-NO-ARCH-EMPTY:
+; CHECK-NO-ARCH-NEXT:  // %bb.0:
+; CHECK-NO-ARCH-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_param_0];
+; CHECK-NO-ARCH-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_param_1];
+; CHECK-NO-ARCH-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_param_2];
+; CHECK-NO-ARCH-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-ARCH-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-NO-ARCH-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-NO-ARCH-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-NO-ARCH-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-NO-ARCH-NEXT:    ret;
+; CHECK-NO-PTX-LABEL: fma_f16_expanded(
+; CHECK-NO-PTX:       {
+; CHECK-NO-PTX-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-PTX-EMPTY:
+; CHECK-NO-PTX-NEXT:  // %bb.0:
+; CHECK-NO-PTX-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_param_0];
+; CHECK-NO-PTX-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_param_1];
+; CHECK-NO-PTX-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_param_2];
+; CHECK-NO-PTX-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-PTX-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-PTX-NEXT:    ret;
+  %1 = fmul half %a, %b
+  %2 = fadd half %1, %c
+  %3 = fcmp ogt half %2, 0.0
+  %4 = select i1 %3, half %2, half 0.0
+  ret half %4
+}
+
+define bfloat @fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
+; CHECK-LABEL: fma_bf16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-LABEL: fma_bf16(
+; CHECK-NO-FMA:       {
+; CHECK-NO-FMA-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NO-FMA-EMPTY:
+; CHECK-NO-FMA-NEXT:  // %bb.0:
+; CHECK-NO-FMA-NEXT:    ld.param.b16 %rs1, [fma_bf16_param_0];
+; CHECK-NO-FMA-NEXT:    ld.param.b16 %rs2, [fma_bf16_param_1];
+; CHECK-NO-FMA-NEXT:    ld.param.b16 %rs3, [fma_bf16_param_2];
+; CHECK-NO-FMA-NEXT:    fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-NEXT:    mov.b16 %rs5, 0x0000;
+; CHECK-NO-FMA-NEXT:    max.bf16 %rs6, %rs4, %rs5;
+; CHECK-NO-FMA-NEXT:    st.param.b16 [func_retval0], %rs6;
+; CHECK-NO-FMA-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+;
+; CHECK-PTX60-LABEL: fma_bf16(
+; CHECK-PTX60:       {
+; CHECK-PTX60-NEXT:    .reg .b16 %rs<5>;
+; CHECK-PTX60-EMPTY:
+; CHECK-PTX60-NEXT:  // %bb.0:
+; CHECK-PTX60-NEXT:    ld.param.b16 %rs1, [fma_bf16_param_0];
+; CHECK-PTX60-NEXT:    ld.param.b16 %rs2, [fma_bf16_param_1];
+; CHECK-PTX60-NEXT:    ld.param.b16 %rs3, [fma_bf16_param_2];
+; CHECK-PTX60-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-PTX60-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-PTX60-NEXT:    ret;
+; CHECK-NO-ARCH-LABEL: fma_bf16(
+; CHECK-NO-ARCH:       {
+; CHECK-NO-ARCH-NEXT:    .reg .pred %p<3>;
+; CHECK-NO-ARCH-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NO-ARCH-NEXT:    .reg .b32 %r<20>;
+; CHECK-NO-ARCH-NEXT:    .reg .f32 %f<7>;
+; CHECK-NO-ARCH-EMPTY:
+; CHECK-NO-ARCH-NEXT:  // %bb.0:
+; CHECK-NO-ARCH-NEXT:    ld.param.u16 %r1, [fma_bf16_param_2];
+; CHECK-NO-ARCH-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NO-ARCH-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NO-ARCH-NEXT:    ld.param.u16 %r3, [fma_bf16_param_1];
+; CHECK-NO-ARCH-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-NO-ARCH-NEXT:    mov.b32 %f2, %r4;
+; CHECK-NO-ARCH-NEXT:    ld.param.u16 %r5, [fma_bf16_param_0];
+; CHECK-NO-ARCH-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-NO-ARCH-NEXT:    mov.b32 %f3, %r6;
+; CHECK-NO-ARCH-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-NO-ARCH-NEXT:    mov.b32 %r7, %f4;
+; CHECK-NO-ARCH-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-NO-ARCH-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-NO-ARCH-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-NO-ARCH-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-NO-ARCH-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-NO-ARCH-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-NO-ARCH-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-NO-ARCH-NEXT:    mov.b32 %f5, %r13;
+; CHECK-NO-ARCH-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-NO-ARCH-NEXT:    mov.b32 %r14, %f6;
+; CHECK-NO-ARCH-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-NO-ARCH-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-NO-ARCH-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-NO-ARCH-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-NO-ARCH-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-NO-ARCH-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-NO-ARCH-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-NO-ARCH-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-NO-ARCH-NEXT:    ret;
+; CHECK-NO-PTX-LABEL: fma_bf16(
+; CHECK-NO-PTX:       {
+; CHECK-NO-PTX-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-PTX-EMPTY:
+; CHECK-NO-PTX-NEXT:  // %bb.0:
+; CHECK-NO-PTX-NEXT:    ld.param.b16 %rs1, [fma_bf16_param_0];
+; CHECK-NO-PTX-NEXT:    ld.param.b16 %rs2, [fma_bf16_param_1];
+; CHECK-NO-PTX-NEXT:    ld.param.b16 %rs3, [fma_bf16_param_2];
+; CHECK-NO-PTX-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-PTX-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-PTX-NEXT:    ret;
+  %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
+  %2 = fcmp ogt bfloat %1, 0.0
+  %3 = select i1 %2, bfloat %1, bfloat 0.0
+  ret bfloat %3
+}
+
+define bfloat @fma_bf16_expanded(bfloat %a, bfloat %b, bfloat %c) {
+; CHECK-LABEL: fma_bf16_expanded(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_expanded(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-LABEL: fma_bf16_expanded(
+; CHECK-NO-FMA:       {
+; CHECK-NO-FMA-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NO-FMA-EMPTY:
+; CHECK-NO-FMA-NEXT:  // %bb.0:
+; CHECK-NO-FMA-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
+; CHECK-NO-FMA-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
+; CHECK-NO-FMA-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
+; CHECK-NO-FMA-NEXT:    fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-NEXT:    mov.b16 %rs5, 0x0000;
+; CHECK-NO-FMA-NEXT:    max.bf16 %rs6, %rs4, %rs5;
+; CHECK-NO-FMA-NEXT:    st.param.b16 [func_retval0], %rs6;
+; CHECK-NO-FMA-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_expanded(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+;
+; CHECK-PTX60-LABEL: fma_bf16_expanded(
+; CHECK-PTX60:       {
+; CHECK-PTX60-NEXT:    .reg .b16 %rs<5>;
+; CHECK-PTX60-EMPTY:
+; CHECK-PTX60-NEXT:  // %bb.0:
+; CHECK-PTX60-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
+; CHECK-PTX60-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
+; CHECK-PTX60-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
+; CHECK-PTX60-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-PTX60-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-PTX60-NEXT:    ret;
+; CHECK-NO-ARCH-LABEL: fma_bf16_expanded(
+; CHECK-NO-ARCH:       {
+; CHECK-NO-ARCH-NEXT:    .reg .pred %p<3>;
+; CHECK-NO-ARCH-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NO-ARCH-NEXT:    .reg .b32 %r<20>;
+; CHECK-NO-ARCH-NEXT:    .reg .f32 %f<7>;
+; CHECK-NO-ARCH-EMPTY:
+; CHECK-NO-ARCH-NEXT:  // %bb.0:
+; CHECK-NO-ARCH-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_param_2];
+; CHECK-NO-ARCH-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NO-ARCH-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NO-ARCH-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_param_1];
+; CHECK-NO-ARCH-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-NO-ARCH-NEXT:    mov.b32 %f2, %r4;
+; CHECK-NO-ARCH-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_param_0];
+; CHECK-NO-ARCH-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-NO-ARCH-NEXT:    mov.b32 %f3, %r6;
+; CHECK-NO-ARCH-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-NO-ARCH-NEXT:    mov.b32 %r7, %f4;
+; CHECK-NO-ARCH-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-NO-ARCH-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-NO-ARCH-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-NO-ARCH-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-NO-ARCH-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-NO-ARCH-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-NO-ARCH-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-NO-ARCH-NEXT:    mov.b32 %f5, %r13;
+; CHECK-NO-ARCH-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-NO-ARCH-NEXT:    mov.b32 %r14, %f6;
+; CHECK-NO-ARCH-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-NO-ARCH-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-NO-ARCH-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-NO-ARCH-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-NO-ARCH-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-NO-ARCH-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-NO-ARCH-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-NO-ARCH-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-NO-ARCH-NEXT:    ret;
+; CHECK-NO-PTX-LABEL: fma_bf16_expanded(
+; CHECK-NO-PTX:       {
+; CHECK-NO-PTX-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-PTX-EMPTY:
+; CHECK-NO-PTX-NEXT:  // %bb.0:
+; CHECK-NO-PTX-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_param_0];
+; CHECK-NO-PTX-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_param_1];
+; CHECK-NO-PTX-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_param_2];
+; CHECK-NO-PTX-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-PTX-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-PTX-NEXT:    ret;
+  %1 = fmul bfloat %a, %b
+  %2 = fadd bfloat %1, %c
+  %3 = fcmp ogt bfloat %2, 0.0
+  %4 = select i1 %3, bfloat %2, bfloat 0.0
+  ret bfloat %4
+}



More information about the llvm-commits mailing list