[llvm] [NVPTX] Add patterns for fma.relu.{f16|bf16} (PR #114977)

Hugh Delaney via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 6 04:55:41 PST 2024


https://github.com/hdelan updated https://github.com/llvm/llvm-project/pull/114977

>From f5eea93c32e099c60c275b5cf4139b4e07137ef3 Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney at codeplay.com>
Date: Tue, 5 Nov 2024 12:25:41 +0000
Subject: [PATCH] Add patterns for fma.relu.{f16|bf16}

Add patterns to lower fma(a, b, c) > 0 ? fma(a, b, c) : 0 for f16 and
bf16 types.
---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td |  37 +
 llvm/test/CodeGen/NVPTX/fma-relu.ll     | 920 ++++++++++++++++++++++++
 2 files changed, 957 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/fma-relu.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 5f6cba397c5352..39ab54841e8294 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3917,3 +3917,40 @@ def atomic_thread_fence_seq_cst_cta :
 def atomic_thread_fence_acq_rel_cta :
   NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
   Requires<[hasPTX<60>, hasSM<70>]>;
+
+// Matches a floating-point immediate that is exactly +0.0 (-0.0 is rejected).
+def fpimm0
+  : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(+0.0); }]>;
+
+// fma.relu: fused multiply-add whose negative results are clamped to 0.
+// PTX spells the f16 form fma.rnd{.ftz}.relu.f16 (.ftz comes before .relu) and
+// the bf16 form fma.rnd{.relu}.bf16; bf16 has no .ftz variant.
+def FMARELU_F16 :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+    "fma.rn.relu.f16 \t$dst, $a, $b, $c;", []>,
+    Requires<[useFP16Math, hasPTX<70>, hasSM<80>]>;
+def FMARELU_F16_FTZ :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+    "fma.rn.ftz.relu.f16 \t$dst, $a, $b, $c;", []>,
+    Requires<[useFP16Math, hasPTX<70>, hasSM<80>]>;
+def FMARELU_BF16 :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+    "fma.rn.relu.bf16 \t$dst, $a, $b, $c;", []>,
+    Requires<[hasBF16Math, hasPTX<70>, hasSM<80>]>;
+
+// Select fmaxnum(fma(a, b, c), +0.0) as a single fma.relu; every pattern is
+// gated on allowUnsafeFPMath. A Requires<> attached to a Pat is not merged
+// with the selected instruction's predicates, so the subtarget requirements
+// are repeated here.
+// f16 with FTZ: kept ahead of the plain f16 pattern, which matches the same DAG.
+def : Pat<(f16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+  (FMARELU_F16_FTZ Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+  Requires<[useFP16Math, hasPTX<70>, hasSM<80>, doF32FTZ, allowUnsafeFPMath]>;
+// f16 without FTZ.
+def : Pat<(f16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+  (FMARELU_F16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+  Requires<[useFP16Math, hasPTX<70>, hasSM<80>, allowUnsafeFPMath]>;
+// bf16: one pattern covers both FTZ modes.
+def : Pat<(bf16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+  (FMARELU_BF16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+  Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, allowUnsafeFPMath]>;
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu.ll b/llvm/test/CodeGen/NVPTX/fma-relu.ll
new file mode 100644
index 00000000000000..3d95a4df2d3308
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fma-relu.ll
@@ -0,0 +1,920 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+
+; Using FTZ should emit fma.ftz.relu
+; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK-FTZ
+; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+
+; Vary the FMA contraction level (-nvptx-fma-level=0 disables contraction)
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -nvptx-fma-level=0 | FileCheck %s --check-prefixes=CHECK-NO-FMA-CONTRACTION
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH
+
+; SM < 80 (or PTX version < 70) should not emit fma{.ftz}.relu
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70
+
+define half @fma_f16_unsafe(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call half @llvm.fma.f16(half %a, half %b, half %c)
+  %2 = fcmp ogt half %1, 0.0
+  %3 = select i1 %2, half %1, half 0.0
+  ret half %3
+}
+
+define half @fma_f16_maxnum_unsafe(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_maxnum_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_maxnum_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_maxnum_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_maxnum_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_maxnum_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call half @llvm.fma.f16(half %a, half %b, half %c)
+  %2 = call half @llvm.maxnum.f16(half %1, half 0.0)
+  ret half %2
+}
+
+define half @fma_f16_expanded_unsafe(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_expanded_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_expanded_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_expanded_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_expanded_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_expanded_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul half %a, %b
+  %2 = fadd half %1, %c
+  %3 = fcmp ogt half %2, 0.0
+  %4 = select i1 %3, half %2, half 0.0
+  ret half %4
+}
+
+define half @fma_f16_expanded_safe(half %a, half %b, half %c) {
+; CHECK-LABEL: fma_f16_expanded_safe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
+; CHECK-NEXT:    mul.rn.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
+; CHECK-NEXT:    add.rn.f16 %rs5, %rs3, %rs4;
+; CHECK-NEXT:    mov.b16 %rs6, 0x0000;
+; CHECK-NEXT:    max.f16 %rs7, %rs5, %rs6;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_expanded_safe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<8>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
+; CHECK-FTZ-NEXT:    mul.rn.ftz.f16 %rs3, %rs1, %rs2;
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
+; CHECK-FTZ-NEXT:    add.rn.ftz.f16 %rs5, %rs3, %rs4;
+; CHECK-FTZ-NEXT:    mov.b16 %rs6, 0x0000;
+; CHECK-FTZ-NEXT:    max.ftz.f16 %rs7, %rs5, %rs6;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_expanded_safe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<8>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mul.rn.f16 %rs3, %rs1, %rs2;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    add.rn.f16 %rs5, %rs3, %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b16 %rs6, 0x0000;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    max.f16 %rs7, %rs5, %rs6;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_expanded_safe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<8>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mul.f16 %rs3, %rs1, %rs2;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    add.f16 %rs5, %rs3, %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b16 %rs6, 0x0000;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    max.f16 %rs7, %rs5, %rs6;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_expanded_safe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
+; CHECK-SM70-NEXT:    mul.rn.f16 %rs3, %rs1, %rs2;
+; CHECK-SM70-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
+; CHECK-SM70-NEXT:    add.rn.f16 %rs5, %rs3, %rs4;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs5;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs6, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs6;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul half %a, %b
+  %2 = fadd half %1, %c
+  %3 = fcmp ogt half %2, 0.0
+  %4 = select i1 %3, half %2, half 0.0
+  ret half %4
+}
+
+define half @fma_f16_expanded_maxnum_unsafe(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_expanded_maxnum_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_expanded_maxnum_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_expanded_maxnum_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_expanded_maxnum_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_expanded_maxnum_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul half %a, %b
+  %2 = fadd half %1, %c
+  %3 = call half @llvm.maxnum.f16(half %2, half 0.0)
+  ret half %3
+}
+
+define bfloat @fma_bf16_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_unsafe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_unsafe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_unsafe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
+  %2 = fcmp ogt bfloat %1, 0.0
+  %3 = select i1 %2, bfloat %1, bfloat 0.0
+  ret bfloat %3
+}
+
+define bfloat @fma_bf16_maxnum_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_maxnum_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_maxnum_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_maxnum_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_maxnum_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_maxnum_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_maxnum_unsafe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_maxnum_unsafe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_maxnum_unsafe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
+  %2 = call bfloat @llvm.maxnum.bf16(bfloat %1, bfloat 0.0)
+  ret bfloat %2
+}
+
+define bfloat @fma_bf16_expanded_safe(bfloat %a, bfloat %b, bfloat %c) {
+; CHECK-LABEL: fma_bf16_expanded_safe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<6>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .f32 %f<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
+; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
+; CHECK-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-NEXT:    mov.b32 %f2, %r4;
+; CHECK-NEXT:    mul.rn.f32 %f3, %f2, %f1;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs1;
+; CHECK-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-NEXT:    mov.b32 %f4, %r6;
+; CHECK-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
+; CHECK-NEXT:    shl.b32 %r8, %r7, 16;
+; CHECK-NEXT:    mov.b32 %f5, %r8;
+; CHECK-NEXT:    add.rn.f32 %f6, %f4, %f5;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
+; CHECK-NEXT:    mov.b16 %rs4, 0x0000;
+; CHECK-NEXT:    max.bf16 %rs5, %rs3, %rs4;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_expanded_safe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<6>;
+; CHECK-FTZ-NEXT:    .reg .b32 %r<9>;
+; CHECK-FTZ-NEXT:    .reg .f32 %f<7>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
+; CHECK-FTZ-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f1, %r2;
+; CHECK-FTZ-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
+; CHECK-FTZ-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f2, %r4;
+; CHECK-FTZ-NEXT:    mul.rn.ftz.f32 %f3, %f2, %f1;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs1;
+; CHECK-FTZ-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f4, %r6;
+; CHECK-FTZ-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
+; CHECK-FTZ-NEXT:    shl.b32 %r8, %r7, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f5, %r8;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f6, %f4, %f5;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
+; CHECK-FTZ-NEXT:    mov.b16 %rs4, 0x0000;
+; CHECK-FTZ-NEXT:    max.bf16 %rs5, %rs3, %rs4;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_expanded_safe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<6>;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b32 %r<9>;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .f32 %f<7>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b32 %f2, %r4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mul.rn.f32 %f3, %f2, %f1;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    cvt.u32.u16 %r5, %rs1;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b32 %f4, %r6;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    shl.b32 %r8, %r7, 16;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b32 %f5, %r8;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    add.rn.f32 %f6, %f4, %f5;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b16 %rs4, 0x0000;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    max.bf16 %rs5, %rs3, %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_expanded_safe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<6>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b32 %r<9>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .f32 %f<7>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b32 %f1, %r2;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b32 %f2, %r4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mul.f32 %f3, %f2, %f1;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    cvt.u32.u16 %r5, %rs1;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b32 %f4, %r6;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    shl.b32 %r8, %r7, 16;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b32 %f5, %r8;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    add.f32 %f6, %f4, %f5;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b16 %rs4, 0x0000;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    max.bf16 %rs5, %rs3, %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_expanded_safe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<4>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<27>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<9>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    mul.rn.f32 %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r5, %f3;
+; CHECK-SM70-NEXT:    bfe.u32 %r6, %r5, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r7, %r6, %r5;
+; CHECK-SM70-NEXT:    add.s32 %r8, %r7, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
+; CHECK-SM70-NEXT:    or.b32 %r9, %r5, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r11, %r10, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f4, %r11;
+; CHECK-SM70-NEXT:    ld.param.u16 %r12, [fma_bf16_expanded_safe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r13, %r12, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    add.rn.f32 %f6, %f4, %f5;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    and.b32 %r20, %r19, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f7, %r20;
+; CHECK-SM70-NEXT:    max.f32 %f8, %f7, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r21, %f8;
+; CHECK-SM70-NEXT:    bfe.u32 %r22, %r21, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r23, %r22, %r21;
+; CHECK-SM70-NEXT:    add.s32 %r24, %r23, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f8, %f8;
+; CHECK-SM70-NEXT:    or.b32 %r25, %r21, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r26, %r25, %r24, %p3;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r26; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul bfloat %a, %b ; multiply and add are separate instructions here, not an llvm.fma call
+  %2 = fadd bfloat %1, %c ; no fast-math flags, and this function does not carry the #0 (unsafe-fp-math) attribute
+  %3 = fcmp ogt bfloat %2, 0.0 ; relu written as compare-with-zero followed by a select
+  %4 = select i1 %3, bfloat %2, bfloat 0.0
+  ret bfloat %4 ; every check prefix above expects separate mul/add/max: no fma.rn.relu is expected here
+}
+
+define bfloat @fma_bf16_expanded_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_expanded_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_expanded_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_expanded_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_expanded_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_expanded_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_unsafe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_unsafe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_unsafe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul bfloat %a, %b ; same fmul + fadd + fcmp/select body as fma_bf16_expanded_safe; only the #0 attribute on the define differs
+  %2 = fadd bfloat %1, %c
+  %3 = fcmp ogt bfloat %2, 0.0 ; relu written as compare-with-zero followed by a select
+  %4 = select i1 %3, bfloat %2, bfloat 0.0
+  ret bfloat %4 ; expected: a single fma.rn.relu[.ftz].bf16 for every prefix except CHECK-SM70, which expects fma.rn.f32 + max.f32
+}
+
+define bfloat @fma_bf16_expanded_maxnum_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_expanded_maxnum_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_expanded_maxnum_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_expanded_maxnum_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_expanded_maxnum_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_expanded_maxnum_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_maxnum_unsafe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_maxnum_unsafe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_maxnum_unsafe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul bfloat %a, %b ; separate fmul + fadd (no llvm.fma call) in a function carrying the #0 (unsafe-fp-math) attribute
+  %2 = fadd bfloat %1, %c
+  %3 = call bfloat @llvm.maxnum.bf16(bfloat %2, bfloat 0.0) ; relu written as maxnum against 0.0 instead of fcmp + select
+  ret bfloat %3 ; expected: a single fma.rn.relu[.ftz].bf16 for every prefix except CHECK-SM70, which expects fma.rn.f32 + max.f32
+}
+
+attributes #0 = { "unsafe-fp-math"="true" } ; attribute group referenced by the test functions declared with #0 (e.g. fma_bf16_expanded_unsafe)



More information about the llvm-commits mailing list