[llvm] [NVPTX] Add patterns for fma.relu.{f16|bf16} (PR #114977)

Hugh Delaney via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 7 02:26:50 PST 2024


https://github.com/hdelan updated https://github.com/llvm/llvm-project/pull/114977

>From f5eea93c32e099c60c275b5cf4139b4e07137ef3 Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney at codeplay.com>
Date: Tue, 5 Nov 2024 12:25:41 +0000
Subject: [PATCH 1/2] Add patterns for fma.relu.{f16|bf16}

Add patterns to lower `fma(a, b, c) > 0 ? fma(a, b, c) : 0` to a single
fma.rn.relu instruction for the f16 and bf16 types.
---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td |  37 +
 llvm/test/CodeGen/NVPTX/fma-relu.ll     | 920 ++++++++++++++++++++++++
 2 files changed, 957 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/fma-relu.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 5f6cba397c5352..39ab54841e8294 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3917,3 +3917,40 @@ def atomic_thread_fence_seq_cst_cta :
 def atomic_thread_fence_acq_rel_cta :
   NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
   Requires<[hasPTX<60>, hasSM<70>]>;
+// FP immediate leaf (any FP type) that matches only when Imm.isExactlyValue(+0.0) holds.
+def fpimm0 : FPImmLeaf<fAny, [{
+  return Imm.isExactlyValue(+0.0);
+}]>;
+// fma.rn.relu for f16/bf16, with and without .ftz: 16-bit register operands, no selection pattern attached ([]); the Pat<> entries below do the matching.
+def FMARELU_F16 :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+    "fma.rn.relu.f16 \t$dst, $a, $b, $c;", []>,
+    Requires<[useFP16Math, hasPTX<70>, hasSM<80>]>;
+def FMARELU_BF16 :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+    "fma.rn.relu.bf16 \t$dst, $a, $b, $c;", []>,
+    Requires<[hasBF16Math, hasPTX<70>, hasSM<80>]>;
+def FMARELU_F16_FTZ :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+    "fma.rn.relu.ftz.f16 \t$dst, $a, $b, $c;", []>, // NOTE(review): the PTX ISA spells this fma.rnd{.ftz}.relu.f16 (.ftz before .relu) -- confirm ptxas accepts this modifier order
+    Requires<[useFP16Math, hasPTX<70>, hasSM<80>]>;
+def FMARELU_BF16_FTZ :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+    "fma.rn.relu.ftz.bf16 \t$dst, $a, $b, $c;", []>, // NOTE(review): the PTX ISA lists only fma.rnd{.relu}.bf16 (no .ftz form) -- confirm this instruction is valid PTX
+    Requires<[hasBF16Math, hasPTX<70>, hasSM<80>]>;
+// Select fmaxnum(fma(a, b, c), fpimm0) as a single fma.relu instruction; every pattern is gated on allowUnsafeFPMath.
+// NOTE(review): these Requires lists omit the useFP16Math/hasBF16Math, hasPTX<70> and hasSM<80> predicates the instructions carry -- confirm they are not needed on the patterns.
+// FTZ variants: additionally gated on doF32FTZ, select the *_FTZ instructions
+def : Pat<(f16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+  (FMARELU_F16_FTZ Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+  Requires<[doF32FTZ, allowUnsafeFPMath]>;
+def : Pat<(bf16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+  (FMARELU_BF16_FTZ Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+  Requires<[doF32FTZ, allowUnsafeFPMath]>;
+// No FTZ: same DAG shapes without the doF32FTZ predicate, select the non-FTZ instructions
+def : Pat<(f16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+  (FMARELU_F16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+  Requires<[allowUnsafeFPMath]>;
+def : Pat<(bf16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+  (FMARELU_BF16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+  Requires<[allowUnsafeFPMath]>;
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu.ll b/llvm/test/CodeGen/NVPTX/fma-relu.ll
new file mode 100644
index 00000000000000..3d95a4df2d3308
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fma-relu.ll
@@ -0,0 +1,920 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+
+; With -denormal-fp-math-f32=preserve-sign (FTZ), the .ftz form of fma.relu should be emitted
+; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK-FTZ
+; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+
+; Vary -nvptx-fma-level: 0 disables FMA contraction, 2 contracts aggressively
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -nvptx-fma-level=0 | FileCheck %s --check-prefixes=CHECK-NO-FMA-CONTRACTION
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH
+
+; SM < 80 (fma.relu needs SM >= 80, which in turn needs PTX version >= 70) should not emit fma{.ftz}.relu
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70
+
+define half @fma_f16_unsafe(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call half @llvm.fma.f16(half %a, half %b, half %c) ; explicit half-precision FMA intrinsic
+  %2 = fcmp ogt half %1, 0.0 ; ReLU spelled as an ordered greater-than compare against 0.0 ...
+  %3 = select i1 %2, half %1, half 0.0 ; ... followed by a select between the FMA result and 0.0
+  ret half %3
+}
+
+define half @fma_f16_maxnum_unsafe(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_maxnum_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_maxnum_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_maxnum_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_maxnum_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_maxnum_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call half @llvm.fma.f16(half %a, half %b, half %c) ; explicit half-precision FMA intrinsic
+  %2 = call half @llvm.maxnum.f16(half %1, half 0.0) ; ReLU spelled directly as maxnum(fma, 0.0)
+  ret half %2
+}
+
+define half @fma_f16_expanded_unsafe(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_expanded_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_expanded_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_expanded_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_expanded_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_expanded_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul half %a, %b ; FMA written as a separate multiply ...
+  %2 = fadd half %1, %c ; ... and add; the checks above expect the pair fused into one fma
+  %3 = fcmp ogt half %2, 0.0 ; ReLU spelled as compare-against-zero plus select
+  %4 = select i1 %3, half %2, half 0.0
+  ret half %4
+}
+
+define half @fma_f16_expanded_safe(half %a, half %b, half %c) {
+; CHECK-LABEL: fma_f16_expanded_safe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
+; CHECK-NEXT:    mul.rn.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
+; CHECK-NEXT:    add.rn.f16 %rs5, %rs3, %rs4;
+; CHECK-NEXT:    mov.b16 %rs6, 0x0000;
+; CHECK-NEXT:    max.f16 %rs7, %rs5, %rs6;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_expanded_safe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<8>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
+; CHECK-FTZ-NEXT:    mul.rn.ftz.f16 %rs3, %rs1, %rs2;
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
+; CHECK-FTZ-NEXT:    add.rn.ftz.f16 %rs5, %rs3, %rs4;
+; CHECK-FTZ-NEXT:    mov.b16 %rs6, 0x0000;
+; CHECK-FTZ-NEXT:    max.ftz.f16 %rs7, %rs5, %rs6;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_expanded_safe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<8>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mul.rn.f16 %rs3, %rs1, %rs2;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    add.rn.f16 %rs5, %rs3, %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b16 %rs6, 0x0000;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    max.f16 %rs7, %rs5, %rs6;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_expanded_safe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<8>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mul.f16 %rs3, %rs1, %rs2;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    add.f16 %rs5, %rs3, %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b16 %rs6, 0x0000;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    max.f16 %rs7, %rs5, %rs6;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_expanded_safe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
+; CHECK-SM70-NEXT:    mul.rn.f16 %rs3, %rs1, %rs2;
+; CHECK-SM70-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
+; CHECK-SM70-NEXT:    add.rn.f16 %rs5, %rs3, %rs4;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs5;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs6, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs6;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul half %a, %b ; same mul/add/compare/select body as fma_f16_expanded_unsafe, but this function carries no #0 attribute group
+  %2 = fadd half %1, %c ; the checks above expect separate mul and add here, with no fma.relu
+  %3 = fcmp ogt half %2, 0.0
+  %4 = select i1 %3, half %2, half 0.0 ; the checks above expect this compare+select to become a plain max
+  ret half %4
+}
+
+define half @fma_f16_expanded_maxnum_unsafe(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_expanded_maxnum_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_expanded_maxnum_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_expanded_maxnum_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_expanded_maxnum_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_expanded_maxnum_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul half %a, %b ; FMA written as a separate multiply ...
+  %2 = fadd half %1, %c ; ... and add; the checks above expect the pair fused into one fma
+  %3 = call half @llvm.maxnum.f16(half %2, half 0.0) ; ReLU spelled directly as maxnum(x, 0.0)
+  ret half %3
+}
+
+define bfloat @fma_bf16_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_unsafe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_unsafe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_unsafe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) ; explicit bfloat FMA intrinsic
+  %2 = fcmp ogt bfloat %1, 0.0 ; ReLU spelled as an ordered greater-than compare against 0.0 ...
+  %3 = select i1 %2, bfloat %1, bfloat 0.0 ; ... followed by a select between the FMA result and 0.0
+  ret bfloat %3
+}
+
+define bfloat @fma_bf16_maxnum_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_maxnum_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_maxnum_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_maxnum_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_maxnum_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_maxnum_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_maxnum_unsafe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_maxnum_unsafe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_maxnum_unsafe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) ; explicit bfloat FMA intrinsic
+  %2 = call bfloat @llvm.maxnum.bf16(bfloat %1, bfloat 0.0) ; ReLU spelled directly as maxnum(fma, 0.0)
+  ret bfloat %2
+}
+
+define bfloat @fma_bf16_expanded_safe(bfloat %a, bfloat %b, bfloat %c) {
+; CHECK-LABEL: fma_bf16_expanded_safe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<6>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .f32 %f<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
+; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
+; CHECK-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-NEXT:    mov.b32 %f2, %r4;
+; CHECK-NEXT:    mul.rn.f32 %f3, %f2, %f1;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs1;
+; CHECK-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-NEXT:    mov.b32 %f4, %r6;
+; CHECK-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
+; CHECK-NEXT:    shl.b32 %r8, %r7, 16;
+; CHECK-NEXT:    mov.b32 %f5, %r8;
+; CHECK-NEXT:    add.rn.f32 %f6, %f4, %f5;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
+; CHECK-NEXT:    mov.b16 %rs4, 0x0000;
+; CHECK-NEXT:    max.bf16 %rs5, %rs3, %rs4;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_expanded_safe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<6>;
+; CHECK-FTZ-NEXT:    .reg .b32 %r<9>;
+; CHECK-FTZ-NEXT:    .reg .f32 %f<7>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
+; CHECK-FTZ-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f1, %r2;
+; CHECK-FTZ-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
+; CHECK-FTZ-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f2, %r4;
+; CHECK-FTZ-NEXT:    mul.rn.ftz.f32 %f3, %f2, %f1;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs1;
+; CHECK-FTZ-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f4, %r6;
+; CHECK-FTZ-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
+; CHECK-FTZ-NEXT:    shl.b32 %r8, %r7, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f5, %r8;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f6, %f4, %f5;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
+; CHECK-FTZ-NEXT:    mov.b16 %rs4, 0x0000;
+; CHECK-FTZ-NEXT:    max.bf16 %rs5, %rs3, %rs4;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_expanded_safe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<6>;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b32 %r<9>;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .f32 %f<7>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b32 %f2, %r4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mul.rn.f32 %f3, %f2, %f1;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    cvt.u32.u16 %r5, %rs1;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b32 %f4, %r6;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    shl.b32 %r8, %r7, 16;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b32 %f5, %r8;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    add.rn.f32 %f6, %f4, %f5;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b16 %rs4, 0x0000;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    max.bf16 %rs5, %rs3, %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_expanded_safe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<6>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b32 %r<9>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .f32 %f<7>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b32 %f1, %r2;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b32 %f2, %r4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mul.f32 %f3, %f2, %f1;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    cvt.u32.u16 %r5, %rs1;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b32 %f4, %r6;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    shl.b32 %r8, %r7, 16;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b32 %f5, %r8;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    add.f32 %f6, %f4, %f5;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b16 %rs4, 0x0000;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    max.bf16 %rs5, %rs3, %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_expanded_safe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<4>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<27>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<9>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    mul.rn.f32 %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r5, %f3;
+; CHECK-SM70-NEXT:    bfe.u32 %r6, %r5, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r7, %r6, %r5;
+; CHECK-SM70-NEXT:    add.s32 %r8, %r7, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
+; CHECK-SM70-NEXT:    or.b32 %r9, %r5, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r11, %r10, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f4, %r11;
+; CHECK-SM70-NEXT:    ld.param.u16 %r12, [fma_bf16_expanded_safe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r13, %r12, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    add.rn.f32 %f6, %f4, %f5;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    and.b32 %r20, %r19, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f7, %r20;
+; CHECK-SM70-NEXT:    max.f32 %f8, %f7, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r21, %f8;
+; CHECK-SM70-NEXT:    bfe.u32 %r22, %r21, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r23, %r22, %r21;
+; CHECK-SM70-NEXT:    add.s32 %r24, %r23, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f8, %f8;
+; CHECK-SM70-NEXT:    or.b32 %r25, %r21, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r26, %r25, %r24, %p3;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r26; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul bfloat %a, %b
+  %2 = fadd bfloat %1, %c
+  %3 = fcmp ogt bfloat %2, 0.0
+  %4 = select i1 %3, bfloat %2, bfloat 0.0
+  ret bfloat %4
+}
+
+define bfloat @fma_bf16_expanded_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_expanded_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_expanded_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_expanded_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_expanded_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_expanded_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_unsafe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_unsafe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_unsafe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul bfloat %a, %b
+  %2 = fadd bfloat %1, %c
+  %3 = fcmp ogt bfloat %2, 0.0
+  %4 = select i1 %3, bfloat %2, bfloat 0.0
+  ret bfloat %4
+}
+
+define bfloat @fma_bf16_expanded_maxnum_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_expanded_maxnum_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_expanded_maxnum_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_expanded_maxnum_unsafe(
+; CHECK-NO-FMA-CONTRACTION:       {
+; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NO-FMA-CONTRACTION-EMPTY:
+; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
+; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
+;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_expanded_maxnum_unsafe(
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_expanded_maxnum_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_maxnum_unsafe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_maxnum_unsafe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_maxnum_unsafe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul bfloat %a, %b
+  %2 = fadd bfloat %1, %c
+  %3 = call bfloat @llvm.maxnum.bf16(bfloat %2, bfloat 0.0)
+  ret bfloat %3
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }

>From c793bfd09af8da0eda9a35422304f0509b0a6874 Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney at codeplay.com>
Date: Thu, 7 Nov 2024 10:25:13 +0000
Subject: [PATCH 2/2] Make sure FMA only has one use, update tests

FMA relu should only be emitted if the FMA node has a single use. This
should limit register pressure in some cases by avoiding computing both
FMA and FMA.relu.

Also split tests into two files, one using FMA contraction and the other
using the FMA intrinsic.
---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       |  17 +-
 llvm/test/CodeGen/NVPTX/fma-relu-contract.ll  | 600 ++++++++++++
 .../CodeGen/NVPTX/fma-relu-fma-intrinsic.ll   | 400 ++++++++
 llvm/test/CodeGen/NVPTX/fma-relu.ll           | 920 ------------------
 4 files changed, 1011 insertions(+), 926 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
 delete mode 100644 llvm/test/CodeGen/NVPTX/fma-relu.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 39ab54841e8294..5a4fb90aba2a32 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3932,25 +3932,30 @@ def FMARELU_BF16 :
     Requires<[hasBF16Math, hasPTX<70>, hasSM<80>]>;
 def FMARELU_F16_FTZ :
   NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
-    "fma.rn.relu.ftz.f16 \t$dst, $a, $b, $c;", []>,
+    "fma.rn.ftz.relu.f16 \t$dst, $a, $b, $c;", []>,
     Requires<[useFP16Math, hasPTX<70>, hasSM<80>]>;
 def FMARELU_BF16_FTZ :
   NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
-    "fma.rn.relu.ftz.bf16 \t$dst, $a, $b, $c;", []>,
+    "fma.rn.ftz.relu.bf16 \t$dst, $a, $b, $c;", []>,
     Requires<[hasBF16Math, hasPTX<70>, hasSM<80>]>;
 
+// Patterns will only be used if FMA has a single use, in order to mitigate register pressure
+def NVPTX_fma_oneuse : PatFrag<(ops node:$a, node:$b, node:$c),
+                                  (fma node:$a, node:$b, node:$c), [{
+  return N->hasOneUse();
+}]>;
 
 // FTZ variants
-def : Pat<(f16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+def : Pat<(f16 (fmaxnum (NVPTX_fma_oneuse Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
   (FMARELU_F16_FTZ Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
   Requires<[doF32FTZ, allowUnsafeFPMath]>;
-def : Pat<(bf16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+def : Pat<(bf16 (fmaxnum (NVPTX_fma_oneuse Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
   (FMARELU_BF16_FTZ Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
   Requires<[doF32FTZ, allowUnsafeFPMath]>;
 // No FTZ
-def : Pat<(f16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+def : Pat<(f16 (fmaxnum (NVPTX_fma_oneuse Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
   (FMARELU_F16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
   Requires<[allowUnsafeFPMath]>;
-def : Pat<(bf16 (fmaxnum (fma Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
+def : Pat<(bf16 (fmaxnum (NVPTX_fma_oneuse Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), fpimm0)),
   (FMARELU_BF16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
   Requires<[allowUnsafeFPMath]>;
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
new file mode 100644
index 00000000000000..74d5ae45997e89
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -0,0 +1,600 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+
+; Using FTZ should emit fma.ftz.relu
+; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK-FTZ
+; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+
+; SM < 80 (fma.relu also needs PTX version >= 70) should not emit fma{.ftz}.relu
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70
+
+define half @fma_f16_expanded_unsafe(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_expanded_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_expanded_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_expanded_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul half %a, %b
+  %2 = fadd half %1, %c
+  %3 = fcmp ogt half %2, 0.0
+  %4 = select i1 %3, half %2, half 0.0
+  ret half %4
+}
+
+; FMA relu shouldn't be selected if the FMA operation has multiple uses
+define half @fma_f16_expanded_unsafe_multiple_uses_of_fma(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_expanded_unsafe_multiple_uses_of_fma(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_multiple_uses_of_fma_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_multiple_uses_of_fma_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_multiple_uses_of_fma_param_2];
+; CHECK-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    mov.b16 %rs5, 0x0000;
+; CHECK-NEXT:    max.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT:    mov.b16 %rs7, 0x4700;
+; CHECK-NEXT:    add.f16 %rs8, %rs4, %rs7;
+; CHECK-NEXT:    add.f16 %rs9, %rs6, %rs8;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs9;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_expanded_unsafe_multiple_uses_of_fma(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<10>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_multiple_uses_of_fma_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_multiple_uses_of_fma_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_multiple_uses_of_fma_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    mov.b16 %rs5, 0x0000;
+; CHECK-FTZ-NEXT:    max.ftz.f16 %rs6, %rs4, %rs5;
+; CHECK-FTZ-NEXT:    mov.b16 %rs7, 0x4700;
+; CHECK-FTZ-NEXT:    add.ftz.f16 %rs8, %rs4, %rs7;
+; CHECK-FTZ-NEXT:    add.ftz.f16 %rs9, %rs6, %rs8;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs9;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_expanded_unsafe_multiple_uses_of_fma(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<9>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_multiple_uses_of_fma_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    mov.b16 %rs6, 0x4700;
+; CHECK-SM70-NEXT:    add.f16 %rs7, %rs4, %rs6;
+; CHECK-SM70-NEXT:    add.f16 %rs8, %rs5, %rs7;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs8;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul half %a, %b
+  %2 = fadd half %1, %c
+  %3 = fcmp ogt half %2, 0.0
+  %4 = select i1 %3, half %2, half 0.0
+  %5 = fadd half %2, 7.0
+  %6 = fadd half %4, %5
+  ret half %6
+}
+
+define half @fma_f16_expanded_safe(half %a, half %b, half %c) {
+; CHECK-LABEL: fma_f16_expanded_safe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
+; CHECK-NEXT:    mul.rn.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
+; CHECK-NEXT:    add.rn.f16 %rs5, %rs3, %rs4;
+; CHECK-NEXT:    mov.b16 %rs6, 0x0000;
+; CHECK-NEXT:    max.f16 %rs7, %rs5, %rs6;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_expanded_safe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<8>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
+; CHECK-FTZ-NEXT:    mul.rn.ftz.f16 %rs3, %rs1, %rs2;
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
+; CHECK-FTZ-NEXT:    add.rn.ftz.f16 %rs5, %rs3, %rs4;
+; CHECK-FTZ-NEXT:    mov.b16 %rs6, 0x0000;
+; CHECK-FTZ-NEXT:    max.ftz.f16 %rs7, %rs5, %rs6;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_expanded_safe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
+; CHECK-SM70-NEXT:    mul.rn.f16 %rs3, %rs1, %rs2;
+; CHECK-SM70-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
+; CHECK-SM70-NEXT:    add.rn.f16 %rs5, %rs3, %rs4;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs5;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs6, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs6;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul half %a, %b
+  %2 = fadd half %1, %c
+  %3 = fcmp ogt half %2, 0.0
+  %4 = select i1 %3, half %2, half 0.0
+  ret half %4
+}
+
+define half @fma_f16_expanded_maxnum_unsafe(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_expanded_maxnum_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_expanded_maxnum_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_expanded_maxnum_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul half %a, %b
+  %2 = fadd half %1, %c
+  %3 = call half @llvm.maxnum.f16(half %2, half 0.0)
+  ret half %3
+}
+
+define bfloat @fma_bf16_expanded_safe(bfloat %a, bfloat %b, bfloat %c) {
+; CHECK-LABEL: fma_bf16_expanded_safe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<6>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .f32 %f<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
+; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
+; CHECK-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-NEXT:    mov.b32 %f2, %r4;
+; CHECK-NEXT:    mul.rn.f32 %f3, %f2, %f1;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs1;
+; CHECK-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-NEXT:    mov.b32 %f4, %r6;
+; CHECK-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
+; CHECK-NEXT:    shl.b32 %r8, %r7, 16;
+; CHECK-NEXT:    mov.b32 %f5, %r8;
+; CHECK-NEXT:    add.rn.f32 %f6, %f4, %f5;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
+; CHECK-NEXT:    mov.b16 %rs4, 0x0000;
+; CHECK-NEXT:    max.bf16 %rs5, %rs3, %rs4;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_expanded_safe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<6>;
+; CHECK-FTZ-NEXT:    .reg .b32 %r<9>;
+; CHECK-FTZ-NEXT:    .reg .f32 %f<7>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
+; CHECK-FTZ-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f1, %r2;
+; CHECK-FTZ-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
+; CHECK-FTZ-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f2, %r4;
+; CHECK-FTZ-NEXT:    mul.rn.ftz.f32 %f3, %f2, %f1;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs1;
+; CHECK-FTZ-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f4, %r6;
+; CHECK-FTZ-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
+; CHECK-FTZ-NEXT:    shl.b32 %r8, %r7, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f5, %r8;
+; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f6, %f4, %f5;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
+; CHECK-FTZ-NEXT:    mov.b16 %rs4, 0x0000;
+; CHECK-FTZ-NEXT:    max.bf16 %rs5, %rs3, %rs4;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_expanded_safe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<4>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<27>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<9>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    mul.rn.f32 %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r5, %f3;
+; CHECK-SM70-NEXT:    bfe.u32 %r6, %r5, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r7, %r6, %r5;
+; CHECK-SM70-NEXT:    add.s32 %r8, %r7, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
+; CHECK-SM70-NEXT:    or.b32 %r9, %r5, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r11, %r10, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f4, %r11;
+; CHECK-SM70-NEXT:    ld.param.u16 %r12, [fma_bf16_expanded_safe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r13, %r12, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    add.rn.f32 %f6, %f4, %f5;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    and.b32 %r20, %r19, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f7, %r20;
+; CHECK-SM70-NEXT:    max.f32 %f8, %f7, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r21, %f8;
+; CHECK-SM70-NEXT:    bfe.u32 %r22, %r21, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r23, %r22, %r21;
+; CHECK-SM70-NEXT:    add.s32 %r24, %r23, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f8, %f8;
+; CHECK-SM70-NEXT:    or.b32 %r25, %r21, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r26, %r25, %r24, %p3;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r26; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul bfloat %a, %b
+  %2 = fadd bfloat %1, %c
+  %3 = fcmp ogt bfloat %2, 0.0
+  %4 = select i1 %3, bfloat %2, bfloat 0.0
+  ret bfloat %4
+}
+
+define bfloat @fma_bf16_expanded_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_expanded_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_expanded_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.ftz.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_expanded_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_unsafe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_unsafe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_unsafe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul bfloat %a, %b
+  %2 = fadd bfloat %1, %c
+  %3 = fcmp ogt bfloat %2, 0.0
+  %4 = select i1 %3, bfloat %2, bfloat 0.0
+  ret bfloat %4
+}
+
+; FMA relu shouldn't be selected if the FMA operation has multiple uses
+define bfloat @fma_bf16_expanded_unsafe_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_expanded_unsafe_multiple_uses_of_fma(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<12>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .f32 %f<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_multiple_uses_of_fma_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_multiple_uses_of_fma_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_multiple_uses_of_fma_param_2];
+; CHECK-NEXT:    fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    mov.b16 %rs5, 0x0000;
+; CHECK-NEXT:    max.bf16 %rs6, %rs4, %rs5;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
+; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NEXT:    add.f32 %f2, %f1, 0f40E00000;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs8, %f2;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs6;
+; CHECK-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-NEXT:    mov.b32 %f3, %r4;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs8;
+; CHECK-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-NEXT:    mov.b32 %f4, %r6;
+; CHECK-NEXT:    add.f32 %f5, %f3, %f4;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs11, %f5;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs11;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_expanded_unsafe_multiple_uses_of_fma(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<12>;
+; CHECK-FTZ-NEXT:    .reg .b32 %r<7>;
+; CHECK-FTZ-NEXT:    .reg .f32 %f<6>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_multiple_uses_of_fma_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_multiple_uses_of_fma_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_multiple_uses_of_fma_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    mov.b16 %rs5, 0x0000;
+; CHECK-FTZ-NEXT:    max.bf16 %rs6, %rs4, %rs5;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r1, %rs4;
+; CHECK-FTZ-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f1, %r2;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %f2, %f1, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs8, %f2;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r3, %rs6;
+; CHECK-FTZ-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f3, %r4;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs8;
+; CHECK-FTZ-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f4, %r6;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %f5, %f3, %f4;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs11, %f5;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs11;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_expanded_unsafe_multiple_uses_of_fma(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<5>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<34>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<11>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_unsafe_multiple_uses_of_fma_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_unsafe_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_unsafe_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    add.f32 %f7, %f5, 0f40E00000;
+; CHECK-SM70-NEXT:    mov.b32 %r20, %f7;
+; CHECK-SM70-NEXT:    bfe.u32 %r21, %r20, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r22, %r21, %r20;
+; CHECK-SM70-NEXT:    add.s32 %r23, %r22, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f7, %f7;
+; CHECK-SM70-NEXT:    or.b32 %r24, %r20, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r25, %r24, %r23, %p3;
+; CHECK-SM70-NEXT:    and.b32 %r26, %r25, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f8, %r26;
+; CHECK-SM70-NEXT:    and.b32 %r27, %r19, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f9, %r27;
+; CHECK-SM70-NEXT:    add.f32 %f10, %f9, %f8;
+; CHECK-SM70-NEXT:    mov.b32 %r28, %f10;
+; CHECK-SM70-NEXT:    bfe.u32 %r29, %r28, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r30, %r29, %r28;
+; CHECK-SM70-NEXT:    add.s32 %r31, %r30, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f10, %f10;
+; CHECK-SM70-NEXT:    or.b32 %r32, %r28, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r33, %r32, %r31, %p4;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r33; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul bfloat %a, %b
+  %2 = fadd bfloat %1, %c
+  %3 = fcmp ogt bfloat %2, 0.0
+  %4 = select i1 %3, bfloat %2, bfloat 0.0
+  %5 = fadd bfloat %2, 7.0
+  %6 = fadd bfloat %4, %5
+  ret bfloat %6
+}
+
+define bfloat @fma_bf16_expanded_maxnum_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_expanded_maxnum_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_expanded_maxnum_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.ftz.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_expanded_maxnum_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_maxnum_unsafe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_maxnum_unsafe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_maxnum_unsafe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = fmul bfloat %a, %b
+  %2 = fadd bfloat %1, %c
+  %3 = call bfloat @llvm.maxnum.bf16(bfloat %2, bfloat 0.0)
+  ret bfloat %3
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
new file mode 100644
index 00000000000000..3e114e02d463b0
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
@@ -0,0 +1,400 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+
+; Using FTZ should emit fma.ftz.relu
+; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK-FTZ
+; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+
+; fma{.ftz}.relu needs SM >= 80 and PTX version >= 70, so SM < 80 should not emit it
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70
+
+define half @fma_f16_unsafe(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call half @llvm.fma.f16(half %a, half %b, half %c)
+  %2 = fcmp ogt half %1, 0.0
+  %3 = select i1 %2, half %1, half 0.0
+  ret half %3
+}
+
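+; FMA relu shouldn't be selected if the FMA operation has multiple uses. NOTE(review): the select result %3 is never used below (%5 adds %4 and %1), so the relu is dead code and no max appears in the expected output; confirm whether %5 was meant to use %3.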
+define half @fma_f16_unsafe_multiple_uses_of_fma(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_unsafe_multiple_uses_of_fma(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_multiple_uses_of_fma_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_multiple_uses_of_fma_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_multiple_uses_of_fma_param_2];
+; CHECK-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    mov.b16 %rs5, 0x4700;
+; CHECK-NEXT:    add.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT:    add.f16 %rs7, %rs6, %rs4;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_unsafe_multiple_uses_of_fma(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<8>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_multiple_uses_of_fma_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_multiple_uses_of_fma_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_multiple_uses_of_fma_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.ftz.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    mov.b16 %rs5, 0x4700;
+; CHECK-FTZ-NEXT:    add.ftz.f16 %rs6, %rs4, %rs5;
+; CHECK-FTZ-NEXT:    add.ftz.f16 %rs7, %rs6, %rs4;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_unsafe_multiple_uses_of_fma(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<8>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_multiple_uses_of_fma_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    mov.b16 %rs5, 0x4700;
+; CHECK-SM70-NEXT:    add.f16 %rs6, %rs4, %rs5;
+; CHECK-SM70-NEXT:    add.f16 %rs7, %rs6, %rs4;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call half @llvm.fma.f16(half %a, half %b, half %c)
+  %2 = fcmp ogt half %1, 0.0
+  %3 = select i1 %2, half %1, half 0.0
+  %4 = fadd half %1, 7.0
+  %5 = fadd half %4, %1
+  ret half %5
+}
+
+define half @fma_f16_maxnum_unsafe(half %a, half %b, half %c) #0 {
+; CHECK-LABEL: fma_f16_maxnum_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_f16_maxnum_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.ftz.relu.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_f16_maxnum_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
+; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
+; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call half @llvm.fma.f16(half %a, half %b, half %c)
+  %2 = call half @llvm.maxnum.f16(half %1, half 0.0)
+  ret half %2
+}
+
+define bfloat @fma_bf16_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.ftz.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_unsafe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_unsafe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_unsafe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
+  %2 = fcmp ogt bfloat %1, 0.0
+  %3 = select i1 %2, bfloat %1, bfloat 0.0
+  ret bfloat %3
+}
+
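+; FMA relu shouldn't be selected if the FMA operation has multiple uses. NOTE(review): the select result %3 is never used below (%5 adds %4 and %1), so the relu is dead code and no max appears in the expected output; confirm whether %5 was meant to use %3.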
+define bfloat @fma_bf16_unsafe_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_unsafe_multiple_uses_of_fma(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_multiple_uses_of_fma_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_multiple_uses_of_fma_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_multiple_uses_of_fma_param_2];
+; CHECK-NEXT:    fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
+; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NEXT:    add.f32 %f2, %f1, 0f40E00000;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs6, %f2;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs6;
+; CHECK-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-NEXT:    mov.b32 %f3, %r4;
+; CHECK-NEXT:    add.f32 %f4, %f3, %f1;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs8, %f4;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs8;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_unsafe_multiple_uses_of_fma(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<9>;
+; CHECK-FTZ-NEXT:    .reg .b32 %r<5>;
+; CHECK-FTZ-NEXT:    .reg .f32 %f<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_multiple_uses_of_fma_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_multiple_uses_of_fma_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_multiple_uses_of_fma_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r1, %rs4;
+; CHECK-FTZ-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f1, %r2;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %f2, %f1, 0f40E00000;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs6, %f2;
+; CHECK-FTZ-NEXT:    cvt.u32.u16 %r3, %rs6;
+; CHECK-FTZ-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-FTZ-NEXT:    mov.b32 %f3, %r4;
+; CHECK-FTZ-NEXT:    add.ftz.f32 %f4, %f3, %f1;
+; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs8, %f4;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs8;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_unsafe_multiple_uses_of_fma(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<4>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<27>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<9>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_unsafe_multiple_uses_of_fma_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_unsafe_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_unsafe_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    add.f32 %f6, %f5, 0f40E00000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    and.b32 %r20, %r19, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f7, %r20;
+; CHECK-SM70-NEXT:    add.f32 %f8, %f7, %f5;
+; CHECK-SM70-NEXT:    mov.b32 %r21, %f8;
+; CHECK-SM70-NEXT:    bfe.u32 %r22, %r21, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r23, %r22, %r21;
+; CHECK-SM70-NEXT:    add.s32 %r24, %r23, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f8, %f8;
+; CHECK-SM70-NEXT:    or.b32 %r25, %r21, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r26, %r25, %r24, %p3;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r26; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
+  %2 = fcmp ogt bfloat %1, 0.0
+  %3 = select i1 %2, bfloat %1, bfloat 0.0
+  %4 = fadd bfloat %1, 7.0
+  %5 = fadd bfloat %4, %1
+  ret bfloat %5
+}
+
+define bfloat @fma_bf16_maxnum_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-LABEL: fma_bf16_maxnum_unsafe(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
+; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT:    ret;
+;
+; CHECK-FTZ-LABEL: fma_bf16_maxnum_unsafe(
+; CHECK-FTZ:       {
+; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
+; CHECK-FTZ-EMPTY:
+; CHECK-FTZ-NEXT:  // %bb.0:
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
+; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
+; CHECK-FTZ-NEXT:    fma.rn.ftz.relu.bf16 %rs4, %rs1, %rs2, %rs3;
+; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
+; CHECK-FTZ-NEXT:    ret;
+;
+; CHECK-SM70-LABEL: fma_bf16_maxnum_unsafe(
+; CHECK-SM70:       {
+; CHECK-SM70-NEXT:    .reg .pred %p<3>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
+; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
+; CHECK-SM70-EMPTY:
+; CHECK-SM70-NEXT:  // %bb.0:
+; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_maxnum_unsafe_param_2];
+; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
+; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_maxnum_unsafe_param_1];
+; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
+; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_maxnum_unsafe_param_0];
+; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
+; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
+; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
+; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
+; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
+; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
+; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
+; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
+; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-SM70-NEXT:    ret;
+  %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
+  %2 = call bfloat @llvm.maxnum.bf16(bfloat %1, bfloat 0.0)
+  ret bfloat %2
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu.ll b/llvm/test/CodeGen/NVPTX/fma-relu.ll
deleted file mode 100644
index 3d95a4df2d3308..00000000000000
--- a/llvm/test/CodeGen/NVPTX/fma-relu.ll
+++ /dev/null
@@ -1,920 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
-
-; Using FTZ should emit fma.ftz.relu
-; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK-FTZ
-; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
-
-; Don't contract FMAs
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -nvptx-fma-level=0 | FileCheck %s --check-prefixes=CHECK-NO-FMA-CONTRACTION
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -nvptx-fma-level=2 | FileCheck %s --check-prefixes=CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH
-
-; SM < 80 or (which needs PTX version >= 70) should not emit fma{.ftz}.relu
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70
-
-define half @fma_f16_unsafe(half %a, half %b, half %c) #0 {
-; CHECK-LABEL: fma_f16_unsafe(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
-; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
-; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NEXT:    ret;
-;
-; CHECK-FTZ-LABEL: fma_f16_unsafe(
-; CHECK-FTZ:       {
-; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FTZ-EMPTY:
-; CHECK-FTZ-NEXT:  // %bb.0:
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
-; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FTZ-NEXT:    ret;
-;
-; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_unsafe(
-; CHECK-NO-FMA-CONTRACTION:       {
-; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NO-FMA-CONTRACTION-EMPTY:
-; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
-;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_unsafe(
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
-;
-; CHECK-SM70-LABEL: fma_f16_unsafe(
-; CHECK-SM70:       {
-; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
-; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
-; CHECK-SM70-EMPTY:
-; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_unsafe_param_0];
-; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_unsafe_param_1];
-; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_unsafe_param_2];
-; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
-; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
-; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
-; CHECK-SM70-NEXT:    ret;
-  %1 = call half @llvm.fma.f16(half %a, half %b, half %c)
-  %2 = fcmp ogt half %1, 0.0
-  %3 = select i1 %2, half %1, half 0.0
-  ret half %3
-}
-
-define half @fma_f16_maxnum_unsafe(half %a, half %b, half %c) #0 {
-; CHECK-LABEL: fma_f16_maxnum_unsafe(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
-; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
-; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NEXT:    ret;
-;
-; CHECK-FTZ-LABEL: fma_f16_maxnum_unsafe(
-; CHECK-FTZ:       {
-; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FTZ-EMPTY:
-; CHECK-FTZ-NEXT:  // %bb.0:
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
-; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FTZ-NEXT:    ret;
-;
-; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_maxnum_unsafe(
-; CHECK-NO-FMA-CONTRACTION:       {
-; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NO-FMA-CONTRACTION-EMPTY:
-; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
-;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_maxnum_unsafe(
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
-;
-; CHECK-SM70-LABEL: fma_f16_maxnum_unsafe(
-; CHECK-SM70:       {
-; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
-; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
-; CHECK-SM70-EMPTY:
-; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_maxnum_unsafe_param_0];
-; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_maxnum_unsafe_param_1];
-; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_maxnum_unsafe_param_2];
-; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
-; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
-; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
-; CHECK-SM70-NEXT:    ret;
-  %1 = call half @llvm.fma.f16(half %a, half %b, half %c)
-  %2 = call half @llvm.maxnum.f16(half %1, half 0.0)
-  ret half %2
-}
-
-define half @fma_f16_expanded_unsafe(half %a, half %b, half %c) #0 {
-; CHECK-LABEL: fma_f16_expanded_unsafe(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
-; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
-; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NEXT:    ret;
-;
-; CHECK-FTZ-LABEL: fma_f16_expanded_unsafe(
-; CHECK-FTZ:       {
-; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FTZ-EMPTY:
-; CHECK-FTZ-NEXT:  // %bb.0:
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
-; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FTZ-NEXT:    ret;
-;
-; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_expanded_unsafe(
-; CHECK-NO-FMA-CONTRACTION:       {
-; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NO-FMA-CONTRACTION-EMPTY:
-; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
-;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_expanded_unsafe(
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
-;
-; CHECK-SM70-LABEL: fma_f16_expanded_unsafe(
-; CHECK-SM70:       {
-; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
-; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
-; CHECK-SM70-EMPTY:
-; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_unsafe_param_0];
-; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_unsafe_param_1];
-; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_unsafe_param_2];
-; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
-; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
-; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
-; CHECK-SM70-NEXT:    ret;
-  %1 = fmul half %a, %b
-  %2 = fadd half %1, %c
-  %3 = fcmp ogt half %2, 0.0
-  %4 = select i1 %3, half %2, half 0.0
-  ret half %4
-}
-
-define half @fma_f16_expanded_safe(half %a, half %b, half %c) {
-; CHECK-LABEL: fma_f16_expanded_safe(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<8>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
-; CHECK-NEXT:    mul.rn.f16 %rs3, %rs1, %rs2;
-; CHECK-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
-; CHECK-NEXT:    add.rn.f16 %rs5, %rs3, %rs4;
-; CHECK-NEXT:    mov.b16 %rs6, 0x0000;
-; CHECK-NEXT:    max.f16 %rs7, %rs5, %rs6;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
-; CHECK-NEXT:    ret;
-;
-; CHECK-FTZ-LABEL: fma_f16_expanded_safe(
-; CHECK-FTZ:       {
-; CHECK-FTZ-NEXT:    .reg .b16 %rs<8>;
-; CHECK-FTZ-EMPTY:
-; CHECK-FTZ-NEXT:  // %bb.0:
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
-; CHECK-FTZ-NEXT:    mul.rn.ftz.f16 %rs3, %rs1, %rs2;
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
-; CHECK-FTZ-NEXT:    add.rn.ftz.f16 %rs5, %rs3, %rs4;
-; CHECK-FTZ-NEXT:    mov.b16 %rs6, 0x0000;
-; CHECK-FTZ-NEXT:    max.ftz.f16 %rs7, %rs5, %rs6;
-; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs7;
-; CHECK-FTZ-NEXT:    ret;
-;
-; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_expanded_safe(
-; CHECK-NO-FMA-CONTRACTION:       {
-; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<8>;
-; CHECK-NO-FMA-CONTRACTION-EMPTY:
-; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    mul.rn.f16 %rs3, %rs1, %rs2;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    add.rn.f16 %rs5, %rs3, %rs4;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b16 %rs6, 0x0000;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    max.f16 %rs7, %rs5, %rs6;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs7;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
-;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_expanded_safe(
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<8>;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mul.f16 %rs3, %rs1, %rs2;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    add.f16 %rs5, %rs3, %rs4;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b16 %rs6, 0x0000;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    max.f16 %rs7, %rs5, %rs6;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs7;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
-;
-; CHECK-SM70-LABEL: fma_f16_expanded_safe(
-; CHECK-SM70:       {
-; CHECK-SM70-NEXT:    .reg .b16 %rs<7>;
-; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
-; CHECK-SM70-EMPTY:
-; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_safe_param_0];
-; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_safe_param_1];
-; CHECK-SM70-NEXT:    mul.rn.f16 %rs3, %rs1, %rs2;
-; CHECK-SM70-NEXT:    ld.param.b16 %rs4, [fma_f16_expanded_safe_param_2];
-; CHECK-SM70-NEXT:    add.rn.f16 %rs5, %rs3, %rs4;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs5;
-; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs6, %f2;
-; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs6;
-; CHECK-SM70-NEXT:    ret;
-  %1 = fmul half %a, %b
-  %2 = fadd half %1, %c
-  %3 = fcmp ogt half %2, 0.0
-  %4 = select i1 %3, half %2, half 0.0
-  ret half %4
-}
-
-define half @fma_f16_expanded_maxnum_unsafe(half %a, half %b, half %c) #0 {
-; CHECK-LABEL: fma_f16_expanded_maxnum_unsafe(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
-; CHECK-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
-; CHECK-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NEXT:    ret;
-;
-; CHECK-FTZ-LABEL: fma_f16_expanded_maxnum_unsafe(
-; CHECK-FTZ:       {
-; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FTZ-EMPTY:
-; CHECK-FTZ-NEXT:  // %bb.0:
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
-; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FTZ-NEXT:    ret;
-;
-; CHECK-NO-FMA-CONTRACTION-LABEL: fma_f16_expanded_maxnum_unsafe(
-; CHECK-NO-FMA-CONTRACTION:       {
-; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NO-FMA-CONTRACTION-EMPTY:
-; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
-;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_f16_expanded_maxnum_unsafe(
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
-;
-; CHECK-SM70-LABEL: fma_f16_expanded_maxnum_unsafe(
-; CHECK-SM70:       {
-; CHECK-SM70-NEXT:    .reg .b16 %rs<6>;
-; CHECK-SM70-NEXT:    .reg .f32 %f<3>;
-; CHECK-SM70-EMPTY:
-; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.b16 %rs1, [fma_f16_expanded_maxnum_unsafe_param_0];
-; CHECK-SM70-NEXT:    ld.param.b16 %rs2, [fma_f16_expanded_maxnum_unsafe_param_1];
-; CHECK-SM70-NEXT:    ld.param.b16 %rs3, [fma_f16_expanded_maxnum_unsafe_param_2];
-; CHECK-SM70-NEXT:    fma.rn.f16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-SM70-NEXT:    cvt.f32.f16 %f1, %rs4;
-; CHECK-SM70-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %f2;
-; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
-; CHECK-SM70-NEXT:    ret;
-  %1 = fmul half %a, %b
-  %2 = fadd half %1, %c
-  %3 = call half @llvm.maxnum.f16(half %2, half 0.0)
-  ret half %3
-}
-
-define bfloat @fma_bf16_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
-; CHECK-LABEL: fma_bf16_unsafe(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
-; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
-; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NEXT:    ret;
-;
-; CHECK-FTZ-LABEL: fma_bf16_unsafe(
-; CHECK-FTZ:       {
-; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FTZ-EMPTY:
-; CHECK-FTZ-NEXT:  // %bb.0:
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
-; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FTZ-NEXT:    ret;
-;
-; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_unsafe(
-; CHECK-NO-FMA-CONTRACTION:       {
-; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NO-FMA-CONTRACTION-EMPTY:
-; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
-;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_unsafe(
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_bf16_unsafe_param_0];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_bf16_unsafe_param_1];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_bf16_unsafe_param_2];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
-;
-; CHECK-SM70-LABEL: fma_bf16_unsafe(
-; CHECK-SM70:       {
-; CHECK-SM70-NEXT:    .reg .pred %p<3>;
-; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
-; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
-; CHECK-SM70-EMPTY:
-; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_unsafe_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_unsafe_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_unsafe_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
-; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
-; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
-; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
-; CHECK-SM70-NEXT:    ret;
-  %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
-  %2 = fcmp ogt bfloat %1, 0.0
-  %3 = select i1 %2, bfloat %1, bfloat 0.0
-  ret bfloat %3
-}
-
-define bfloat @fma_bf16_maxnum_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
-; CHECK-LABEL: fma_bf16_maxnum_unsafe(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
-; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
-; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NEXT:    ret;
-;
-; CHECK-FTZ-LABEL: fma_bf16_maxnum_unsafe(
-; CHECK-FTZ:       {
-; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FTZ-EMPTY:
-; CHECK-FTZ-NEXT:  // %bb.0:
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
-; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FTZ-NEXT:    ret;
-;
-; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_maxnum_unsafe(
-; CHECK-NO-FMA-CONTRACTION:       {
-; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NO-FMA-CONTRACTION-EMPTY:
-; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
-;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_maxnum_unsafe(
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_bf16_maxnum_unsafe_param_0];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_bf16_maxnum_unsafe_param_1];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_bf16_maxnum_unsafe_param_2];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
-;
-; CHECK-SM70-LABEL: fma_bf16_maxnum_unsafe(
-; CHECK-SM70:       {
-; CHECK-SM70-NEXT:    .reg .pred %p<3>;
-; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
-; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
-; CHECK-SM70-EMPTY:
-; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_maxnum_unsafe_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_maxnum_unsafe_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_maxnum_unsafe_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
-; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
-; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
-; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
-; CHECK-SM70-NEXT:    ret;
-  %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
-  %2 = call bfloat @llvm.maxnum.bf16(bfloat %1, bfloat 0.0)
-  ret bfloat %2
-}
-
-define bfloat @fma_bf16_expanded_safe(bfloat %a, bfloat %b, bfloat %c) {
-; CHECK-LABEL: fma_bf16_expanded_safe(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<6>;
-; CHECK-NEXT:    .reg .b32 %r<9>;
-; CHECK-NEXT:    .reg .f32 %f<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
-; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-NEXT:    mov.b32 %f1, %r2;
-; CHECK-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
-; CHECK-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-NEXT:    mov.b32 %f2, %r4;
-; CHECK-NEXT:    mul.rn.f32 %f3, %f2, %f1;
-; CHECK-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
-; CHECK-NEXT:    cvt.u32.u16 %r5, %rs1;
-; CHECK-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-NEXT:    mov.b32 %f4, %r6;
-; CHECK-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
-; CHECK-NEXT:    shl.b32 %r8, %r7, 16;
-; CHECK-NEXT:    mov.b32 %f5, %r8;
-; CHECK-NEXT:    add.rn.f32 %f6, %f4, %f5;
-; CHECK-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
-; CHECK-NEXT:    mov.b16 %rs4, 0x0000;
-; CHECK-NEXT:    max.bf16 %rs5, %rs3, %rs4;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs5;
-; CHECK-NEXT:    ret;
-;
-; CHECK-FTZ-LABEL: fma_bf16_expanded_safe(
-; CHECK-FTZ:       {
-; CHECK-FTZ-NEXT:    .reg .b16 %rs<6>;
-; CHECK-FTZ-NEXT:    .reg .b32 %r<9>;
-; CHECK-FTZ-NEXT:    .reg .f32 %f<7>;
-; CHECK-FTZ-EMPTY:
-; CHECK-FTZ-NEXT:  // %bb.0:
-; CHECK-FTZ-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
-; CHECK-FTZ-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f1, %r2;
-; CHECK-FTZ-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
-; CHECK-FTZ-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f2, %r4;
-; CHECK-FTZ-NEXT:    mul.rn.ftz.f32 %f3, %f2, %f1;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
-; CHECK-FTZ-NEXT:    cvt.u32.u16 %r5, %rs1;
-; CHECK-FTZ-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f4, %r6;
-; CHECK-FTZ-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
-; CHECK-FTZ-NEXT:    shl.b32 %r8, %r7, 16;
-; CHECK-FTZ-NEXT:    mov.b32 %f5, %r8;
-; CHECK-FTZ-NEXT:    add.rn.ftz.f32 %f6, %f4, %f5;
-; CHECK-FTZ-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
-; CHECK-FTZ-NEXT:    mov.b16 %rs4, 0x0000;
-; CHECK-FTZ-NEXT:    max.bf16 %rs5, %rs3, %rs4;
-; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs5;
-; CHECK-FTZ-NEXT:    ret;
-;
-; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_expanded_safe(
-; CHECK-NO-FMA-CONTRACTION:       {
-; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<6>;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b32 %r<9>;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .f32 %f<7>;
-; CHECK-NO-FMA-CONTRACTION-EMPTY:
-; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b32 %f1, %r2;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b32 %f2, %r4;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    mul.rn.f32 %f3, %f2, %f1;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    cvt.u32.u16 %r5, %rs1;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b32 %f4, %r6;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    shl.b32 %r8, %r7, 16;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b32 %f5, %r8;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    add.rn.f32 %f6, %f4, %f5;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    mov.b16 %rs4, 0x0000;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    max.bf16 %rs5, %rs3, %rs4;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs5;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
-;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_expanded_safe(
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<6>;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b32 %r<9>;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .f32 %f<7>;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b32 %f1, %r2;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b32 %f2, %r4;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mul.f32 %f3, %f2, %f1;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    cvt.rn.bf16.f32 %rs1, %f3;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    cvt.u32.u16 %r5, %rs1;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b32 %f4, %r6;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.u16 %r7, [fma_bf16_expanded_safe_param_2];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    shl.b32 %r8, %r7, 16;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b32 %f5, %r8;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    add.f32 %f6, %f4, %f5;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    cvt.rn.bf16.f32 %rs3, %f6;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    mov.b16 %rs4, 0x0000;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    max.bf16 %rs5, %rs3, %rs4;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs5;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
-;
-; CHECK-SM70-LABEL: fma_bf16_expanded_safe(
-; CHECK-SM70:       {
-; CHECK-SM70-NEXT:    .reg .pred %p<4>;
-; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<27>;
-; CHECK-SM70-NEXT:    .reg .f32 %f<9>;
-; CHECK-SM70-EMPTY:
-; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_safe_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_safe_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    mul.rn.f32 %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r5, %f3;
-; CHECK-SM70-NEXT:    bfe.u32 %r6, %r5, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r7, %r6, %r5;
-; CHECK-SM70-NEXT:    add.s32 %r8, %r7, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
-; CHECK-SM70-NEXT:    or.b32 %r9, %r5, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
-; CHECK-SM70-NEXT:    and.b32 %r11, %r10, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f4, %r11;
-; CHECK-SM70-NEXT:    ld.param.u16 %r12, [fma_bf16_expanded_safe_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r13, %r12, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    add.rn.f32 %f6, %f4, %f5;
-; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
-; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
-; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
-; CHECK-SM70-NEXT:    and.b32 %r20, %r19, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f7, %r20;
-; CHECK-SM70-NEXT:    max.f32 %f8, %f7, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r21, %f8;
-; CHECK-SM70-NEXT:    bfe.u32 %r22, %r21, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r23, %r22, %r21;
-; CHECK-SM70-NEXT:    add.s32 %r24, %r23, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f8, %f8;
-; CHECK-SM70-NEXT:    or.b32 %r25, %r21, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r26, %r25, %r24, %p3;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r26; }
-; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
-; CHECK-SM70-NEXT:    ret;
-  %1 = fmul bfloat %a, %b
-  %2 = fadd bfloat %1, %c
-  %3 = fcmp ogt bfloat %2, 0.0
-  %4 = select i1 %3, bfloat %2, bfloat 0.0
-  ret bfloat %4
-}
-
-define bfloat @fma_bf16_expanded_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
-; CHECK-LABEL: fma_bf16_expanded_unsafe(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
-; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
-; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NEXT:    ret;
-;
-; CHECK-FTZ-LABEL: fma_bf16_expanded_unsafe(
-; CHECK-FTZ:       {
-; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FTZ-EMPTY:
-; CHECK-FTZ-NEXT:  // %bb.0:
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
-; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FTZ-NEXT:    ret;
-;
-; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_expanded_unsafe(
-; CHECK-NO-FMA-CONTRACTION:       {
-; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NO-FMA-CONTRACTION-EMPTY:
-; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
-;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_expanded_unsafe(
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_unsafe_param_0];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_unsafe_param_1];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_unsafe_param_2];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
-;
-; CHECK-SM70-LABEL: fma_bf16_expanded_unsafe(
-; CHECK-SM70:       {
-; CHECK-SM70-NEXT:    .reg .pred %p<3>;
-; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
-; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
-; CHECK-SM70-EMPTY:
-; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_unsafe_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_unsafe_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_unsafe_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
-; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
-; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
-; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
-; CHECK-SM70-NEXT:    ret;
-  %1 = fmul bfloat %a, %b
-  %2 = fadd bfloat %1, %c
-  %3 = fcmp ogt bfloat %2, 0.0
-  %4 = select i1 %3, bfloat %2, bfloat 0.0
-  ret bfloat %4
-}
-
-define bfloat @fma_bf16_expanded_maxnum_unsafe(bfloat %a, bfloat %b, bfloat %c) #0 {
-; CHECK-LABEL: fma_bf16_expanded_maxnum_unsafe(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
-; CHECK-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
-; CHECK-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NEXT:    ret;
-;
-; CHECK-FTZ-LABEL: fma_bf16_expanded_maxnum_unsafe(
-; CHECK-FTZ:       {
-; CHECK-FTZ-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FTZ-EMPTY:
-; CHECK-FTZ-NEXT:  // %bb.0:
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
-; CHECK-FTZ-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
-; CHECK-FTZ-NEXT:    fma.rn.relu.ftz.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FTZ-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FTZ-NEXT:    ret;
-;
-; CHECK-NO-FMA-CONTRACTION-LABEL: fma_bf16_expanded_maxnum_unsafe(
-; CHECK-NO-FMA-CONTRACTION:       {
-; CHECK-NO-FMA-CONTRACTION-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NO-FMA-CONTRACTION-EMPTY:
-; CHECK-NO-FMA-CONTRACTION-NEXT:  // %bb.0:
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
-; CHECK-NO-FMA-CONTRACTION-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-NO-FMA-CONTRACTION-NEXT:    ret;
-;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-LABEL: fma_bf16_expanded_maxnum_unsafe(
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH:       {
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    .reg .b16 %rs<5>;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-EMPTY:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:  // %bb.0:
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs1, [fma_bf16_expanded_maxnum_unsafe_param_0];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs2, [fma_bf16_expanded_maxnum_unsafe_param_1];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ld.param.b16 %rs3, [fma_bf16_expanded_maxnum_unsafe_param_2];
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    fma.rn.relu.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    st.param.b16 [func_retval0], %rs4;
-; CHECK-FMA-CONTRACTION-WITHOUT-UNSAFE-MATH-NEXT:    ret;
-;
-; CHECK-SM70-LABEL: fma_bf16_expanded_maxnum_unsafe(
-; CHECK-SM70:       {
-; CHECK-SM70-NEXT:    .reg .pred %p<3>;
-; CHECK-SM70-NEXT:    .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<20>;
-; CHECK-SM70-NEXT:    .reg .f32 %f<7>;
-; CHECK-SM70-EMPTY:
-; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_maxnum_unsafe_param_2];
-; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_maxnum_unsafe_param_1];
-; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_maxnum_unsafe_param_0];
-; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
-; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
-; CHECK-SM70-NEXT:    mov.b32 %r7, %f4;
-; CHECK-SM70-NEXT:    bfe.u32 %r8, %r7, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r9, %r8, %r7;
-; CHECK-SM70-NEXT:    add.s32 %r10, %r9, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p1, %f4, %f4;
-; CHECK-SM70-NEXT:    or.b32 %r11, %r7, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; CHECK-SM70-NEXT:    and.b32 %r13, %r12, -65536;
-; CHECK-SM70-NEXT:    mov.b32 %f5, %r13;
-; CHECK-SM70-NEXT:    max.f32 %f6, %f5, 0f00000000;
-; CHECK-SM70-NEXT:    mov.b32 %r14, %f6;
-; CHECK-SM70-NEXT:    bfe.u32 %r15, %r14, 16, 1;
-; CHECK-SM70-NEXT:    add.s32 %r16, %r15, %r14;
-; CHECK-SM70-NEXT:    add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
-; CHECK-SM70-NEXT:    or.b32 %r18, %r14, 4194304;
-; CHECK-SM70-NEXT:    selp.b32 %r19, %r18, %r17, %p2;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
-; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs1;
-; CHECK-SM70-NEXT:    ret;
-  %1 = fmul bfloat %a, %b
-  %2 = fadd bfloat %1, %c
-  %3 = call bfloat @llvm.maxnum.bf16(bfloat %2, bfloat 0.0)
-  ret bfloat %3
-}
-
-attributes #0 = { "unsafe-fp-math"="true" }



More information about the llvm-commits mailing list