[llvm] 5e5fd0e - [NVPTX] Select bfloat16 add/mul/sub as fma on SM80 (#121065)
Author: peterbell10
Date: 2025-01-16T14:53:24Z
New Revision: 5e5fd0e6fc50cc1198750308c11433a5b3acfd0f
URL: https://github.com/llvm/llvm-project/commit/5e5fd0e6fc50cc1198750308c11433a5b3acfd0f
DIFF: https://github.com/llvm/llvm-project/commit/5e5fd0e6fc50cc1198750308c11433a5b3acfd0f.diff
LOG: [NVPTX] Select bfloat16 add/mul/sub as fma on SM80 (#121065)
SM80 has an fma instruction for bfloat16 but no native add/mul/sub. Currently these ops
incur a promotion to f32; we can avoid this by rewriting them in terms of
fma:
```
FADD(a, b) -> FMA(a, 1.0, b)
FMUL(a, b) -> FMA(a, b, -0.0)
FSUB(a, b) -> FMA(b, -1.0, a)
```
Unfortunately, there is no `fma.ftz` for bf16, so when ftz is enabled we still
fall back to promotion to f32.
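As a quick sanity check (illustration only, not part of the commit), the three
identities can be verified with ordinary C++ floats via `std::fma`; the same
reasoning carries over to bf16 because each fma rounds its result once, just
like the op it replaces:
```c++
// Illustration only: verify the FMA rewrites used above with plain floats.
#include <cassert>
#include <cmath>

int main() {
  float a = 1.5f, b = -2.25f;
  assert(std::fma(a, 1.0f, b) == a + b);   // FADD(a, b) -> FMA(a, 1.0, b)
  assert(std::fma(a, b, -0.0f) == a * b);  // FMUL(a, b) -> FMA(a, b, -0.0)
  assert(std::fma(b, -1.0f, a) == a - b);  // FSUB(a, b) -> FMA(b, -1.0, a)
}
```
Note that -0.0 (not +0.0) is used as the FMUL addend so that a product of -0.0
keeps its sign.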
Added:
Modified:
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
llvm/lib/Target/NVPTX/NVPTXISelLowering.h
llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
llvm/lib/Target/NVPTX/NVPTXSubtarget.h
llvm/test/CodeGen/NVPTX/atomics-sm90.ll
llvm/test/CodeGen/NVPTX/bf16-instructions.ll
llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 2e66b67dfdcc76..8f6adf2c22f922 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "NVPTXISelDAGToDAG.h"
+#include "NVPTX.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
@@ -191,6 +192,12 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
}
break;
}
+ case ISD::FADD:
+ case ISD::FMUL:
+ case ISD::FSUB:
+ if (tryBF16ArithToFMA(N))
+ return;
+ break;
default:
break;
}
@@ -2450,6 +2457,62 @@ bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
return true;
}
+// Select bf16/bf16v2 FADD, FSUB, FMUL as fma on targets with only fma
+bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
+ EVT VT = SDValue(N, 0).getValueType();
+ if (VT.getScalarType() != MVT::bf16)
+ return false;
+
+ const NVPTXSubtarget *STI = TM.getSubtargetImpl();
+ if (STI->hasNativeBF16Support(N->getOpcode()))
+ return false;
+
+ const bool IsVec = VT.isVector();
+ assert(!IsVec || VT.getVectorNumElements() == 2);
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SmallVector<SDValue, 3> Operands;
+ auto GetConstant = [&](float Value) -> SDValue {
+ // BF16 immediates must be legalized to integer register values
+ APFloat APF(Value);
+ bool LosesInfo;
+ APF.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &LosesInfo);
+ assert(!LosesInfo);
+ if (IsVec) {
+ auto API = APF.bitcastToAPInt();
+ API = API.concat(API);
+ auto Const = CurDAG->getTargetConstant(API, DL, MVT::i32);
+ return SDValue(CurDAG->getMachineNode(NVPTX::IMOV32ri, DL, VT, Const), 0);
+ }
+ auto Const = CurDAG->getTargetConstantFP(APF, DL, VT);
+ return SDValue(CurDAG->getMachineNode(NVPTX::BFMOV16ri, DL, VT, Const), 0);
+ };
+
+ switch (N->getOpcode()) {
+ case ISD::FADD:
+ // add(a, b) -> fma(a, 1.0, b)
+ Operands = {N0, GetConstant(1.0), N1};
+ break;
+ case ISD::FSUB:
+ // sub(a, b) -> fma(b, -1.0, a)
+ Operands = {N1, GetConstant(-1.0), N0};
+ break;
+ case ISD::FMUL:
+ // mul(a, b) -> fma(a, b, -0.0)
+ // NOTE: The identity is -0, not 0, because -0 + 0 == +0 for floats
+ Operands = {N0, N1, GetConstant(-0.0)};
+ break;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ };
+
+ int Opcode = IsVec ? NVPTX::BFMA16x2rrr : NVPTX::BFMA16rrr;
+ MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, DL, VT, Operands);
+ ReplaceNode(N, FMA);
+ return true;
+}
+
static inline bool isAddLike(const SDValue V) {
return V.getOpcode() == ISD::ADD ||
(V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 8cadde8a822647..7661f153238fcd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -84,6 +84,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryFence(SDNode *N);
void SelectAddrSpaceCast(SDNode *N);
bool tryBFE(SDNode *N);
+ bool tryBF16ArithToFMA(SDNode *N);
bool tryConstantFP(SDNode *N);
bool SelectSETP_F16X2(SDNode *N);
bool SelectSETP_BF16X2(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 184f96b872aa62..899db28a0ef642 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -535,34 +535,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
LegalizeAction NoBF16Action) {
- bool IsOpSupported = STI.hasBF16Math();
- switch (Op) {
- // Several BF16 instructions are available on sm_90 only.
- case ISD::FADD:
- case ISD::FMUL:
- case ISD::FSUB:
- case ISD::SELECT:
- case ISD::SELECT_CC:
- case ISD::SETCC:
- case ISD::FEXP2:
- case ISD::FCEIL:
- case ISD::FFLOOR:
- case ISD::FNEARBYINT:
- case ISD::FRINT:
- case ISD::FROUNDEVEN:
- case ISD::FTRUNC:
- IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
- break;
- // Several BF16 instructions are available on sm_80 only.
- case ISD::FMINNUM:
- case ISD::FMAXNUM:
- case ISD::FMAXNUM_IEEE:
- case ISD::FMINNUM_IEEE:
- case ISD::FMAXIMUM:
- case ISD::FMINIMUM:
- IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
- break;
- }
+ bool IsOpSupported = STI.hasNativeBF16Support(Op);
setOperationAction(
Op, VT, IsOpSupported ? Action : NoBF16Action);
};
@@ -862,6 +835,15 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
AddPromotedToType(Op, MVT::bf16, MVT::f32);
}
+ // On SM80, we select add/mul/sub as fma to avoid promotion to float
+ for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {
+ for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {
+ if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) {
+ setOperationAction(Op, VT, Custom);
+ }
+ }
+ }
+
// f16/f16x2 neg was introduced in PTX 60, SM_53.
const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
STI.getPTXVersion() >= 60 &&
@@ -2498,6 +2480,27 @@ SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
}
+static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = MVT::f32;
+ if (VT.isVector()) {
+ NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
+ }
+ SDLoc DL(N);
+ SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0), DL, NVT);
+ SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1), DL, NVT);
+ SDValue Res = DAG.getNode(N->getOpcode(), DL, NVT, Tmp0, Tmp1, N->getFlags());
+ return DAG.getFPExtendOrRound(Res, DL, VT);
+}
+
+SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (useF32FTZ(DAG.getMachineFunction())) {
+ return PromoteBinOpToF32(Op.getNode(), DAG);
+ }
+ return Op;
+}
+
SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
@@ -2689,6 +2692,12 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerSTACKSAVE(Op, DAG);
case ISD::CopyToReg:
return LowerCopyToReg_128(Op, DAG);
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ // Used only for bf16 on SM80, where we select fma for non-ftz operation
+ return PromoteBinOpIfF32FTZ(Op, DAG);
+
default:
llvm_unreachable("Custom lowering not defined for operation");
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 51265ed2179d88..5adf69d621552f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -278,6 +278,8 @@ class NVPTXTargetLowering : public TargetLowering {
SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue PromoteBinOpIfF32FTZ(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 74ce6a9fc4ac08..e5d680c19d9211 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -70,6 +70,38 @@ bool NVPTXSubtarget::allowFP16Math() const {
return hasFP16Math() && NoF16Math == false;
}
+bool NVPTXSubtarget::hasNativeBF16Support(int Opcode) const {
+ if (!hasBF16Math())
+ return false;
+
+ switch (Opcode) {
+ // Several BF16 instructions are available on sm_90 only.
+ case ISD::FADD:
+ case ISD::FMUL:
+ case ISD::FSUB:
+ case ISD::SELECT:
+ case ISD::SELECT_CC:
+ case ISD::SETCC:
+ case ISD::FEXP2:
+ case ISD::FCEIL:
+ case ISD::FFLOOR:
+ case ISD::FNEARBYINT:
+ case ISD::FRINT:
+ case ISD::FROUNDEVEN:
+ case ISD::FTRUNC:
+ return getSmVersion() >= 90 && getPTXVersion() >= 78;
+ // Several BF16 instructions are available on sm_80 only.
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM:
+ return getSmVersion() >= 80 && getPTXVersion() >= 70;
+ }
+ return true;
+}
+
void NVPTXSubtarget::failIfClustersUnsupported(
std::string const &FailureMessage) const {
if (hasClusters())
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index bbc1cca7c12d85..3b5c28e357e0cc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -118,6 +118,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
}
bool hasTargetName() const { return !TargetName.empty(); }
+ bool hasNativeBF16Support(int Opcode) const;
+
// Get maximum value of required alignments among the supported data types.
// From the PTX ISA doc, section 8.2.3:
// The memory consistency model relates operations executed on memory
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index f81b785f13225c..67552b95e04915 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -46,58 +46,52 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-LABEL: test(
; CHECKPTX71: {
; CHECKPTX71-NEXT: .reg .pred %p<5>;
-; CHECKPTX71-NEXT: .reg .b16 %rs<22>;
+; CHECKPTX71-NEXT: .reg .b16 %rs<26>;
; CHECKPTX71-NEXT: .reg .b32 %r<4>;
-; CHECKPTX71-NEXT: .reg .f32 %f<12>;
; CHECKPTX71-EMPTY:
; CHECKPTX71-NEXT: // %bb.0:
; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3];
; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2];
; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1];
; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0];
-; CHECKPTX71-NEXT: ld.b16 %rs18, [%r1];
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs13;
+; CHECKPTX71-NEXT: ld.b16 %rs22, [%r1];
; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start14
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs18;
-; CHECKPTX71-NEXT: add.rn.f32 %f3, %f2, %f1;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs14, %f3;
-; CHECKPTX71-NEXT: atom.cas.b16 %rs3, [%r1], %rs18, %rs14;
-; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs3, %rs18;
-; CHECKPTX71-NEXT: mov.u16 %rs18, %rs3;
+; CHECKPTX71-NEXT: mov.b16 %rs14, 0x3F80;
+; CHECKPTX71-NEXT: fma.rn.bf16 %rs15, %rs22, %rs14, %rs13;
+; CHECKPTX71-NEXT: atom.cas.b16 %rs3, [%r1], %rs22, %rs15;
+; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs3, %rs22;
+; CHECKPTX71-NEXT: mov.u16 %rs22, %rs3;
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end13
-; CHECKPTX71-NEXT: ld.b16 %rs19, [%r1];
+; CHECKPTX71-NEXT: ld.b16 %rs23, [%r1];
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start8
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs19;
-; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs15, %f5;
-; CHECKPTX71-NEXT: atom.cas.b16 %rs6, [%r1], %rs19, %rs15;
-; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs6, %rs19;
-; CHECKPTX71-NEXT: mov.u16 %rs19, %rs6;
+; CHECKPTX71-NEXT: mov.b16 %rs16, 0x3F80;
+; CHECKPTX71-NEXT: fma.rn.bf16 %rs17, %rs23, %rs16, %rs16;
+; CHECKPTX71-NEXT: atom.cas.b16 %rs6, [%r1], %rs23, %rs17;
+; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs6, %rs23;
+; CHECKPTX71-NEXT: mov.u16 %rs23, %rs6;
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end7
-; CHECKPTX71-NEXT: ld.global.b16 %rs20, [%r2];
+; CHECKPTX71-NEXT: ld.global.b16 %rs24, [%r2];
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start2
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f7, %rs20;
-; CHECKPTX71-NEXT: add.rn.f32 %f8, %f7, %f1;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs16, %f8;
-; CHECKPTX71-NEXT: atom.global.cas.b16 %rs9, [%r2], %rs20, %rs16;
-; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs9, %rs20;
-; CHECKPTX71-NEXT: mov.u16 %rs20, %rs9;
+; CHECKPTX71-NEXT: mov.b16 %rs18, 0x3F80;
+; CHECKPTX71-NEXT: fma.rn.bf16 %rs19, %rs24, %rs18, %rs13;
+; CHECKPTX71-NEXT: atom.global.cas.b16 %rs9, [%r2], %rs24, %rs19;
+; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs9, %rs24;
+; CHECKPTX71-NEXT: mov.u16 %rs24, %rs9;
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end1
-; CHECKPTX71-NEXT: ld.shared.b16 %rs21, [%r3];
+; CHECKPTX71-NEXT: ld.shared.b16 %rs25, [%r3];
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f10, %rs21;
-; CHECKPTX71-NEXT: add.rn.f32 %f11, %f10, %f1;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs17, %f11;
-; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs12, [%r3], %rs21, %rs17;
-; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs12, %rs21;
-; CHECKPTX71-NEXT: mov.u16 %rs21, %rs12;
+; CHECKPTX71-NEXT: mov.b16 %rs20, 0x3F80;
+; CHECKPTX71-NEXT: fma.rn.bf16 %rs21, %rs25, %rs20, %rs13;
+; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs12, [%r3], %rs25, %rs21;
+; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs12, %rs25;
+; CHECKPTX71-NEXT: mov.u16 %rs25, %rs12;
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end
; CHECKPTX71-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index 6828bac18cad7f..0c1b1e21669286 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -42,17 +42,14 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) {
;
; SM80-LABEL: test_fadd(
; SM80: {
-; SM80-NEXT: .reg .b16 %rs<4>;
-; SM80-NEXT: .reg .f32 %f<4>;
+; SM80-NEXT: .reg .b16 %rs<5>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b16 %rs1, [test_fadd_param_0];
; SM80-NEXT: ld.param.b16 %rs2, [test_fadd_param_1];
-; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
-; SM80-NEXT: cvt.f32.bf16 %f2, %rs1;
-; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
-; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f3;
-; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM80-NEXT: mov.b16 %rs3, 0x3F80;
+; SM80-NEXT: fma.rn.bf16 %rs4, %rs1, %rs3, %rs2;
+; SM80-NEXT: st.param.b16 [func_retval0], %rs4;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_fadd(
@@ -113,17 +110,14 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) {
;
; SM80-LABEL: test_fsub(
; SM80: {
-; SM80-NEXT: .reg .b16 %rs<4>;
-; SM80-NEXT: .reg .f32 %f<4>;
+; SM80-NEXT: .reg .b16 %rs<5>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b16 %rs1, [test_fsub_param_0];
-; SM80-NEXT: ld.param.b16 %rs2, [test_fsub_param_1];
-; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
-; SM80-NEXT: cvt.f32.bf16 %f2, %rs1;
-; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
-; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f3;
-; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM80-NEXT: mov.b16 %rs2, 0xBF80;
+; SM80-NEXT: ld.param.b16 %rs3, [test_fsub_param_1];
+; SM80-NEXT: fma.rn.bf16 %rs4, %rs3, %rs2, %rs1;
+; SM80-NEXT: st.param.b16 [func_retval0], %rs4;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_fsub(
@@ -202,23 +196,14 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
;
; SM80-LABEL: test_faddx2(
; SM80: {
-; SM80-NEXT: .reg .b16 %rs<5>;
-; SM80-NEXT: .reg .b32 %r<4>;
-; SM80-NEXT: .reg .f32 %f<7>;
+; SM80-NEXT: .reg .b32 %r<5>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
-; SM80-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
-; SM80-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
-; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
-; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
-; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
-; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
-; SM80-NEXT: add.rn.f32 %f6, %f5, %f4;
-; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
-; SM80-NEXT: st.param.b32 [func_retval0], %r3;
+; SM80-NEXT: ld.param.b32 %r1, [test_faddx2_param_1];
+; SM80-NEXT: ld.param.b32 %r2, [test_faddx2_param_0];
+; SM80-NEXT: mov.b32 %r3, 1065369472;
+; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r3, %r1;
+; SM80-NEXT: st.param.b32 [func_retval0], %r4;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_faddx2(
@@ -303,23 +288,14 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
;
; SM80-LABEL: test_fsubx2(
; SM80: {
-; SM80-NEXT: .reg .b16 %rs<5>;
-; SM80-NEXT: .reg .b32 %r<4>;
-; SM80-NEXT: .reg .f32 %f<7>;
+; SM80-NEXT: .reg .b32 %r<5>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
; SM80-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
-; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
-; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
-; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
-; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
-; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4;
-; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
-; SM80-NEXT: st.param.b32 [func_retval0], %r3;
+; SM80-NEXT: mov.b32 %r3, -1082081408;
+; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r3, %r1;
+; SM80-NEXT: st.param.b32 [func_retval0], %r4;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_fsubx2(
@@ -404,23 +380,14 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
;
; SM80-LABEL: test_fmulx2(
; SM80: {
-; SM80-NEXT: .reg .b16 %rs<5>;
-; SM80-NEXT: .reg .b32 %r<4>;
-; SM80-NEXT: .reg .f32 %f<7>;
+; SM80-NEXT: .reg .b32 %r<5>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
-; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
-; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
-; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
-; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1;
-; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
-; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
-; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4;
-; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
-; SM80-NEXT: st.param.b32 [func_retval0], %r3;
+; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_1];
+; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_0];
+; SM80-NEXT: mov.b32 %r3, -2147450880;
+; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r1, %r3;
+; SM80-NEXT: st.param.b32 [func_retval0], %r4;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_fmulx2(
@@ -727,15 +694,13 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
;
; SM80-LABEL: test_fadd_imm_1(
; SM80: {
-; SM80-NEXT: .reg .b16 %rs<3>;
-; SM80-NEXT: .reg .f32 %f<3>;
+; SM80-NEXT: .reg .b16 %rs<4>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
-; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f2;
-; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM80-NEXT: mov.b16 %rs2, 0x3F80;
+; SM80-NEXT: fma.rn.bf16 %rs3, %rs1, %rs2, %rs2;
+; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_fadd_imm_1(
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index 03cdeb9683abae..e6d35bd5ba536b 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -22,19 +22,14 @@ define <2 x bfloat> @test_ret_const() #0 {
define <2 x bfloat> @test_fadd_imm_0(<2 x bfloat> %a) #0 {
; SM80-LABEL: test_fadd_imm_0(
; SM80: {
-; SM80-NEXT: .reg .b16 %rs<3>;
-; SM80-NEXT: .reg .b32 %r<3>;
-; SM80-NEXT: .reg .f32 %f<5>;
+; SM80-NEXT: .reg .b32 %r<5>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b32 %r1, [test_fadd_imm_0_param_0];
-; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1;
-; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM80-NEXT: cvt.f32.bf16 %f3, %rs2;
-; SM80-NEXT: add.rn.f32 %f4, %f3, 0f40000000;
-; SM80-NEXT: cvt.rn.bf16x2.f32 %r2, %f4, %f2;
-; SM80-NEXT: st.param.b32 [func_retval0], %r2;
+; SM80-NEXT: mov.b32 %r2, 1065369472;
+; SM80-NEXT: mov.b32 %r3, 1073758080;
+; SM80-NEXT: fma.rn.bf16x2 %r4, %r1, %r2, %r3;
+; SM80-NEXT: st.param.b32 [func_retval0], %r4;
; SM80-NEXT: ret;
;
; SM90-LABEL: test_fadd_imm_0(
@@ -54,15 +49,13 @@ define <2 x bfloat> @test_fadd_imm_0(<2 x bfloat> %a) #0 {
define bfloat @test_fadd_imm_1(bfloat %a) #0 {
; SM80-LABEL: test_fadd_imm_1(
; SM80: {
-; SM80-NEXT: .reg .b16 %rs<3>;
-; SM80-NEXT: .reg .f32 %f<3>;
+; SM80-NEXT: .reg .b16 %rs<4>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
-; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f2;
-; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM80-NEXT: mov.b16 %rs2, 0x3F80;
+; SM80-NEXT: fma.rn.bf16 %rs3, %rs1, %rs2, %rs2;
+; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-NEXT: ret;
;
; SM90-LABEL: test_fadd_imm_1(
@@ -82,23 +75,14 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-LABEL: test_fsubx2(
; SM80: {
-; SM80-NEXT: .reg .b16 %rs<5>;
-; SM80-NEXT: .reg .b32 %r<4>;
-; SM80-NEXT: .reg .f32 %f<7>;
+; SM80-NEXT: .reg .b32 %r<5>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
; SM80-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
-; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
-; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
-; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
-; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
-; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4;
-; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
-; SM80-NEXT: st.param.b32 [func_retval0], %r3;
+; SM80-NEXT: mov.b32 %r3, -1082081408;
+; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r3, %r1;
+; SM80-NEXT: st.param.b32 [func_retval0], %r4;
; SM80-NEXT: ret;
;
; SM90-LABEL: test_fsubx2(
@@ -118,23 +102,14 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-LABEL: test_fmulx2(
; SM80: {
-; SM80-NEXT: .reg .b16 %rs<5>;
-; SM80-NEXT: .reg .b32 %r<4>;
-; SM80-NEXT: .reg .f32 %f<7>;
+; SM80-NEXT: .reg .b32 %r<5>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
-; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
-; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
-; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
-; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1;
-; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
-; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
-; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4;
-; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
-; SM80-NEXT: st.param.b32 [func_retval0], %r3;
+; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_1];
+; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_0];
+; SM80-NEXT: mov.b32 %r3, -2147450880;
+; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r1, %r3;
+; SM80-NEXT: st.param.b32 [func_retval0], %r4;
; SM80-NEXT: ret;
;
; SM90-LABEL: test_fmulx2(
@@ -543,30 +518,16 @@ define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 {
define <2 x bfloat> @test_fabs_add(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-LABEL: test_fabs_add(
; SM80: {
-; SM80-NEXT: .reg .b16 %rs<7>;
-; SM80-NEXT: .reg .b32 %r<6>;
-; SM80-NEXT: .reg .f32 %f<11>;
+; SM80-NEXT: .reg .b32 %r<7>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b32 %r1, [test_fabs_add_param_1];
; SM80-NEXT: ld.param.b32 %r2, [test_fabs_add_param_0];
-; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT: add.rn.f32 %f2, %f1, %f1;
-; SM80-NEXT: cvt.f32.bf16 %f3, %rs2;
-; SM80-NEXT: add.rn.f32 %f4, %f3, %f3;
-; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f4, %f2;
-; SM80-NEXT: abs.bf16x2 %r4, %r3;
-; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r4;
-; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
-; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; SM80-NEXT: cvt.f32.bf16 %f6, %rs5;
-; SM80-NEXT: add.rn.f32 %f7, %f5, %f6;
-; SM80-NEXT: cvt.f32.bf16 %f8, %rs4;
-; SM80-NEXT: cvt.f32.bf16 %f9, %rs6;
-; SM80-NEXT: add.rn.f32 %f10, %f8, %f9;
-; SM80-NEXT: cvt.rn.bf16x2.f32 %r5, %f10, %f7;
-; SM80-NEXT: st.param.b32 [func_retval0], %r5;
+; SM80-NEXT: mov.b32 %r3, 1065369472;
+; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r3, %r2;
+; SM80-NEXT: abs.bf16x2 %r5, %r4;
+; SM80-NEXT: fma.rn.bf16x2 %r6, %r5, %r3, %r1;
+; SM80-NEXT: st.param.b32 [func_retval0], %r6;
; SM80-NEXT: ret;
;
; SM90-LABEL: test_fabs_add(
@@ -802,45 +763,18 @@ define <2 x bfloat> @test_round(<2 x bfloat> %a) #0 {
}
define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
-; SM80-LABEL: test_copysign(
-; SM80: {
-; SM80-NEXT: .reg .pred %p<3>;
-; SM80-NEXT: .reg .b16 %rs<15>;
-; SM80-NEXT: .reg .b32 %r<4>;
-; SM80-EMPTY:
-; SM80-NEXT: // %bb.0:
-; SM80-NEXT: ld.param.b32 %r1, [test_copysign_param_1];
-; SM80-NEXT: ld.param.b32 %r2, [test_copysign_param_0];
-; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; SM80-NEXT: abs.bf16 %rs3, %rs2;
-; SM80-NEXT: neg.bf16 %rs4, %rs3;
-; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; SM80-NEXT: shr.u16 %rs7, %rs6, 15;
-; SM80-NEXT: and.b16 %rs8, %rs7, 1;
-; SM80-NEXT: setp.eq.b16 %p1, %rs8, 1;
-; SM80-NEXT: selp.b16 %rs9, %rs4, %rs3, %p1;
-; SM80-NEXT: abs.bf16 %rs10, %rs1;
-; SM80-NEXT: neg.bf16 %rs11, %rs10;
-; SM80-NEXT: shr.u16 %rs12, %rs5, 15;
-; SM80-NEXT: and.b16 %rs13, %rs12, 1;
-; SM80-NEXT: setp.eq.b16 %p2, %rs13, 1;
-; SM80-NEXT: selp.b16 %rs14, %rs11, %rs10, %p2;
-; SM80-NEXT: mov.b32 %r3, {%rs14, %rs9};
-; SM80-NEXT: st.param.b32 [func_retval0], %r3;
-; SM80-NEXT: ret;
-;
-; SM90-LABEL: test_copysign(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<6>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b32 %r1, [test_copysign_param_0];
-; SM90-NEXT: ld.param.b32 %r2, [test_copysign_param_1];
-; SM90-NEXT: and.b32 %r3, %r2, -2147450880;
-; SM90-NEXT: and.b32 %r4, %r1, 2147450879;
-; SM90-NEXT: or.b32 %r5, %r4, %r3;
-; SM90-NEXT: st.param.b32 [func_retval0], %r5;
-; SM90-NEXT: ret;
+; CHECK-LABEL: test_copysign(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [test_copysign_param_1];
+; CHECK-NEXT: and.b32 %r3, %r2, -2147450880;
+; CHECK-NEXT: and.b32 %r4, %r1, 2147450879;
+; CHECK-NEXT: or.b32 %r5, %r4, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
%r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b)
ret <2 x bfloat> %r
}
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
index 48c94f275274bd..7dce894620e6bb 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -352,9 +352,7 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
; CHECK-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<9>;
-; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .f32 %f<6>;
+; CHECK-NEXT: .reg .b16 %rs<11>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
@@ -363,20 +361,11 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
; CHECK-NEXT: mov.b16 %rs5, 0x0000;
; CHECK-NEXT: max.bf16 %rs6, %rs4, %rs5;
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
-; CHECK-NEXT: shl.b32 %r2, %r1, 16;
-; CHECK-NEXT: mov.b32 %f1, %r2;
-; CHECK-NEXT: add.f32 %f2, %f1, 0f40E00000;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs7, %f2;
-; CHECK-NEXT: cvt.u32.u16 %r3, %rs6;
-; CHECK-NEXT: shl.b32 %r4, %r3, 16;
-; CHECK-NEXT: mov.b32 %f3, %r4;
-; CHECK-NEXT: cvt.u32.u16 %r5, %rs7;
-; CHECK-NEXT: shl.b32 %r6, %r5, 16;
-; CHECK-NEXT: mov.b32 %f4, %r6;
-; CHECK-NEXT: add.f32 %f5, %f3, %f4;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs8, %f5;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs8;
+; CHECK-NEXT: mov.b16 %rs7, 0x3F80;
+; CHECK-NEXT: mov.b16 %rs8, 0x40E0;
+; CHECK-NEXT: fma.rn.bf16 %rs9, %rs4, %rs7, %rs8;
+; CHECK-NEXT: fma.rn.bf16 %rs10, %rs6, %rs7, %rs9;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs10;
; CHECK-NEXT: ret;
;
; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma(
@@ -959,9 +948,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %
define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
; CHECK-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<7>;
-; CHECK-NEXT: .reg .b32 %r<20>;
-; CHECK-NEXT: .reg .f32 %f<11>;
+; CHECK-NEXT: .reg .b32 %r<11>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
@@ -970,34 +957,11 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-NEXT: mov.b32 %r5, 0;
; CHECK-NEXT: max.bf16x2 %r6, %r4, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-NEXT: cvt.u32.u16 %r7, %rs2;
-; CHECK-NEXT: shl.b32 %r8, %r7, 16;
-; CHECK-NEXT: mov.b32 %f1, %r8;
-; CHECK-NEXT: add.f32 %f2, %f1, 0f40E00000;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2;
-; CHECK-NEXT: cvt.u32.u16 %r9, %rs1;
-; CHECK-NEXT: shl.b32 %r10, %r9, 16;
-; CHECK-NEXT: mov.b32 %f3, %r10;
-; CHECK-NEXT: add.f32 %f4, %f3, 0f40E00000;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4;
-; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r6;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs5;
-; CHECK-NEXT: shl.b32 %r12, %r11, 16;
-; CHECK-NEXT: mov.b32 %f5, %r12;
-; CHECK-NEXT: cvt.u32.u16 %r13, %rs4;
-; CHECK-NEXT: shl.b32 %r14, %r13, 16;
-; CHECK-NEXT: mov.b32 %f6, %r14;
-; CHECK-NEXT: add.f32 %f7, %f5, %f6;
-; CHECK-NEXT: cvt.u32.u16 %r15, %rs6;
-; CHECK-NEXT: shl.b32 %r16, %r15, 16;
-; CHECK-NEXT: mov.b32 %f8, %r16;
-; CHECK-NEXT: cvt.u32.u16 %r17, %rs3;
-; CHECK-NEXT: shl.b32 %r18, %r17, 16;
-; CHECK-NEXT: mov.b32 %f9, %r18;
-; CHECK-NEXT: add.f32 %f10, %f8, %f9;
-; CHECK-NEXT: cvt.rn.bf16x2.f32 %r19, %f10, %f7;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r19;
+; CHECK-NEXT: mov.b32 %r7, 1065369472;
+; CHECK-NEXT: mov.b32 %r8, 1088438496;
+; CHECK-NEXT: fma.rn.bf16x2 %r9, %r4, %r7, %r8;
+; CHECK-NEXT: fma.rn.bf16x2 %r10, %r6, %r7, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r10;
; CHECK-NEXT: ret;
;
; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
index 561f2b0cc06730..eb51d7db81372d 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
@@ -221,26 +221,18 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
; CHECK-LABEL: fma_bf16_no_nans_multiple_uses_of_fma(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<7>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b16 %rs<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
-; CHECK-NEXT: shl.b32 %r2, %r1, 16;
-; CHECK-NEXT: mov.b32 %f1, %r2;
-; CHECK-NEXT: add.f32 %f2, %f1, 0f40E00000;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs5, %f2;
-; CHECK-NEXT: cvt.u32.u16 %r3, %rs5;
-; CHECK-NEXT: shl.b32 %r4, %r3, 16;
-; CHECK-NEXT: mov.b32 %f3, %r4;
-; CHECK-NEXT: add.f32 %f4, %f3, %f1;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs6, %f4;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs6;
+; CHECK-NEXT: mov.b16 %rs5, 0x3F80;
+; CHECK-NEXT: mov.b16 %rs6, 0x40E0;
+; CHECK-NEXT: fma.rn.bf16 %rs7, %rs4, %rs5, %rs6;
+; CHECK-NEXT: fma.rn.bf16 %rs8, %rs7, %rs5, %rs4;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs8;
; CHECK-NEXT: ret;
;
; CHECK-FTZ-LABEL: fma_bf16_no_nans_multiple_uses_of_fma(
@@ -642,36 +634,18 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b
define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
; CHECK-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b32 %r<14>;
-; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2];
; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_1];
; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_0];
; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-NEXT: cvt.u32.u16 %r5, %rs2;
-; CHECK-NEXT: shl.b32 %r6, %r5, 16;
-; CHECK-NEXT: mov.b32 %f1, %r6;
-; CHECK-NEXT: add.f32 %f2, %f1, 0f40E00000;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2;
-; CHECK-NEXT: cvt.u32.u16 %r7, %rs1;
-; CHECK-NEXT: shl.b32 %r8, %r7, 16;
-; CHECK-NEXT: mov.b32 %f3, %r8;
-; CHECK-NEXT: add.f32 %f4, %f3, 0f40E00000;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4;
-; CHECK-NEXT: cvt.u32.u16 %r9, %rs4;
-; CHECK-NEXT: shl.b32 %r10, %r9, 16;
-; CHECK-NEXT: mov.b32 %f5, %r10;
-; CHECK-NEXT: add.f32 %f6, %f5, %f3;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs3;
-; CHECK-NEXT: shl.b32 %r12, %r11, 16;
-; CHECK-NEXT: mov.b32 %f7, %r12;
-; CHECK-NEXT: add.f32 %f8, %f7, %f1;
-; CHECK-NEXT: cvt.rn.bf16x2.f32 %r13, %f8, %f6;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-NEXT: mov.b32 %r5, 1065369472;
+; CHECK-NEXT: mov.b32 %r6, 1088438496;
+; CHECK-NEXT: fma.rn.bf16x2 %r7, %r4, %r5, %r6;
+; CHECK-NEXT: fma.rn.bf16x2 %r8, %r7, %r5, %r4;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-NEXT: ret;
;
; CHECK-FTZ-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
index b20ca24dd91a0c..a3545f51714259 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
@@ -233,9 +233,7 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) {
define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) {
; CHECK-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<9>;
-; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .f32 %f<6>;
+; CHECK-NEXT: .reg .b16 %rs<11>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
@@ -244,20 +242,11 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
; CHECK-NEXT: mov.b16 %rs5, 0x0000;
; CHECK-NEXT: max.bf16 %rs6, %rs4, %rs5;
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
-; CHECK-NEXT: shl.b32 %r2, %r1, 16;
-; CHECK-NEXT: mov.b32 %f1, %r2;
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f40E00000;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs7, %f2;
-; CHECK-NEXT: cvt.u32.u16 %r3, %rs6;
-; CHECK-NEXT: shl.b32 %r4, %r3, 16;
-; CHECK-NEXT: mov.b32 %f3, %r4;
-; CHECK-NEXT: cvt.u32.u16 %r5, %rs7;
-; CHECK-NEXT: shl.b32 %r6, %r5, 16;
-; CHECK-NEXT: mov.b32 %f4, %r6;
-; CHECK-NEXT: add.rn.f32 %f5, %f3, %f4;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs8, %f5;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs8;
+; CHECK-NEXT: mov.b16 %rs7, 0x3F80;
+; CHECK-NEXT: mov.b16 %rs8, 0x40E0;
+; CHECK-NEXT: fma.rn.bf16 %rs9, %rs4, %rs7, %rs8;
+; CHECK-NEXT: fma.rn.bf16 %rs10, %rs6, %rs7, %rs9;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs10;
; CHECK-NEXT: ret;
;
; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma(
@@ -694,9 +683,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %
define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
; CHECK-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<7>;
-; CHECK-NEXT: .reg .b32 %r<20>;
-; CHECK-NEXT: .reg .f32 %f<11>;
+; CHECK-NEXT: .reg .b32 %r<11>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
@@ -705,34 +692,11 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-NEXT: mov.b32 %r5, 0;
; CHECK-NEXT: max.bf16x2 %r6, %r4, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-NEXT: cvt.u32.u16 %r7, %rs2;
-; CHECK-NEXT: shl.b32 %r8, %r7, 16;
-; CHECK-NEXT: mov.b32 %f1, %r8;
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f40E00000;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2;
-; CHECK-NEXT: cvt.u32.u16 %r9, %rs1;
-; CHECK-NEXT: shl.b32 %r10, %r9, 16;
-; CHECK-NEXT: mov.b32 %f3, %r10;
-; CHECK-NEXT: add.rn.f32 %f4, %f3, 0f40E00000;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4;
-; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r6;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs5;
-; CHECK-NEXT: shl.b32 %r12, %r11, 16;
-; CHECK-NEXT: mov.b32 %f5, %r12;
-; CHECK-NEXT: cvt.u32.u16 %r13, %rs4;
-; CHECK-NEXT: shl.b32 %r14, %r13, 16;
-; CHECK-NEXT: mov.b32 %f6, %r14;
-; CHECK-NEXT: add.rn.f32 %f7, %f5, %f6;
-; CHECK-NEXT: cvt.u32.u16 %r15, %rs6;
-; CHECK-NEXT: shl.b32 %r16, %r15, 16;
-; CHECK-NEXT: mov.b32 %f8, %r16;
-; CHECK-NEXT: cvt.u32.u16 %r17, %rs3;
-; CHECK-NEXT: shl.b32 %r18, %r17, 16;
-; CHECK-NEXT: mov.b32 %f9, %r18;
-; CHECK-NEXT: add.rn.f32 %f10, %f8, %f9;
-; CHECK-NEXT: cvt.rn.bf16x2.f32 %r19, %f10, %f7;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r19;
+; CHECK-NEXT: mov.b32 %r7, 1065369472;
+; CHECK-NEXT: mov.b32 %r8, 1088438496;
+; CHECK-NEXT: fma.rn.bf16x2 %r9, %r4, %r7, %r8;
+; CHECK-NEXT: fma.rn.bf16x2 %r10, %r6, %r7, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r10;
; CHECK-NEXT: ret;
;
; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
@@ -1204,26 +1168,18 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) {
define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) {
; CHECK-LABEL: fma_bf16_no_nans_multiple_uses_of_fma(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<7>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b16 %rs<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
-; CHECK-NEXT: shl.b32 %r2, %r1, 16;
-; CHECK-NEXT: mov.b32 %f1, %r2;
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f40E00000;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs5, %f2;
-; CHECK-NEXT: cvt.u32.u16 %r3, %rs5;
-; CHECK-NEXT: shl.b32 %r4, %r3, 16;
-; CHECK-NEXT: mov.b32 %f3, %r4;
-; CHECK-NEXT: add.rn.f32 %f4, %f3, %f1;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs6, %f4;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs6;
+; CHECK-NEXT: mov.b16 %rs5, 0x3F80;
+; CHECK-NEXT: mov.b16 %rs6, 0x40E0;
+; CHECK-NEXT: fma.rn.bf16 %rs7, %rs4, %rs5, %rs6;
+; CHECK-NEXT: fma.rn.bf16 %rs8, %rs7, %rs5, %rs4;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs8;
; CHECK-NEXT: ret;
;
; CHECK-FTZ-LABEL: fma_bf16_no_nans_multiple_uses_of_fma(
@@ -1629,36 +1585,18 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b
define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
; CHECK-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b32 %r<14>;
-; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2];
; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_1];
; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_0];
; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-NEXT: cvt.u32.u16 %r5, %rs2;
-; CHECK-NEXT: shl.b32 %r6, %r5, 16;
-; CHECK-NEXT: mov.b32 %f1, %r6;
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f40E00000;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2;
-; CHECK-NEXT: cvt.u32.u16 %r7, %rs1;
-; CHECK-NEXT: shl.b32 %r8, %r7, 16;
-; CHECK-NEXT: mov.b32 %f3, %r8;
-; CHECK-NEXT: add.rn.f32 %f4, %f3, 0f40E00000;
-; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4;
-; CHECK-NEXT: cvt.u32.u16 %r9, %rs4;
-; CHECK-NEXT: shl.b32 %r10, %r9, 16;
-; CHECK-NEXT: mov.b32 %f5, %r10;
-; CHECK-NEXT: add.rn.f32 %f6, %f5, %f3;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs3;
-; CHECK-NEXT: shl.b32 %r12, %r11, 16;
-; CHECK-NEXT: mov.b32 %f7, %r12;
-; CHECK-NEXT: add.rn.f32 %f8, %f7, %f1;
-; CHECK-NEXT: cvt.rn.bf16x2.f32 %r13, %f8, %f6;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-NEXT: mov.b32 %r5, 1065369472;
+; CHECK-NEXT: mov.b32 %r6, 1088438496;
+; CHECK-NEXT: fma.rn.bf16x2 %r7, %r4, %r5, %r6;
+; CHECK-NEXT: fma.rn.bf16x2 %r8, %r7, %r5, %r4;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-NEXT: ret;
;
; CHECK-FTZ-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(