[llvm] [NVPTX] Consistently check fast-math flags when lowering div (PR #136890)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 23 07:46:44 PDT 2025
https://github.com/AlexMaclean created https://github.com/llvm/llvm-project/pull/136890
When choosing the `div.*` variant during ISel, check the instruction-level fast-math flags.
>From 3d1a67c75e34820f72f846fc5567ad517c5cd0db Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Tue, 22 Apr 2025 19:41:36 +0000
Subject: [PATCH] [NVPTX] Consistently check fast-math flags when lowering div
---
llvm/lib/Target/NVPTX/NVPTX.h | 9 +-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 5 +-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 2 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 36 ++++--
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 9 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 125 ++++++++++----------
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 12 +-
llvm/test/CodeGen/NVPTX/div.ll | 74 ++++++++++++
8 files changed, 179 insertions(+), 93 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 98e77ca80b8d5..43bd23a811849 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -255,7 +255,14 @@ enum PrmtMode {
RC16,
};
}
-}
+
+enum class DivPrecisionLevel : unsigned {
+ Approx = 0,
+ Full = 1,
+ IEEE754 = 2,
+};
+
+} // namespace NVPTX
void initializeNVPTXDAGToDAGISelLegacyPass(PassRegistry &);
} // namespace llvm
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 486c7c815435a..6fbea39357a0c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -65,8 +65,9 @@ bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
return SelectionDAGISel::runOnMachineFunction(MF);
}
-int NVPTXDAGToDAGISel::getDivF32Level() const {
- return Subtarget->getTargetLowering()->getDivF32Level();
+NVPTX::DivPrecisionLevel
+NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
+ return Subtarget->getTargetLowering()->getDivF32Level(*MF, N);
}
bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 23cbd458571a0..9c832bdcd06c6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -43,7 +43,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
// If true, generate mul.wide from sext and mul
bool doMulWide;
- int getDivF32Level() const;
+ NVPTX::DivPrecisionLevel getDivF32Level(const SDNode *N) const;
bool usePrecSqrtF32() const;
bool useF32FTZ() const;
bool allowFMA() const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 49f4f30096f00..79ce0d64f4dbd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -85,11 +85,16 @@ static cl::opt<unsigned> FMAContractLevelOpt(
" 1: do it 2: do it aggressively"),
cl::init(2));
-static cl::opt<int> UsePrecDivF32(
+static cl::opt<NVPTX::DivPrecisionLevel> UsePrecDivF32(
"nvptx-prec-divf32", cl::Hidden,
cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
" IEEE Compliant F32 div.rnd if available."),
- cl::init(2));
+ cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0",
+ "Use div.approx"),
+ clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),
+ clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2",
+ "Use IEEE Compliant F32 div.rnd if available")),
+ cl::init(NVPTX::DivPrecisionLevel::IEEE754));
static cl::opt<bool> UsePrecSqrtF32(
"nvptx-prec-sqrtf32", cl::Hidden,
@@ -109,17 +114,24 @@ static cl::opt<bool> ForceMinByValParamAlign(
" params of device functions."),
cl::init(false));
-int NVPTXTargetLowering::getDivF32Level() const {
- if (UsePrecDivF32.getNumOccurrences() > 0) {
- // If nvptx-prec-div32=N is used on the command-line, always honor it
+NVPTX::DivPrecisionLevel
+NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
+ const SDNode *N) const {
+ // If nvptx-prec-divf32=N is used on the command-line, always honor it
+ if (UsePrecDivF32.getNumOccurrences() > 0)
return UsePrecDivF32;
- } else {
- // Otherwise, use div.approx if fast math is enabled
- if (getTargetMachine().Options.UnsafeFPMath)
- return 0;
- else
- return 2;
+
+ // Otherwise, use div.approx if fast math is enabled
+ if (allowUnsafeFPMath(MF))
+ return NVPTX::DivPrecisionLevel::Approx;
+
+ if (N) {
+ const SDNodeFlags Flags = N->getFlags();
+ if (Flags.hasApproximateFuncs())
+ return NVPTX::DivPrecisionLevel::Approx;
}
+
+ return NVPTX::DivPrecisionLevel::IEEE754;
}
bool NVPTXTargetLowering::usePrecSqrtF32() const {
@@ -4947,7 +4959,7 @@ bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
return allowUnsafeFPMath(MF);
}
-bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
+bool NVPTXTargetLowering::allowUnsafeFPMath(const MachineFunction &MF) const {
// Honor TargetOptions flags that explicitly say unsafe math is okay.
if (MF.getTarget().Options.UnsafeFPMath)
return true;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 7a8bf3bf33a94..2a0b16078dee0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -214,11 +214,8 @@ class NVPTXTargetLowering : public TargetLowering {
// Get the degree of precision we want from 32-bit floating point division
// operations.
- //
- // 0 - Use ptx div.approx
- // 1 - Use ptx.div.full (approximate, but less so than div.approx)
- // 2 - Use IEEE-compliant div instructions, if available.
- int getDivF32Level() const;
+ NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF,
+ const SDNode *N) const;
// Get whether we should use a precise or approximate 32-bit floating point
// sqrt instruction.
@@ -235,7 +232,7 @@ class NVPTXTargetLowering : public TargetLowering {
unsigned combineRepeatedFPDivisors() const override { return 2; }
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const;
- bool allowUnsafeFPMath(MachineFunction &MF) const;
+ bool allowUnsafeFPMath(const MachineFunction &MF) const;
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT) const override {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index ee6380a8a89c4..732a88822f853 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -150,9 +150,6 @@ def doRsqrtOpt : Predicate<"doRsqrtOpt()">;
def doMulWide : Predicate<"doMulWide">;
-def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
-def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
-
def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
@@ -1108,26 +1105,19 @@ def INEG64 :
//-----------------------------------
// Constant 1.0f
-def FloatConst1 : PatLeaf<(fpimm), [{
- return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
- N->getValueAPF().convertToFloat() == 1.0f;
+def f32imm_1 : FPImmLeaf<f32, [{
+ return &Imm.getSemantics() == &llvm::APFloat::IEEEsingle() &&
+ Imm.convertToFloat() == 1.0f;
}]>;
// Constant 1.0 (double)
-def DoubleConst1 : PatLeaf<(fpimm), [{
- return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
- N->getValueAPF().convertToDouble() == 1.0;
+def f64imm_1 : FPImmLeaf<f64, [{
+ return &Imm.getSemantics() == &llvm::APFloat::IEEEdouble() &&
+ Imm.convertToDouble() == 1.0;
}]>;
// Constant -1.0 (double)
-def DoubleConstNeg1 : PatLeaf<(fpimm), [{
- return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
- N->getValueAPF().convertToDouble() == -1.0;
-}]>;
-
-
-// Constant -X -> X (double)
-def NegDoubleConst : SDNodeXForm<fpimm, [{
- return CurDAG->getTargetConstantFP(-(N->getValueAPF()),
- SDLoc(N), MVT::f64);
+def f64imm_neg1 : FPImmLeaf<f64, [{
+ return &Imm.getSemantics() == &llvm::APFloat::IEEEdouble() &&
+ Imm.convertToDouble() == -1.0;
}]>;
defm FADD : F3_fma_component<"add", fadd>;
@@ -1178,11 +1168,11 @@ def BFNEG16x2 : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>;
//
// F64 division
//
-def FDIV641r :
+def FRCP64r :
NVPTXInst<(outs Float64Regs:$dst),
- (ins f64imm:$a, Float64Regs:$b),
+ (ins Float64Regs:$b),
"rcp.rn.f64 \t$dst, $b;",
- [(set f64:$dst, (fdiv DoubleConst1:$a, f64:$b))]>;
+ [(set f64:$dst, (fdiv f64imm_1, f64:$b))]>;
def FDIV64rr :
NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b),
@@ -1196,24 +1186,31 @@ def FDIV64ri :
// fdiv will be converted to rcp
// fneg (fdiv 1.0, X) => fneg (rcp.rn X)
-def : Pat<(fdiv DoubleConstNeg1:$a, f64:$b),
- (FNEGf64 (FDIV641r (NegDoubleConst node:$a), $b))>;
+def : Pat<(fdiv f64imm_neg1, f64:$b),
+ (FNEGf64 (FRCP64r $b))>;
//
// F32 Approximate reciprocal
//
-def FDIV321r_ftz :
+
+def fdiv_approx : PatFrag<(ops node:$a, node:$b),
+ (fdiv node:$a, node:$b), [{
+ return getDivF32Level(N) == NVPTX::DivPrecisionLevel::Approx;
+}]>;
+
+
+def FRCP32_approx_r_ftz :
NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
+ (ins Float32Regs:$b),
"rcp.approx.ftz.f32 \t$dst, $b;",
- [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV321r :
+ [(set f32:$dst, (fdiv_approx f32imm_1, f32:$b))]>,
+ Requires<[doF32FTZ]>;
+def FRCP32_approx_r :
NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
+ (ins Float32Regs:$b),
"rcp.approx.f32 \t$dst, $b;",
- [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
+ [(set f32:$dst, (fdiv_approx f32imm_1, f32:$b))]>;
+
//
// F32 Approximate division
//
@@ -1221,43 +1218,43 @@ def FDIV32approxrr_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
"div.approx.ftz.f32 \t$dst, $a, $b;",
- [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+ [(set f32:$dst, (fdiv_approx f32:$a, f32:$b))]>,
+ Requires<[doF32FTZ]>;
def FDIV32approxri_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.approx.ftz.f32 \t$dst, $a, $b;",
- [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+ [(set f32:$dst, (fdiv_approx f32:$a, fpimm:$b))]>,
+ Requires<[doF32FTZ]>;
def FDIV32approxrr :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
"div.approx.f32 \t$dst, $a, $b;",
- [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
+ [(set f32:$dst, (fdiv_approx f32:$a, f32:$b))]>;
def FDIV32approxri :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.approx.f32 \t$dst, $a, $b;",
- [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
+ [(set f32:$dst, (fdiv_approx f32:$a, fpimm:$b))]>;
//
// F32 Semi-accurate reciprocal
//
// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
//
-def FDIV321r_approx_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.ftz.f32 \t$dst, $b;",
- [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV321r_approx :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.f32 \t$dst, $b;",
- [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
- Requires<[do_DIVF32_FULL]>;
+
+def fdiv_full : PatFrag<(ops node:$a, node:$b),
+ (fdiv node:$a, node:$b), [{
+ return getDivF32Level(N) == NVPTX::DivPrecisionLevel::Full;
+}]>;
+
+
+def : Pat<(fdiv_full f32imm_1, f32:$b),
+ (FRCP32_approx_r_ftz $b)>,
+ Requires<[doF32FTZ]>;
+
+def : Pat<(fdiv_full f32imm_1, f32:$b),
+ (FRCP32_approx_r $b)>;
+
//
// F32 Semi-accurate division
//
@@ -1265,40 +1262,38 @@ def FDIV32rr_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
"div.full.ftz.f32 \t$dst, $a, $b;",
- [(set f32:$dst, (fdiv Float32Regs:$a, f32:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
+ [(set f32:$dst, (fdiv_full f32:$a, f32:$b))]>,
+ Requires<[doF32FTZ]>;
def FDIV32ri_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.full.ftz.f32 \t$dst, $a, $b;",
- [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
+ [(set f32:$dst, (fdiv_full f32:$a, fpimm:$b))]>,
+ Requires<[doF32FTZ]>;
def FDIV32rr :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
"div.full.f32 \t$dst, $a, $b;",
- [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
- Requires<[do_DIVF32_FULL]>;
+ [(set f32:$dst, (fdiv_full f32:$a, f32:$b))]>;
def FDIV32ri :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.full.f32 \t$dst, $a, $b;",
- [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_FULL]>;
+ [(set f32:$dst, (fdiv_full f32:$a, fpimm:$b))]>;
//
// F32 Accurate reciprocal
//
def FDIV321r_prec_ftz :
NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
+ (ins Float32Regs:$b),
"rcp.rn.ftz.f32 \t$dst, $b;",
- [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
+ [(set f32:$dst, (fdiv f32imm_1, f32:$b))]>,
Requires<[doF32FTZ]>;
-def FDIV321r_prec :
+def FRCP32r_prec :
NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
+ (ins Float32Regs:$b),
"rcp.rn.f32 \t$dst, $b;",
- [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>;
+ [(set f32:$dst, (fdiv f32imm_1, f32:$b))]>;
//
// F32 Accurate division
//
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 4ba3e6f06bb5f..562d5951bab0e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1606,24 +1606,24 @@ def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;",
F64RT, F64RT, int_nvvm_rsqrt_approx_d>;
// 1.0f / sqrt_approx -> rsqrt_approx
-def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_approx_f f32:$a)),
+def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_f f32:$a)),
(INT_NVVM_RSQRT_APPROX_F $a)>,
Requires<[doRsqrtOpt]>;
-def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_approx_ftz_f f32:$a)),
+def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_ftz_f f32:$a)),
(INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
Requires<[doRsqrtOpt]>;
// same for int_nvvm_sqrt_f when non-precision sqrt is requested
-def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f f32:$a)),
+def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_f f32:$a)),
(INT_NVVM_RSQRT_APPROX_F $a)>,
Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
-def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f f32:$a)),
+def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_f f32:$a)),
(INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
-def: Pat<(fdiv FloatConst1, (fsqrt f32:$a)),
+def: Pat<(fdiv f32imm_1, (fsqrt f32:$a)),
(INT_NVVM_RSQRT_APPROX_F $a)>,
Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
-def: Pat<(fdiv FloatConst1, (fsqrt f32:$a)),
+def: Pat<(fdiv f32imm_1, (fsqrt f32:$a)),
(INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
//
diff --git a/llvm/test/CodeGen/NVPTX/div.ll b/llvm/test/CodeGen/NVPTX/div.ll
index 4f9d58758ca9e..2382ecdc72dad 100644
--- a/llvm/test/CodeGen/NVPTX/div.ll
+++ b/llvm/test/CodeGen/NVPTX/div.ll
@@ -24,3 +24,77 @@ define float @div_full(float %a, float %b) {
%4 = call float @llvm.nvvm.div.full.ftz(float %3, float 4.0)
ret float %4
}
+
+; CHECK-LABEL: div_fast_rr(
+; CHECK: div.approx.f32
+define float @div_fast_rr(float %a, float %b) {
+ %t1 = fdiv afn float %a, %b
+ ret float %t1
+}
+
+; CHECK-LABEL: div_fast_rr_ftz(
+; CHECK: div.approx.ftz.f32
+define float @div_fast_rr_ftz(float %a, float %b) #0 {
+ %t1 = fdiv afn float %a, %b
+ ret float %t1
+}
+
+; CHECK-LABEL: div_fast_ri(
+; CHECK: mul.rn.f32
+define float @div_fast_ri(float %a, float %b) {
+ %t1 = fdiv afn float %a, 2.0
+ ret float %t1
+}
+
+; CHECK-LABEL: div_fast_ri_ftz(
+; CHECK: mul.rn.ftz.f32
+define float @div_fast_ri_ftz(float %a, float %b) #0 {
+ %t1 = fdiv afn float %a, 2.0
+ ret float %t1
+}
+
+; CHECK-LABEL: rcp_fast(
+; CHECK: rcp.approx.f32
+define float @rcp_fast(float %a) {
+ %t1 = fdiv afn float 1.0, %a
+ ret float %t1
+}
+
+; CHECK-LABEL: rcp_fast_ftz(
+; CHECK: rcp.approx.ftz.f32
+define float @rcp_fast_ftz(float %a) #0 {
+ %t1 = fdiv afn float 1.0, %a
+ ret float %t1
+}
+
+; CHECK-LABEL: div_fast_vec(
+; CHECK: div.approx.f32
+; CHECK: div.approx.f32
+define float @div_fast_vec(float %a, float %b, float %c, float %d) {
+ %ins_a0 = insertelement <2 x float> undef, float %a, i32 0
+ %ins_a1 = insertelement <2 x float> %ins_a0, float %b, i32 1
+ %ins_b0 = insertelement <2 x float> undef, float %c, i32 0
+ %ins_b1 = insertelement <2 x float> %ins_b0, float %d, i32 1
+ %fdiv = fdiv fast <2 x float> %ins_a1, %ins_b1
+ %ext0 = extractelement <2 x float> %fdiv, i32 0
+ %ext1 = extractelement <2 x float> %fdiv, i32 1
+ %fadd = fadd float %ext0, %ext1
+ ret float %fadd
+}
+
+; CHECK-LABEL: div_fast_vec_ftz(
+; CHECK: div.approx.ftz.f32
+; CHECK: div.approx.ftz.f32
+define float @div_fast_vec_ftz(float %a, float %b, float %c, float %d) #0 {
+ %ins_a0 = insertelement <2 x float> undef, float %a, i32 0
+ %ins_a1 = insertelement <2 x float> %ins_a0, float %b, i32 1
+ %ins_b0 = insertelement <2 x float> undef, float %c, i32 0
+ %ins_b1 = insertelement <2 x float> %ins_b0, float %d, i32 1
+ %fdiv = fdiv fast <2 x float> %ins_a1, %ins_b1
+ %ext0 = extractelement <2 x float> %fdiv, i32 0
+ %ext1 = extractelement <2 x float> %fdiv, i32 1
+ %fadd = fadd float %ext0, %ext1
+ ret float %fadd
+}
+
+attributes #0 = { "denormal-fp-math-f32" = "preserve-sign" }
\ No newline at end of file
More information about the llvm-commits mailing list