[llvm] 7299250 - DAG: Use fast variants of fast math libcalls (#147481)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 12 18:41:48 PDT 2025
Author: Matt Arsenault
Date: 2025-07-13T10:41:45+09:00
New Revision: 7299250c030546b6811f5a1e0c4fd86f4380192e
URL: https://github.com/llvm/llvm-project/commit/7299250c030546b6811f5a1e0c4fd86f4380192e
DIFF: https://github.com/llvm/llvm-project/commit/7299250c030546b6811f5a1e0c4fd86f4380192e.diff
LOG: DAG: Use fast variants of fast math libcalls (#147481)
Hexagon currently has an untested global flag to control fast-math
variants of libcalls. Add the fast variants as explicit libcall
options so this can be a flag-based lowering decision, and implement
it. I have no idea what fast-math flags the Hexagon case requires,
so I picked the maximal potentially relevant set of flags, although
this is probably refinable per call. Looking in compiler-rt, I'm not
sure whether the fast variants are anything more than aliases.
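For reference, the flag set the new lowering checks for is
nnan + ninf + nsz + afn (see canUseFastMathLibcall in the diff below).
A minimal IR sketch, mirroring the added test, that should select the
fast double-precision add on Hexagon:

  define double @example(double %x, double %y) {
    ; All four flags are required; dropping any one of them falls
    ; back to the standard __hexagon_adddf3 libcall.
    %r = fadd nnan ninf nsz afn double %x, %y
    ret double %r
  }

Compiled with llc -mtriple=hexagon -mcpu=hexagonv5, this lowers to a
call to __hexagon_fast_adddf3, as the new test checks.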
Added:
llvm/test/CodeGen/Hexagon/fast-math-libcalls.ll
Modified:
llvm/include/llvm/IR/RuntimeLibcalls.td
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
llvm/lib/IR/RuntimeLibcalls.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index c236e698759cc..57f5d9fd6d3a6 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -62,13 +62,24 @@ foreach IntTy = ["I32", "I64", "I128"] in {
foreach FPTy = ["F32", "F64", "F80", "F128", "PPCF128"] in {
def ADD_#FPTy : RuntimeLibcall;
+ def FAST_ADD_#FPTy : RuntimeLibcall;
+
def SUB_#FPTy : RuntimeLibcall;
+ def FAST_SUB_#FPTy : RuntimeLibcall;
+
def MUL_#FPTy : RuntimeLibcall;
+ def FAST_MUL_#FPTy : RuntimeLibcall;
+
def DIV_#FPTy : RuntimeLibcall;
+ def FAST_DIV_#FPTy : RuntimeLibcall;
+
def REM_#FPTy : RuntimeLibcall;
def FMA_#FPTy : RuntimeLibcall;
def POWI_#FPTy : RuntimeLibcall;
+
def SQRT_#FPTy : RuntimeLibcall;
+ def FAST_SQRT_#FPTy : RuntimeLibcall;
+
def CBRT_#FPTy : RuntimeLibcall;
def LOG_#FPTy : RuntimeLibcall;
def LOG_FINITE_#FPTy : RuntimeLibcall;
@@ -1470,27 +1481,26 @@ def __hexagon_moddi3 : RuntimeLibcallImpl<SREM_I64>;
def __hexagon_umodsi3 : RuntimeLibcallImpl<UREM_I32>;
def __hexagon_umoddi3 : RuntimeLibcallImpl<UREM_I64>;
-// FIXME: "Fast" versions should be treated as a separate RTLIB::FAST_* function
def __hexagon_adddf3 : RuntimeLibcallImpl<ADD_F64>;
-def __hexagon_fast_adddf3 : RuntimeLibcallImpl<ADD_F64>;
+def __hexagon_fast_adddf3 : RuntimeLibcallImpl<FAST_ADD_F64>;
def __hexagon_subdf3 : RuntimeLibcallImpl<SUB_F64>;
-def __hexagon_fast_subdf3 : RuntimeLibcallImpl<SUB_F64>;
+def __hexagon_fast_subdf3 : RuntimeLibcallImpl<FAST_SUB_F64>;
def __hexagon_muldf3 : RuntimeLibcallImpl<MUL_F64>;
-def __hexagon_fast_muldf3 : RuntimeLibcallImpl<MUL_F64>;
+def __hexagon_fast_muldf3 : RuntimeLibcallImpl<FAST_MUL_F64>;
def __hexagon_divdf3 : RuntimeLibcallImpl<DIV_F64>;
-def __hexagon_fast_divdf3 : RuntimeLibcallImpl<DIV_F64>;
+def __hexagon_fast_divdf3 : RuntimeLibcallImpl<FAST_DIV_F64>;
def __hexagon_divsf3 : RuntimeLibcallImpl<DIV_F32>;
-def __hexagon_fast_divsf3 : RuntimeLibcallImpl<DIV_F32>;
+def __hexagon_fast_divsf3 : RuntimeLibcallImpl<FAST_DIV_F32>;
def __hexagon_sqrtf : RuntimeLibcallImpl<SQRT_F32>;
-def __hexagon_fast2_sqrtf : RuntimeLibcallImpl<SQRT_F32>;
+def __hexagon_fast2_sqrtf : RuntimeLibcallImpl<FAST_SQRT_F32>;
// This is the only fast library function for sqrtd.
-def __hexagon_fast2_sqrtdf2 : RuntimeLibcallImpl<SQRT_F64>;
+def __hexagon_fast2_sqrtdf2 : RuntimeLibcallImpl<FAST_SQRT_F64>;
def __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
: RuntimeLibcallImpl<HEXAGON_MEMCPY_LIKELY_ALIGNED_MIN32BYTES_MULT8BYTES>;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 528136a55f14a..7266940c94bf1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -140,12 +140,19 @@ class SelectionDAGLegalize {
RTLIB::Libcall Call_F128,
RTLIB::Libcall Call_PPCF128,
SmallVectorImpl<SDValue> &Results);
- SDValue ExpandIntLibCall(SDNode *Node, bool isSigned,
- RTLIB::Libcall Call_I8,
- RTLIB::Libcall Call_I16,
- RTLIB::Libcall Call_I32,
- RTLIB::Libcall Call_I64,
- RTLIB::Libcall Call_I128);
+
+ void
+ ExpandFastFPLibCall(SDNode *Node, bool IsFast,
+ std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F32,
+ std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F64,
+ std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F80,
+ std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F128,
+ std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_PPCF128,
+ SmallVectorImpl<SDValue> &Results);
+
+ SDValue ExpandIntLibCall(SDNode *Node, bool isSigned, RTLIB::Libcall Call_I8,
+ RTLIB::Libcall Call_I16, RTLIB::Libcall Call_I32,
+ RTLIB::Libcall Call_I64, RTLIB::Libcall Call_I128);
void ExpandArgFPLibCall(SDNode *Node,
RTLIB::Libcall Call_F32, RTLIB::Libcall Call_F64,
RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128,
@@ -2228,6 +2235,37 @@ void SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
ExpandFPLibCall(Node, LC, Results);
}
+void SelectionDAGLegalize::ExpandFastFPLibCall(
+ SDNode *Node, bool IsFast,
+ std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F32,
+ std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F64,
+ std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F80,
+ std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F128,
+ std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_PPCF128,
+ SmallVectorImpl<SDValue> &Results) {
+
+ EVT VT = Node->getSimpleValueType(0);
+
+ RTLIB::Libcall LC;
+
+ // FIXME: Probably should define fast to respect nan/inf and only be
+ // approximate functions.
+
+ if (IsFast) {
+ LC = RTLIB::getFPLibCall(VT, Call_F32.first, Call_F64.first, Call_F80.first,
+ Call_F128.first, Call_PPCF128.first);
+ }
+
+ if (!IsFast || TLI.getLibcallImpl(LC) == RTLIB::Unsupported) {
+ // Fall back if we don't have a fast implementation.
+ LC = RTLIB::getFPLibCall(VT, Call_F32.second, Call_F64.second,
+ Call_F80.second, Call_F128.second,
+ Call_PPCF128.second);
+ }
+
+ ExpandFPLibCall(Node, LC, Results);
+}
+
SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
RTLIB::Libcall Call_I8,
RTLIB::Libcall Call_I16,
@@ -4514,6 +4552,18 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
return true;
}
+/// Return if we can use the FAST_* variant of a math libcall for the node.
+/// FIXME: This is just guessing, we probably should have unique specific sets
+/// flags required per libcall.
+static bool canUseFastMathLibcall(const SDNode *Node) {
+ // FIXME: Probably should define fast to respect nan/inf and only be
+ // approximate functions.
+
+ SDNodeFlags Flags = Node->getFlags();
+ return Flags.hasApproximateFuncs() && Flags.hasNoNaNs() &&
+ Flags.hasNoInfs() && Flags.hasNoSignedZeros();
+}
+
void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
LLVM_DEBUG(dbgs() << "Trying to convert node to libcall\n");
SmallVector<SDValue, 8> Results;
@@ -4634,11 +4684,18 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
RTLIB::FMAXIMUM_NUM_PPCF128, Results);
break;
case ISD::FSQRT:
- case ISD::STRICT_FSQRT:
- ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64,
- RTLIB::SQRT_F80, RTLIB::SQRT_F128,
- RTLIB::SQRT_PPCF128, Results);
+ case ISD::STRICT_FSQRT: {
+ // FIXME: Probably should define fast to respect nan/inf and only be
+ // approximate functions.
+ ExpandFastFPLibCall(Node, canUseFastMathLibcall(Node),
+ {RTLIB::FAST_SQRT_F32, RTLIB::SQRT_F32},
+ {RTLIB::FAST_SQRT_F64, RTLIB::SQRT_F64},
+ {RTLIB::FAST_SQRT_F80, RTLIB::SQRT_F80},
+ {RTLIB::FAST_SQRT_F128, RTLIB::SQRT_F128},
+ {RTLIB::FAST_SQRT_PPCF128, RTLIB::SQRT_PPCF128},
+ Results);
break;
+ }
case ISD::FCBRT:
ExpandFPLibCall(Node, RTLIB::CBRT_F32, RTLIB::CBRT_F64,
RTLIB::CBRT_F80, RTLIB::CBRT_F128,
@@ -4875,11 +4932,15 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
RTLIB::LLRINT_PPCF128, Results);
break;
case ISD::FDIV:
- case ISD::STRICT_FDIV:
- ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64,
- RTLIB::DIV_F80, RTLIB::DIV_F128,
- RTLIB::DIV_PPCF128, Results);
+ case ISD::STRICT_FDIV: {
+ ExpandFastFPLibCall(Node, canUseFastMathLibcall(Node),
+ {RTLIB::FAST_DIV_F32, RTLIB::DIV_F32},
+ {RTLIB::FAST_DIV_F64, RTLIB::DIV_F64},
+ {RTLIB::FAST_DIV_F80, RTLIB::DIV_F80},
+ {RTLIB::FAST_DIV_F128, RTLIB::DIV_F128},
+ {RTLIB::FAST_DIV_PPCF128, RTLIB::DIV_PPCF128}, Results);
break;
+ }
case ISD::FREM:
case ISD::STRICT_FREM:
ExpandFPLibCall(Node, RTLIB::REM_F32, RTLIB::REM_F64,
@@ -4893,17 +4954,25 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
RTLIB::FMA_PPCF128, Results);
break;
case ISD::FADD:
- case ISD::STRICT_FADD:
- ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64,
- RTLIB::ADD_F80, RTLIB::ADD_F128,
- RTLIB::ADD_PPCF128, Results);
+ case ISD::STRICT_FADD: {
+ ExpandFastFPLibCall(Node, canUseFastMathLibcall(Node),
+ {RTLIB::FAST_ADD_F32, RTLIB::ADD_F32},
+ {RTLIB::FAST_ADD_F64, RTLIB::ADD_F64},
+ {RTLIB::FAST_ADD_F80, RTLIB::ADD_F80},
+ {RTLIB::FAST_ADD_F128, RTLIB::ADD_F128},
+ {RTLIB::FAST_ADD_PPCF128, RTLIB::ADD_PPCF128}, Results);
break;
+ }
case ISD::FMUL:
- case ISD::STRICT_FMUL:
- ExpandFPLibCall(Node, RTLIB::MUL_F32, RTLIB::MUL_F64,
- RTLIB::MUL_F80, RTLIB::MUL_F128,
- RTLIB::MUL_PPCF128, Results);
+ case ISD::STRICT_FMUL: {
+ ExpandFastFPLibCall(Node, canUseFastMathLibcall(Node),
+ {RTLIB::FAST_MUL_F32, RTLIB::MUL_F32},
+ {RTLIB::FAST_MUL_F64, RTLIB::MUL_F64},
+ {RTLIB::FAST_MUL_F80, RTLIB::MUL_F80},
+ {RTLIB::FAST_MUL_F128, RTLIB::MUL_F128},
+ {RTLIB::FAST_MUL_PPCF128, RTLIB::MUL_PPCF128}, Results);
break;
+ }
case ISD::FP16_TO_FP:
if (Node->getValueType(0) == MVT::f32) {
Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false).first);
@@ -5076,11 +5145,15 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
break;
}
case ISD::FSUB:
- case ISD::STRICT_FSUB:
- ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64,
- RTLIB::SUB_F80, RTLIB::SUB_F128,
- RTLIB::SUB_PPCF128, Results);
+ case ISD::STRICT_FSUB: {
+ ExpandFastFPLibCall(Node, canUseFastMathLibcall(Node),
+ {RTLIB::FAST_SUB_F32, RTLIB::SUB_F32},
+ {RTLIB::FAST_SUB_F64, RTLIB::SUB_F64},
+ {RTLIB::FAST_SUB_F80, RTLIB::SUB_F80},
+ {RTLIB::FAST_SUB_F128, RTLIB::SUB_F128},
+ {RTLIB::FAST_SUB_PPCF128, RTLIB::SUB_PPCF128}, Results);
break;
+ }
case ISD::SREM:
Results.push_back(ExpandIntLibCall(Node, true,
RTLIB::SREM_I8,
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 64c9415c54d4d..c4fd40f313077 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -18,10 +18,6 @@ using namespace RTLIB;
#undef GET_INIT_RUNTIME_LIBCALL_NAMES
#undef GET_SET_TARGET_RUNTIME_LIBCALL_SETS
-static cl::opt<bool>
- HexagonEnableFastMathRuntimeCalls("hexagon-fast-math", cl::Hidden,
- cl::desc("Enable Fast Math processing"));
-
static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT,
FloatABI::ABIType FloatABIType,
EABI EABIVersion) {
@@ -268,32 +264,25 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
setLibcallImpl(RTLIB::UREM_I32, RTLIB::__hexagon_umodsi3);
setLibcallImpl(RTLIB::UREM_I64, RTLIB::__hexagon_umoddi3);
- const bool FastMath = HexagonEnableFastMathRuntimeCalls;
- // This is the only fast library function for sqrtd.
- if (FastMath)
- setLibcallImpl(RTLIB::SQRT_F64, RTLIB::__hexagon_fast2_sqrtdf2);
-
// Prefix is: nothing for "slow-math",
// "fast2_" for V5+ fast-math double-precision
// (actually, keep fast-math and fast-math2 separate for now)
- if (FastMath) {
- setLibcallImpl(RTLIB::ADD_F64, RTLIB::__hexagon_fast_adddf3);
- setLibcallImpl(RTLIB::SUB_F64, RTLIB::__hexagon_fast_subdf3);
- setLibcallImpl(RTLIB::MUL_F64, RTLIB::__hexagon_fast_muldf3);
- setLibcallImpl(RTLIB::DIV_F64, RTLIB::__hexagon_fast_divdf3);
- setLibcallImpl(RTLIB::DIV_F32, RTLIB::__hexagon_fast_divsf3);
- } else {
- setLibcallImpl(RTLIB::ADD_F64, RTLIB::__hexagon_adddf3);
- setLibcallImpl(RTLIB::SUB_F64, RTLIB::__hexagon_subdf3);
- setLibcallImpl(RTLIB::MUL_F64, RTLIB::__hexagon_muldf3);
- setLibcallImpl(RTLIB::DIV_F64, RTLIB::__hexagon_divdf3);
- setLibcallImpl(RTLIB::DIV_F32, RTLIB::__hexagon_divsf3);
- }
- if (FastMath)
- setLibcallImpl(RTLIB::SQRT_F32, RTLIB::__hexagon_fast2_sqrtf);
- else
- setLibcallImpl(RTLIB::SQRT_F32, RTLIB::__hexagon_sqrtf);
+ setLibcallImpl(RTLIB::FAST_ADD_F64, RTLIB::__hexagon_fast_adddf3);
+ setLibcallImpl(RTLIB::FAST_SUB_F64, RTLIB::__hexagon_fast_subdf3);
+ setLibcallImpl(RTLIB::FAST_MUL_F64, RTLIB::__hexagon_fast_muldf3);
+ setLibcallImpl(RTLIB::FAST_DIV_F64, RTLIB::__hexagon_fast_divdf3);
+ setLibcallImpl(RTLIB::FAST_DIV_F32, RTLIB::__hexagon_fast_divsf3);
+ setLibcallImpl(RTLIB::FAST_SQRT_F32, RTLIB::__hexagon_fast2_sqrtf);
+ // This is the only fast library function for sqrtd.
+ setLibcallImpl(RTLIB::FAST_SQRT_F64, RTLIB::__hexagon_fast2_sqrtdf2);
+
+ setLibcallImpl(RTLIB::ADD_F64, RTLIB::__hexagon_adddf3);
+ setLibcallImpl(RTLIB::SUB_F64, RTLIB::__hexagon_subdf3);
+ setLibcallImpl(RTLIB::MUL_F64, RTLIB::__hexagon_muldf3);
+ setLibcallImpl(RTLIB::DIV_F64, RTLIB::__hexagon_divdf3);
+ setLibcallImpl(RTLIB::DIV_F32, RTLIB::__hexagon_divsf3);
+ setLibcallImpl(RTLIB::SQRT_F32, RTLIB::__hexagon_sqrtf);
setLibcallImpl(
RTLIB::HEXAGON_MEMCPY_LIKELY_ALIGNED_MIN32BYTES_MULT8BYTES,
diff --git a/llvm/test/CodeGen/Hexagon/fast-math-libcalls.ll b/llvm/test/CodeGen/Hexagon/fast-math-libcalls.ll
new file mode 100644
index 0000000000000..6bc60132d3e6a
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/fast-math-libcalls.ll
@@ -0,0 +1,369 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+
+;---------------------------------------------------------------------
+; fast sqrt
+;---------------------------------------------------------------------
+
+define float @fast_sqrt_f32(float %x) {
+; CHECK-LABEL: fast_sqrt_f32:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: .cfi_def_cfa r30, 8
+; CHECK-NEXT: .cfi_offset r31, -4
+; CHECK-NEXT: .cfi_offset r30, -8
+; CHECK-NEXT: {
+; CHECK-NEXT: call __hexagon_fast2_sqrtf
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+ %result = call nnan ninf nsz afn float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define double @fast_sqrt_f64(double %x) {
+; CHECK-LABEL: fast_sqrt_f64:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: .cfi_def_cfa r30, 8
+; CHECK-NEXT: .cfi_offset r31, -4
+; CHECK-NEXT: .cfi_offset r30, -8
+; CHECK-NEXT: {
+; CHECK-NEXT: call __hexagon_fast2_sqrtdf2
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+ %result = call nnan ninf nsz afn double @llvm.sqrt.f64(double %x)
+ ret double %result
+}
+
+;---------------------------------------------------------------------
+; fast fadd
+;---------------------------------------------------------------------
+
+define float @fast_add_f32(float %x, float %y) {
+; CHECK-LABEL: fast_add_f32:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = sfadd(r0,r1)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %result = fadd nnan ninf nsz afn float %x, %y
+ ret float %result
+}
+
+define double @fast_add_f64(double %x, double %y) {
+; CHECK-LABEL: fast_add_f64:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: .cfi_def_cfa r30, 8
+; CHECK-NEXT: .cfi_offset r31, -4
+; CHECK-NEXT: .cfi_offset r30, -8
+; CHECK-NEXT: {
+; CHECK-NEXT: call __hexagon_fast_adddf3
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+ %result = fadd nnan ninf nsz afn double %x, %y
+ ret double %result
+}
+
+;---------------------------------------------------------------------
+; fast fsub
+;---------------------------------------------------------------------
+
+define float @fast_sub_f32(float %x, float %y) {
+; CHECK-LABEL: fast_sub_f32:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = sfsub(r0,r1)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %result = fsub nnan ninf nsz afn float %x, %y
+ ret float %result
+}
+
+define double @fast_sub_f64(double %x, double %y) {
+; CHECK-LABEL: fast_sub_f64:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: .cfi_def_cfa r30, 8
+; CHECK-NEXT: .cfi_offset r31, -4
+; CHECK-NEXT: .cfi_offset r30, -8
+; CHECK-NEXT: {
+; CHECK-NEXT: call __hexagon_fast_subdf3
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+ %result = fsub nnan ninf nsz afn double %x, %y
+ ret double %result
+}
+
+;---------------------------------------------------------------------
+; fast fmul
+;---------------------------------------------------------------------
+
+define float @fast_mul_f32(float %x, float %y) {
+; CHECK-LABEL: fast_mul_f32:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = sfmpy(r0,r1)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %result = fmul nnan ninf nsz afn float %x, %y
+ ret float %result
+}
+
+define double @fast_mul_f64(double %x, double %y) {
+; CHECK-LABEL: fast_mul_f64:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: .cfi_def_cfa r30, 8
+; CHECK-NEXT: .cfi_offset r31, -4
+; CHECK-NEXT: .cfi_offset r30, -8
+; CHECK-NEXT: {
+; CHECK-NEXT: call __hexagon_fast_muldf3
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+ %result = fmul nnan ninf nsz afn double %x, %y
+ ret double %result
+}
+
+;---------------------------------------------------------------------
+; fast fdiv
+;---------------------------------------------------------------------
+
+define float @fast_div_f32(float %x, float %y) {
+; CHECK-LABEL: fast_div_f32:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = sffixupn(r0,r1)
+; CHECK-NEXT: r4,p0 = sfrecipa(r0,r1)
+; CHECK-NEXT: r5 = ##1065353216
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r1 = sffixupd(r0,r1)
+; CHECK-NEXT: r6 = ##1065353216
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5 -= sfmpy(r1,r4):lib
+; CHECK-NEXT: r0 = and(r2,##-2147483648)
+; CHECK-NEXT: r3 = r2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 += sfmpy(r5,r4):lib
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 += sfmpy(r2,r4):lib
+; CHECK-NEXT: r6 -= sfmpy(r1,r4):lib
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 -= sfmpy(r1,r0):lib
+; CHECK-NEXT: r4 += sfmpy(r6,r4):lib
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 += sfmpy(r3,r4):lib
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 -= sfmpy(r0,r1):lib
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 += sfmpy(r2,r4,p0):scale
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %result = fdiv nnan ninf nsz afn float %x, %y
+ ret float %result
+}
+
+define double @fast_div_f64(double %x, double %y) {
+; CHECK-LABEL: fast_div_f64:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: .cfi_def_cfa r30, 8
+; CHECK-NEXT: .cfi_offset r31, -4
+; CHECK-NEXT: .cfi_offset r30, -8
+; CHECK-NEXT: {
+; CHECK-NEXT: call __hexagon_fast_divdf3
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+ %result = fdiv nnan ninf nsz afn double %x, %y
+ ret double %result
+}
+
+;---------------------------------------------------------------------
+; Negative tests sqrt
+;---------------------------------------------------------------------
+
+; TODO: What flags do we really need here?
+define float @sqrt_f32__afn(float %x) {
+; CHECK-LABEL: sqrt_f32__afn:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: .cfi_def_cfa r30, 8
+; CHECK-NEXT: .cfi_offset r31, -4
+; CHECK-NEXT: .cfi_offset r30, -8
+; CHECK-NEXT: {
+; CHECK-NEXT: call __hexagon_sqrtf
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+ %result = call afn float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @sqrt_f32__afn_ninf(float %x) {
+; CHECK-LABEL: sqrt_f32__afn_ninf:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: .cfi_def_cfa r30, 8
+; CHECK-NEXT: .cfi_offset r31, -4
+; CHECK-NEXT: .cfi_offset r30, -8
+; CHECK-NEXT: {
+; CHECK-NEXT: call __hexagon_sqrtf
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+ %result = call afn ninf float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @sqrt_f32__afn_nnan(float %x) {
+; CHECK-LABEL: sqrt_f32__afn_nnan:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: .cfi_def_cfa r30, 8
+; CHECK-NEXT: .cfi_offset r31, -4
+; CHECK-NEXT: .cfi_offset r30, -8
+; CHECK-NEXT: {
+; CHECK-NEXT: call __hexagon_sqrtf
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+ %result = call afn nnan float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @sqrt_f32__nnan(float %x) {
+; CHECK-LABEL: sqrt_f32__nnan:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: .cfi_def_cfa r30, 8
+; CHECK-NEXT: .cfi_offset r31, -4
+; CHECK-NEXT: .cfi_offset r30, -8
+; CHECK-NEXT: {
+; CHECK-NEXT: call __hexagon_sqrtf
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+ %result = call nnan float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @sqrt_f32_nnan_ninf_afn(float %x) {
+; CHECK-LABEL: sqrt_f32_nnan_ninf_afn:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: .cfi_def_cfa r30, 8
+; CHECK-NEXT: .cfi_offset r31, -4
+; CHECK-NEXT: .cfi_offset r30, -8
+; CHECK-NEXT: {
+; CHECK-NEXT: call __hexagon_sqrtf
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+ %result = call nnan ninf afn float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+;---------------------------------------------------------------------
+; Negative tests fadd
+;---------------------------------------------------------------------
+
+; TODO: What flags do we really need here?
+define float @fadd_f32_afn(float %x, float %y) {
+; CHECK-LABEL: fadd_f32_afn:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = sfadd(r0,r1)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %result = fadd afn float %x, %y
+ ret float %result
+}
+
+define float @fadd_f32__afn_ninf(float %x, float %y) {
+; CHECK-LABEL: fadd_f32__afn_ninf:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = sfadd(r0,r1)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %result = fadd afn ninf float %x, %y
+ ret float %result
+}
+
+define float @fadd_f32__afn_nnan(float %x, float %y) {
+; CHECK-LABEL: fadd_f32__afn_nnan:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = sfadd(r0,r1)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %result = fadd afn nnan float %x, %y
+ ret float %result
+}
+
+define float @fadd_f32__nnan(float %x, float %y) {
+; CHECK-LABEL: fadd_f32__nnan:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = sfadd(r0,r1)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %result = fadd nnan float %x, %y
+ ret float %result
+}
+
+define float @fadd_f32__nnan_ninf_afn(float %x, float %y) {
+; CHECK-LABEL: fadd_f32__nnan_ninf_afn:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = sfadd(r0,r1)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: }
+ %result = fadd nnan ninf afn float %x, %y
+ ret float %result
+}