[libc-commits] [libc] c5f8a0a - [libc] Add support for x86-64 targets that do not have FMA instructions.

Fri Apr 8 11:12:40 PDT 2022

Author: Tue Ly
Date: 2022-04-08T14:12:24-04:00
New Revision: c5f8a0a1e929ea41bd3e0d781c6b394a3f619427

URL: https://github.com/llvm/llvm-project/commit/c5f8a0a1e929ea41bd3e0d781c6b394a3f619427
DIFF: https://github.com/llvm/llvm-project/commit/c5f8a0a1e929ea41bd3e0d781c6b394a3f619427.diff

LOG: [libc] Add support for x86-64 targets that do not have FMA instructions.

Make FMA flag checks more accurate for x86-64 targets, and refactor
polyeval to use multiply and add instead when FMA instructions are not
available.

Reviewed By: michaelrj, sivachandra

Differential Revision: https://reviews.llvm.org/D123335

Added: 
    libc/src/__support/FPUtil/multiply_add.h

Modified: 
    libc/src/__support/FPUtil/CMakeLists.txt
    libc/src/__support/FPUtil/FMA.h
    libc/src/__support/FPUtil/PolyEval.h
    libc/src/__support/FPUtil/aarch64/FMA.h
    libc/src/__support/FPUtil/generic/CMakeLists.txt
    libc/src/__support/FPUtil/generic/FMA.h
    libc/src/__support/FPUtil/x86_64/FMA.h
    libc/src/__support/architectures.h
    libc/src/math/CMakeLists.txt
    libc/src/math/generic/CMakeLists.txt
    libc/src/math/generic/expm1f.cpp
    libc/src/math/generic/log10f.cpp
    libc/src/math/generic/log1pf.cpp
    libc/src/math/generic/logf.cpp
    utils/bazel/llvm-project-overlay/libc/BUILD.bazel

Removed: 
    


################################################################################
diff  --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
index f1cd0b587d963..7f1cecc25f038 100644

--- a/libc/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -12,7 +12,6 @@ add_header_library(
     NearestIntegerOperations.h
     NormalFloat.h
     PlatformDefs.h
-    PolyEval.h
     UInt.h
     XFloat.h
   DEPENDS
@@ -34,4 +33,29 @@ add_header_library(
     libc.src.__support.FPUtil.generic.sqrt
 )
 
+add_header_library(
+  fma
+  HDRS
+    FMA.h
+  DEPENDS
+    .fputil
+    libc.src.__support.FPUtil.generic.fma
+)
+
+add_header_library(
+  multiply_add
+  HDRS
+    multiply_add.h
+  DEPENDS
+    .fma
+)
+
+add_header_library(
+  polyeval
+  HDRS
+    PolyEval.h
+  DEPENDS
+    .multiply_add
+)
+
 add_subdirectory(generic)

diff  --git a/libc/src/__support/FPUtil/FMA.h b/libc/src/__support/FPUtil/FMA.h
index c735c069fd4da..6823dd0f897b5 100644
--- a/libc/src/__support/FPUtil/FMA.h
+++ b/libc/src/__support/FPUtil/FMA.h
@@ -11,11 +11,16 @@
 
 #include "src/__support/architectures.h"
 
+#if defined(LIBC_TARGET_HAS_FMA)
+
 #if defined(LLVM_LIBC_ARCH_X86_64)
 #include "x86_64/FMA.h"
 #elif defined(LLVM_LIBC_ARCH_AARCH64)
 #include "aarch64/FMA.h"
+#endif
+
 #else
+// FMA instructions are not available
 #include "generic/FMA.h"
 #include "src/__support/CPP/TypeTraits.h"
 

diff  --git a/libc/src/__support/FPUtil/PolyEval.h b/libc/src/__support/FPUtil/PolyEval.h
index 368ee3848ddc3..c9e818accd1b1 100644
--- a/libc/src/__support/FPUtil/PolyEval.h
+++ b/libc/src/__support/FPUtil/PolyEval.h
@@ -9,19 +9,15 @@
 #ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H
 #define LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H
 
-#include "src/__support/CPP/TypeTraits.h"
-#include "src/__support/architectures.h"
+#include "multiply_add.h"
 
 // Evaluate polynomial using Horner's Scheme:
 // With polyeval(x, a_0, a_1, ..., a_n) = a_n * x^n + ... + a_1 * x + a_0, we
 // evaluated it as:  a_0 + x * (a_1 + x * ( ... (a_(n-1) + x * a_n) ... ) ) ).
-// We will use fma instructions if available.
+// We will use FMA instructions if available.
 // Example: to evaluate x^3 + 2*x^2 + 3*x + 4, call
 //   polyeval( x, 4.0, 3.0, 2.0, 1.0 )
 
-#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64)
-#include "FMA.h"
-
 namespace __llvm_libc {
 namespace fputil {
 
@@ -29,35 +25,10 @@ template <typename T> static inline T polyeval(T x, T a0) { return a0; }
 
 template <typename T, typename... Ts>
 INLINE_FMA static inline T polyeval(T x, T a0, Ts... a) {
-  return fma(x, polyeval(x, a...), a0);
+  return multiply_add(x, polyeval(x, a...), a0);
 }
 
 } // namespace fputil
 } // namespace __llvm_libc
 
-#ifdef LLVM_LIBC_ARCH_X86_64
-
-// [DISABLED] There is a regression with using vectorized version for polyeval
-// compared to the naive Horner's scheme with fma.  Need further investigation
-// #include "x86_64/PolyEval.h"
-
-#endif // LLVM_LIBC_ARCH_X86_64
-
-#else
-
-namespace __llvm_libc {
-namespace fputil {
-
-template <typename T> static inline T polyeval(T x, T a0) { return a0; }
-
-template <typename T, typename... Ts>
-static inline T polyeval(T x, T a0, Ts... a) {
-  return x * polyeval(x, a...) + a0;
-}
-
-} // namespace fputil
-} // namespace __llvm_libc
-
-#endif
-
-#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_FMA_H
+#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H

diff  --git a/libc/src/__support/FPUtil/aarch64/FMA.h b/libc/src/__support/FPUtil/aarch64/FMA.h
index c236c9a90b741..ed637c848658e 100644
--- a/libc/src/__support/FPUtil/aarch64/FMA.h
+++ b/libc/src/__support/FPUtil/aarch64/FMA.h
@@ -15,6 +15,10 @@
 #error "Invalid include"
 #endif
 
+#if !defined(LIBC_TARGET_HAS_FMA)
+#error "FMA instructions are not supported"
+#endif
+
 #include "src/__support/CPP/TypeTraits.h"
 
 namespace __llvm_libc {

diff  --git a/libc/src/__support/FPUtil/generic/CMakeLists.txt b/libc/src/__support/FPUtil/generic/CMakeLists.txt
index bf69e7dd961cd..a755e7670ce68 100644
--- a/libc/src/__support/FPUtil/generic/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt
@@ -4,3 +4,9 @@ add_header_library(
     sqrt.h
     sqrt_80_bit_long_double.h
 )
+
+add_header_library(
+  fma
+  HDRS
+    FMA.h
+)

diff  --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h
index efdd8b7129753..78b640c2c1a13 100644
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMA_H
 
 #include "src/__support/CPP/TypeTraits.h"
+#include "src/__support/FPUtil/FPBits.h"
 
 namespace __llvm_libc {
 namespace fputil {

diff  --git a/libc/src/__support/FPUtil/multiply_add.h b/libc/src/__support/FPUtil/multiply_add.h
new file mode 100644
index 0000000000000..8f5da22a53cb1
--- /dev/null
+++ b/libc/src/__support/FPUtil/multiply_add.h
@@ -0,0 +1,41 @@
+//===-- Common header for multiply-add implementations ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H
+#define LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H
+
+#include "src/__support/architectures.h"
+
+#if defined(LIBC_TARGET_HAS_FMA)
+// Keep this include at file scope: the FMA headers open their own namespaces.
+#include "FMA.h"
+#endif // LIBC_TARGET_HAS_FMA
+
+namespace __llvm_libc {
+namespace fputil {
+
+// multiply_add(x, y, z) returns x * y + z. The generic template evaluates the
+// expression as written; float and double call fma when FMA is available.
+template <typename T> static inline T multiply_add(T x, T y, T z) {
+  return x * y + z;
+}
+
+#if defined(LIBC_TARGET_HAS_FMA)
+template <> inline float multiply_add<float>(float x, float y, float z) {
+  return fma(x, y, z);
+}
+
+template <> inline double multiply_add<double>(double x, double y, double z) {
+  return fma(x, y, z);
+}
+#endif // LIBC_TARGET_HAS_FMA
+
+} // namespace fputil
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H

diff  --git a/libc/src/__support/FPUtil/x86_64/FMA.h b/libc/src/__support/FPUtil/x86_64/FMA.h
index 70ebe382e841a..08de6da344107 100644
--- a/libc/src/__support/FPUtil/x86_64/FMA.h
+++ b/libc/src/__support/FPUtil/x86_64/FMA.h
@@ -15,6 +15,10 @@
 #error "Invalid include"
 #endif
 
+#if !defined(LIBC_TARGET_HAS_FMA)
+#error "FMA instructions are not supported"
+#endif
+
 #include "src/__support/CPP/TypeTraits.h"
 #include <immintrin.h>
 

diff  --git a/libc/src/__support/architectures.h b/libc/src/__support/architectures.h
index 14eb1a586463f..70eeb99107b2b 100644
--- a/libc/src/__support/architectures.h
+++ b/libc/src/__support/architectures.h
@@ -37,7 +37,15 @@
 #define LLVM_LIBC_ARCH_ANY_ARM
 #endif
 
-#if defined(LLVM_LIBC_ARCH_X86_64)
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+#define LIBC_TARGET_HAS_FMA
+#elif defined(LLVM_LIBC_ARCH_X86_64)
+#if (defined(__AVX2__) || defined(__FMA__))
+#define LIBC_TARGET_HAS_FMA
+#endif
+#endif
+
+#if (defined(LLVM_LIBC_ARCH_X86_64) && defined(LIBC_TARGET_HAS_FMA))
 #define INLINE_FMA __attribute__((target("fma")))
 #else
 #define INLINE_FMA

diff  --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 9737f4a362dc0..8fc550d1f9682 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -48,8 +48,9 @@ add_entrypoint_object(
     fmaf.h
   DEPENDS
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.fma
   COMPILE_OPTIONS
-    -O2
+    -O3
     -mfma
 )
 
@@ -61,8 +62,9 @@ add_entrypoint_object(
     fma.h
   DEPENDS
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.fma
   COMPILE_OPTIONS
-    -O2
+    -O3
     -mfma
 )
 

diff  --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 73957cf789104..6a96b55108045 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -478,6 +478,7 @@ add_entrypoint_object(
   DEPENDS
     .common_constants
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.polyeval
     libc.include.math
   COMPILE_OPTIONS
     -O3
@@ -492,6 +493,7 @@ add_entrypoint_object(
     ../exp2f.h
   DEPENDS
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.polyeval
     libc.include.math
   COMPILE_OPTIONS
     -O3
@@ -507,6 +509,8 @@ add_entrypoint_object(
   DEPENDS
     .common_constants
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
     libc.include.math
   COMPILE_OPTIONS
     -O3
@@ -674,6 +678,8 @@ add_entrypoint_object(
   DEPENDS
     .common_constants
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
   COMPILE_OPTIONS
     -O3
     -mfma
@@ -688,6 +694,8 @@ add_entrypoint_object(
   DEPENDS
     .common_constants
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
   COMPILE_OPTIONS
     -O3
     -mfma
@@ -702,6 +710,7 @@ add_entrypoint_object(
   DEPENDS
     .common_constants
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.polyeval
     COMPILE_OPTIONS
     -O3
     -mfma
@@ -716,6 +725,8 @@ add_entrypoint_object(
   DEPENDS
     .common_constants
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
   COMPILE_OPTIONS
     -O3
     -mfma

diff  --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp
index b0544b76e09ac..76232d6ab6a95 100644
--- a/libc/src/math/generic/expm1f.cpp
+++ b/libc/src/math/generic/expm1f.cpp
@@ -83,7 +83,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
       //   = x otherwise.
       // To simplify the rounding decision and make it more efficient, we use
       //   fma(x, x, x) ~ x + x^2 instead.
-      return fputil::fma(x, x, x);
+      return fputil::multiply_add(x, x, x);
     }
 
     // 2^-25 <= |x| < 2^-4
@@ -96,7 +96,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
         fputil::polyeval(xd, 0x1p-1, 0x1.55555555557ddp-3, 0x1.55555555552fap-5,
                          0x1.111110fcd58b7p-7, 0x1.6c16c1717660bp-10,
                          0x1.a0241f0006d62p-13, 0x1.a01e3f8d3c06p-16);
-    return static_cast<float>(fputil::fma(r, xsq, xd));
+    return static_cast<float>(fputil::multiply_add(r, xsq, xd));
   }
 
   // For -18 < x < 89, to compute expm1(x), we perform the following range
@@ -132,7 +132,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
   double exp_lo =
       fputil::polyeval(xd, 0x1.0p0, 0x1.ffffffffff777p-1, 0x1.000000000071cp-1,
                        0x1.555566668e5e7p-3, 0x1.55555555ef243p-5);
-  return static_cast<float>(fputil::fma(exp_hi_mid, exp_lo, -1.0));
+  return static_cast<float>(fputil::multiply_add(exp_hi_mid, exp_lo, -1.0));
 }
 
 } // namespace __llvm_libc

diff  --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp
index 59ca6590b640a..878ae68f85eb7 100644
--- a/libc/src/math/generic/log10f.cpp
+++ b/libc/src/math/generic/log10f.cpp
@@ -170,7 +170,7 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) {
   double d = static_cast<float>(xbits) - static_cast<float>(f);
   d *= ONE_OVER_F[f_index];
 
-  double extra_factor = fputil::fma(m, LOG10_2, LOG10_F[f_index]);
+  double extra_factor = fputil::multiply_add(m, LOG10_2, LOG10_F[f_index]);
 
   double r = fputil::polyeval(d, extra_factor, 0x1.bcb7b1526e4c5p-2,
                               -0x1.bcb7b1518a5e9p-3, 0x1.287a72a6f716p-3,

diff  --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp
index 7d1e71e919902..6e8c6781e6ef3 100644
--- a/libc/src/math/generic/log1pf.cpp
+++ b/libc/src/math/generic/log1pf.cpp
@@ -66,7 +66,7 @@ INLINE_FMA static inline float log(double x) {
   double d = static_cast<double>(xbits) - static_cast<double>(f);
   d *= ONE_OVER_F[f_index];
 
-  double extra_factor = fputil::fma(m, LOG_2, LOG_F[f_index]);
+  double extra_factor = fputil::multiply_add(m, LOG_2, LOG_F[f_index]);
 
   double r = fputil::polyeval(d, extra_factor, 0x1.fffffffffffacp-1,
                               -0x1.fffffffef9cb2p-2, 0x1.5555513bc679ap-2,
@@ -161,7 +161,7 @@ LLVM_LIBC_FUNCTION(float, log1pf, (float x)) {
   // > fpminimax(log(1 + x)/x, 5, [|D...|], [-2^-8; 2^-8]);
   r = fputil::polyeval(xd, -0x1p-1, 0x1.5555555515551p-2, -0x1.ffffffff82bdap-3,
                        0x1.999b33348d3aep-3, -0x1.5556cae3adcc3p-3);
-  return static_cast<float>(fputil::fma(r, xd * xd, xd));
+  return static_cast<float>(fputil::multiply_add(r, xd * xd, xd));
 }
 
 } // namespace __llvm_libc

diff  --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp
index 3e712378b64c3..747f8c73c27c5 100644
--- a/libc/src/math/generic/logf.cpp
+++ b/libc/src/math/generic/logf.cpp
@@ -120,7 +120,7 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) {
   d *= ONE_OVER_F[f_index];
 
   double extra_factor =
-      fputil::fma(static_cast<double>(m), LOG_2, LOG_F[f_index]);
+      fputil::multiply_add(static_cast<double>(m), LOG_2, LOG_F[f_index]);
 
   double r = __llvm_libc::fputil::polyeval(
       d, extra_factor, 0x1.fffffffffffacp-1, -0x1.fffffffef9cb2p-2,

diff  --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index ad8bcd0f476c1..93c45b93ad476 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -196,6 +196,54 @@ cc_library(
     ],
 )
 
+fma_common_hdrs = [
+    "src/__support/FPUtil/FMA.h",
+    "src/__support/FPUtil/generic/FMA.h",
+]
+
+fma_hdrs = selects.with_or({
+    "//conditions:default": fma_common_hdrs,
+    PLATFORM_CPU_X86_64: fma_common_hdrs + [
+        "src/__support/FPUtil/x86_64/FMA.h",
+    ],
+    PLATFORM_CPU_ARM64: fma_common_hdrs + [
+        "src/__support/FPUtil/aarch64/FMA.h",
+    ],
+})
+
+cc_library(
+    name = "__support_fputil_fma",
+    hdrs = fma_hdrs,
+    deps = [
+        ":__support_common",
+        ":__support_cpp_bit",
+        ":__support_cpp_type_traits",
+        ":__support_fputil",
+        ":libc_root",
+    ],
+)
+
+cc_library(
+    name = "__support_fputil_multiply_add",
+    hdrs = [
+        "src/__support/FPUtil/multiply_add.h",
+    ],
+    deps = [
+        ":__support_common",
+        ":__support_fputil_fma",
+    ],
+)
+
+cc_library(
+    name = "__support_fputil_polyeval",
+    hdrs = [
+        "src/__support/FPUtil/PolyEval.h",
+    ],
+    deps = [
+        ":__support_fputil_multiply_add",
+    ],
+)
+
 ################################ fenv targets ################################
 
 libc_function(