[libclc] [libclc] Move fma to the CLC library (PR #126052)

Mon Feb 24 01:32:45 PST 2025

https://github.com/frasercrmck updated https://github.com/llvm/llvm-project/pull/126052

>From 18d67915671a081540abde18b1b6c009ac094a27 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser at codeplay.com>
Date: Tue, 28 Jan 2025 12:19:23 +0000
Subject: [PATCH 1/2] [libclc] Move fma to the CLC library

This builtin is a little more involved than others as targets deal with
fma in various different ways.

Fundamentally, the CLC __clc_fma builtin compiles to
__builtin_elementwise_fma, which compiles to the @llvm.fma intrinsic.
However, in the case of fp32 fma some targets call the __clc_sw_fma
function, which provides a software implementation of the builtin. This
in principle is controlled by the __CLC_HAVE_HW_FMA32 macro and may be a
runtime decision, depending on how the target defines that macro.

All targets build the CLC fma functions for all types. This is to the
CLC library can have a reliable internal implementation for its own
purposes.

For AMD/NVPTX targets there are no meaningful changes to the generated
LLVM bytecode. Some blocks of code have moved around, which confounds
llvm-diff.

For the clspv and SPIR-V/Mesa targets, only fp32 fma is of interest. Its
use in libclc is tightly controlled by checking __CLC_HAVE_HW_FMA32
first. This can either be a compile-time constant (0, for clspv) or a
runtime function for SPIR-V/Mesa.

The SPIR-V/Mesa target only provided fp32 fma in the OpenCL layer. It
unconditionally mapped that to the __clc_sw_fma builtin, even though the
generic version in theory had a runtime toggle through
__CLC_HAVE_HW_FMA32 specifically for that target. Callers of fma,
though, would end up using the ExtInst fma, *not* calling the _Z3fmafff
function provided by libclc.

This commit keeps this system in place in the OpenCL layer, by mapping
fma to __clc_sw_fma. Where other builtins would previously call fma
(i.e., result in the ExtInst), they now call __clc_fma. This function
checks the __CLC_HAVE_HW_FMA32 runtime toggle, which selects between the
slow version or the quick version. The quick version is the LLVM fma
intrinsic which llvm-spirv translates to the ExtInst.

The clspv target had its own software implementation of fp32 fma, which
it called unconditionally - even though __CLC_HAVE_HW_FMA32 is 1 for
that target. This is potentially just so its library ships a software
version which it can fall back on. In the OpenCL layer, the target
doesn't provide fp64 fma, and maps fp16 fma to fp32 mad.

This commit keeps this system roughly in place: in the OpenCL layer it
maps fp32 fma to __clc_sw_fma, and fp16 fma to mad. Where builtins would
previously call into fma, they now call __clc_fma, which compiles to the
LLVM intrinsic. If this goes through a translation to SPIR-V it will
become the fma ExtInst, or the intrinsic could be replaced by the
_Z3fmafff software implementation.

The clspv and SPIR-V/Mesa targets could potentially be cleaned up later,
depending on their needs.
---
 libclc/CMakeLists.txt                         |   2 +
 .../include/clc/internal/math/clc_sw_fma.h}   |  11 +-
 libclc/clc/include/clc/math/clc_fma.h         |  12 +
 libclc/clc/include/clc/math/math.h            |   6 +-
 libclc/clc/lib/clspv/SOURCES                  |   1 +
 libclc/clc/lib/clspv/math/clc_sw_fma.cl       | 287 ++++++++++++++++++
 libclc/clc/lib/generic/SOURCES                |   2 +
 libclc/clc/lib/generic/math/clc_fma.cl        |   6 +
 libclc/clc/lib/generic/math/clc_fma.inc       |   8 +
 .../lib/generic/math/clc_sw_fma.cl}           |  57 ++--
 libclc/clc/lib/spirv/SOURCES                  |   1 +
 .../spirv/math/clc_runtime_has_hw_fma32.cl    |   1 +
 libclc/clspv/lib/math/fma.cl                  | 270 +---------------
 libclc/generic/lib/SOURCES                    |   1 -
 libclc/generic/lib/math/fma.cl                |  19 +-
 libclc/generic/lib/math/fma.inc               |   7 -
 libclc/generic/lib/math/sincos_helpers.cl     |   4 +-
 libclc/spirv/lib/SOURCES                      |   1 -
 libclc/spirv/lib/math/fma.cl                  |  12 +-
 libclc/spirv/lib/math/fma.inc                 |   3 -
 20 files changed, 386 insertions(+), 325 deletions(-)
 rename libclc/{generic/include/math/clc_fma.h => clc/include/clc/internal/math/clc_sw_fma.h} (52%)
 create mode 100644 libclc/clc/include/clc/math/clc_fma.h
 create mode 100644 libclc/clc/lib/clspv/SOURCES
 create mode 100644 libclc/clc/lib/clspv/math/clc_sw_fma.cl
 create mode 100644 libclc/clc/lib/generic/math/clc_fma.cl
 create mode 100644 libclc/clc/lib/generic/math/clc_fma.inc
 rename libclc/{generic/lib/math/clc_fma.cl => clc/lib/generic/math/clc_sw_fma.cl} (79%)
 create mode 100644 libclc/clc/lib/spirv/SOURCES
 create mode 100644 libclc/clc/lib/spirv/math/clc_runtime_has_hw_fma32.cl
 delete mode 100644 libclc/generic/lib/math/fma.inc
 delete mode 100644 libclc/spirv/lib/math/fma.inc

diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 5a9a26c44f368..2a2ebec66f6a9 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -28,6 +28,8 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
   spirv/lib/SOURCES;
   # CLC internal libraries
   clc/lib/generic/SOURCES;
+  clc/lib/clspv/SOURCES;
+  clc/lib/spirv/SOURCES;
 )
 
 set( LIBCLC_MIN_LLVM 3.9.0 )
diff --git a/libclc/generic/include/math/clc_fma.h b/libclc/clc/include/clc/internal/math/clc_sw_fma.h
similarity index 52%
rename from libclc/generic/include/math/clc_fma.h
rename to libclc/clc/include/clc/internal/math/clc_sw_fma.h
index 598df66cf72e9..81994d3b02547 100644
--- a/libclc/generic/include/math/clc_fma.h
+++ b/libclc/clc/include/clc/internal/math/clc_sw_fma.h
@@ -1,11 +1,12 @@
-#define __CLC_FUNCTION __clc_fma
-#define __CLC_INTRINSIC "llvm.fma"
-#include "math/ternary_intrin.inc"
+#ifndef __CLC_INTERNAL_MATH_CLC_SW_FMA_H__
+#define __CLC_INTERNAL_MATH_CLC_SW_FMA_H__
 
-#define __FLOAT_ONLY
 #define __CLC_FUNCTION __clc_sw_fma
 #define __CLC_BODY <clc/shared/ternary_decl.inc>
+
 #include <clc/math/gentype.inc>
+
 #undef __CLC_BODY
 #undef __CLC_FUNCTION
-#undef __FLOAT_ONLY
+
+#endif // __CLC_INTERNAL_MATH_CLC_SW_FMA_H__
diff --git a/libclc/clc/include/clc/math/clc_fma.h b/libclc/clc/include/clc/math/clc_fma.h
new file mode 100644
index 0000000000000..bff777253fefe
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_fma.h
@@ -0,0 +1,12 @@
+#ifndef __CLC_MATH_CLC_FMA_H__
+#define __CLC_MATH_CLC_FMA_H__
+
+#define __CLC_FUNCTION __clc_fma
+#define __CLC_BODY <clc/shared/ternary_decl.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_FMA_H__
diff --git a/libclc/clc/include/clc/math/math.h b/libclc/clc/include/clc/math/math.h
index ed37af237bf82..be33d5dd8af0b 100644
--- a/libclc/clc/include/clc/math/math.h
+++ b/libclc/clc/include/clc/math/math.h
@@ -39,12 +39,12 @@
 #define PINF 0x200
 
 #if (defined __AMDGCN__ || defined __R600__) && !defined __HAS_FMAF__
-#define HAVE_HW_FMA32() (0)
+#define __CLC_HAVE_HW_FMA32() (0)
 #elif defined(CLC_SPIRV)
 bool __attribute__((noinline)) __clc_runtime_has_hw_fma32(void);
-#define HAVE_HW_FMA32() __clc_runtime_has_hw_fma32()
+#define __CLC_HAVE_HW_FMA32() __clc_runtime_has_hw_fma32()
 #else
-#define HAVE_HW_FMA32() (1)
+#define __CLC_HAVE_HW_FMA32() (1)
 #endif
 
 #define HAVE_BITALIGN() (0)
diff --git a/libclc/clc/lib/clspv/SOURCES b/libclc/clc/lib/clspv/SOURCES
new file mode 100644
index 0000000000000..b1401f8307a4c
--- /dev/null
+++ b/libclc/clc/lib/clspv/SOURCES
@@ -0,0 +1 @@
+math/clc_sw_fma.cl
diff --git a/libclc/clc/lib/clspv/math/clc_sw_fma.cl b/libclc/clc/lib/clspv/math/clc_sw_fma.cl
new file mode 100644
index 0000000000000..1d65bcf14b2b4
--- /dev/null
+++ b/libclc/clc/lib/clspv/math/clc_sw_fma.cl
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// This version is derived from the generic fma software implementation
+// (__clc_sw_fma), but avoids the use of ulong in favor of uint2. The logic has
+// been updated as appropriate.
+
+#include <clc/clc_as_type.h>
+#include <clc/clcmacro.h>
+#include <clc/float/definitions.h>
+#include <clc/integer/clc_abs.h>
+#include <clc/integer/clc_clz.h>
+#include <clc/integer/clc_hadd.h>
+#include <clc/integer/clc_mul_hi.h>
+#include <clc/integer/definitions.h>
+#include <clc/math/clc_mad.h>
+#include <clc/math/math.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
+#include <clc/shared/clc_max.h>
+
+struct fp {
+  uint2 mantissa;
+  int exponent;
+  uint sign;
+};
+
+static uint2 u2_set(uint hi, uint lo) {
+  uint2 res;
+  res.lo = lo;
+  res.hi = hi;
+  return res;
+}
+
+static uint2 u2_set_u(uint val) { return u2_set(0, val); }
+
+static uint2 u2_mul(uint a, uint b) {
+  uint2 res;
+  res.hi = __clc_mul_hi(a, b);
+  res.lo = a * b;
+  return res;
+}
+
+static uint2 u2_sll(uint2 val, uint shift) {
+  if (shift == 0)
+    return val;
+  if (shift < 32) {
+    val.hi <<= shift;
+    val.hi |= val.lo >> (32 - shift);
+    val.lo <<= shift;
+  } else {
+    val.hi = val.lo << (shift - 32);
+    val.lo = 0;
+  }
+  return val;
+}
+
+static uint2 u2_srl(uint2 val, uint shift) {
+  if (shift == 0)
+    return val;
+  if (shift < 32) {
+    val.lo >>= shift;
+    val.lo |= val.hi << (32 - shift);
+    val.hi >>= shift;
+  } else {
+    val.lo = val.hi >> (shift - 32);
+    val.hi = 0;
+  }
+  return val;
+}
+
+static uint2 u2_or(uint2 a, uint b) {
+  a.lo |= b;
+  return a;
+}
+
+static uint2 u2_and(uint2 a, uint2 b) {
+  a.lo &= b.lo;
+  a.hi &= b.hi;
+  return a;
+}
+
+static uint2 u2_add(uint2 a, uint2 b) {
+  uint carry = (__clc_hadd(a.lo, b.lo) >> 31) & 0x1;
+  a.lo += b.lo;
+  a.hi += b.hi + carry;
+  return a;
+}
+
+static uint2 u2_add_u(uint2 a, uint b) { return u2_add(a, u2_set_u(b)); }
+
+static uint2 u2_inv(uint2 a) {
+  a.lo = ~a.lo;
+  a.hi = ~a.hi;
+  return u2_add_u(a, 1);
+}
+
+static uint u2_clz(uint2 a) {
+  uint leading_zeroes = __clc_clz(a.hi);
+  if (leading_zeroes == 32) {
+    leading_zeroes += __clc_clz(a.lo);
+  }
+  return leading_zeroes;
+}
+
+static bool u2_eq(uint2 a, uint2 b) { return a.lo == b.lo && a.hi == b.hi; }
+
+static bool u2_zero(uint2 a) { return u2_eq(a, u2_set_u(0)); }
+
+static bool u2_gt(uint2 a, uint2 b) {
+  return a.hi > b.hi || (a.hi == b.hi && a.lo > b.lo);
+}
+
+_CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
+  /* special cases */
+  if (__clc_isnan(a) || __clc_isnan(b) || __clc_isnan(c) || __clc_isinf(a) ||
+      __clc_isinf(b)) {
+    return __clc_mad(a, b, c);
+  }
+
+  /* If only c is inf, and both a,b are regular numbers, the result is c*/
+  if (__clc_isinf(c)) {
+    return c;
+  }
+
+  a = __clc_flush_denormal_if_not_supported(a);
+  b = __clc_flush_denormal_if_not_supported(b);
+  c = __clc_flush_denormal_if_not_supported(c);
+
+  if (a == 0.0f || b == 0.0f) {
+    return c;
+  }
+
+  if (c == 0) {
+    return a * b;
+  }
+
+  struct fp st_a, st_b, st_c;
+
+  st_a.exponent = a == .0f ? 0 : ((__clc_as_uint(a) & 0x7f800000) >> 23) - 127;
+  st_b.exponent = b == .0f ? 0 : ((__clc_as_uint(b) & 0x7f800000) >> 23) - 127;
+  st_c.exponent = c == .0f ? 0 : ((__clc_as_uint(c) & 0x7f800000) >> 23) - 127;
+
+  st_a.mantissa =
+      u2_set_u(a == .0f ? 0 : (__clc_as_uint(a) & 0x7fffff) | 0x800000);
+  st_b.mantissa =
+      u2_set_u(b == .0f ? 0 : (__clc_as_uint(b) & 0x7fffff) | 0x800000);
+  st_c.mantissa =
+      u2_set_u(c == .0f ? 0 : (__clc_as_uint(c) & 0x7fffff) | 0x800000);
+
+  st_a.sign = __clc_as_uint(a) & 0x80000000;
+  st_b.sign = __clc_as_uint(b) & 0x80000000;
+  st_c.sign = __clc_as_uint(c) & 0x80000000;
+
+  // Multiplication.
+  // Move the product to the highest bits to maximize precision
+  // mantissa is 24 bits => product is 48 bits, 2bits non-fraction.
+  // Add one bit for future addition overflow,
+  // add another bit to detect subtraction underflow
+  struct fp st_mul;
+  st_mul.sign = st_a.sign ^ st_b.sign;
+  st_mul.mantissa = u2_sll(u2_mul(st_a.mantissa.lo, st_b.mantissa.lo), 14);
+  st_mul.exponent =
+      !u2_zero(st_mul.mantissa) ? st_a.exponent + st_b.exponent : 0;
+
+  // FIXME: Detecting a == 0 || b == 0 above crashed GCN isel
+  if (st_mul.exponent == 0 && u2_zero(st_mul.mantissa))
+    return c;
+
+// Mantissa is 23 fractional bits, shift it the same way as product mantissa
+#define C_ADJUST 37ul
+
+  // both exponents are bias adjusted
+  int exp_diff = st_mul.exponent - st_c.exponent;
+
+  st_c.mantissa = u2_sll(st_c.mantissa, C_ADJUST);
+  uint2 cutoff_bits = u2_set_u(0);
+  uint2 cutoff_mask = u2_add(u2_sll(u2_set_u(1), __clc_abs(exp_diff)),
+                             u2_set(0xffffffff, 0xffffffff));
+  if (exp_diff > 0) {
+    cutoff_bits =
+        exp_diff >= 64 ? st_c.mantissa : u2_and(st_c.mantissa, cutoff_mask);
+    st_c.mantissa =
+        exp_diff >= 64 ? u2_set_u(0) : u2_srl(st_c.mantissa, exp_diff);
+  } else {
+    cutoff_bits = -exp_diff >= 64 ? st_mul.mantissa
+                                  : u2_and(st_mul.mantissa, cutoff_mask);
+    st_mul.mantissa =
+        -exp_diff >= 64 ? u2_set_u(0) : u2_srl(st_mul.mantissa, -exp_diff);
+  }
+
+  struct fp st_fma;
+  st_fma.sign = st_mul.sign;
+  st_fma.exponent = __clc_max(st_mul.exponent, st_c.exponent);
+  if (st_c.sign == st_mul.sign) {
+    st_fma.mantissa = u2_add(st_mul.mantissa, st_c.mantissa);
+  } else {
+    // cutoff bits borrow one
+    st_fma.mantissa =
+        u2_add(u2_add(st_mul.mantissa, u2_inv(st_c.mantissa)),
+               (!u2_zero(cutoff_bits) && (st_mul.exponent > st_c.exponent)
+                    ? u2_set(0xffffffff, 0xffffffff)
+                    : u2_set_u(0)));
+  }
+
+  // underflow: st_c.sign != st_mul.sign, and magnitude switches the sign
+  if (u2_gt(st_fma.mantissa, u2_set(0x7fffffff, 0xffffffff))) {
+    st_fma.mantissa = u2_inv(st_fma.mantissa);
+    st_fma.sign = st_mul.sign ^ 0x80000000;
+  }
+
+  // detect overflow/underflow
+  int overflow_bits = 3 - u2_clz(st_fma.mantissa);
+
+  // adjust exponent
+  st_fma.exponent += overflow_bits;
+
+  // handle underflow
+  if (overflow_bits < 0) {
+    st_fma.mantissa = u2_sll(st_fma.mantissa, -overflow_bits);
+    overflow_bits = 0;
+  }
+
+  // rounding
+  uint2 trunc_mask = u2_add(u2_sll(u2_set_u(1), C_ADJUST + overflow_bits),
+                            u2_set(0xffffffff, 0xffffffff));
+  uint2 trunc_bits =
+      u2_or(u2_and(st_fma.mantissa, trunc_mask), !u2_zero(cutoff_bits));
+  uint2 last_bit =
+      u2_and(st_fma.mantissa, u2_sll(u2_set_u(1), C_ADJUST + overflow_bits));
+  uint2 grs_bits = u2_sll(u2_set_u(4), C_ADJUST - 3 + overflow_bits);
+
+  // round to nearest even
+  if (u2_gt(trunc_bits, grs_bits) ||
+      (u2_eq(trunc_bits, grs_bits) && !u2_zero(last_bit))) {
+    st_fma.mantissa =
+        u2_add(st_fma.mantissa, u2_sll(u2_set_u(1), C_ADJUST + overflow_bits));
+  }
+
+  // Shift mantissa back to bit 23
+  st_fma.mantissa = u2_srl(st_fma.mantissa, C_ADJUST + overflow_bits);
+
+  // Detect rounding overflow
+  if (u2_gt(st_fma.mantissa, u2_set_u(0xffffff))) {
+    ++st_fma.exponent;
+    st_fma.mantissa = u2_srl(st_fma.mantissa, 1);
+  }
+
+  if (u2_zero(st_fma.mantissa)) {
+    return 0.0f;
+  }
+
+  // Flating point range limit
+  if (st_fma.exponent > 127) {
+    return __clc_as_float(__clc_as_uint(INFINITY) | st_fma.sign);
+  }
+
+  // Flush denormals
+  if (st_fma.exponent <= -127) {
+    return __clc_as_float(st_fma.sign);
+  }
+
+  return __clc_as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) |
+                        ((uint)st_fma.mantissa.lo & 0x7fffff));
+}
+
+_CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_sw_fma, float,
+                       float, float)
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index f7fdba0a341ed..0ab563df6a274 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -20,12 +20,14 @@ integer/clc_upsample.cl
 math/clc_ceil.cl
 math/clc_copysign.cl
 math/clc_fabs.cl
+math/clc_fma.cl
 math/clc_floor.cl
 math/clc_frexp.cl
 math/clc_mad.cl
 math/clc_modf.cl
 math/clc_nextafter.cl
 math/clc_rint.cl
+math/clc_sw_fma.cl
 math/clc_trunc.cl
 relational/clc_all.cl
 relational/clc_any.cl
diff --git a/libclc/clc/lib/generic/math/clc_fma.cl b/libclc/clc/lib/generic/math/clc_fma.cl
new file mode 100644
index 0000000000000..7d65418d7b59d
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_fma.cl
@@ -0,0 +1,6 @@
+#include <clc/internal/clc.h>
+#include <clc/internal/math/clc_sw_fma.h>
+#include <clc/math/math.h>
+
+#define __CLC_BODY <clc_fma.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/math/clc_fma.inc b/libclc/clc/lib/generic/math/clc_fma.inc
new file mode 100644
index 0000000000000..5ab6d0a6b3863
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_fma.inc
@@ -0,0 +1,8 @@
+_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_fma(__CLC_GENTYPE a, __CLC_GENTYPE b,
+                                               __CLC_GENTYPE c) {
+#if __CLC_FPSIZE == 32
+  if (!__CLC_HAVE_HW_FMA32())
+    return __clc_sw_fma(a, b, c);
+#endif
+  return __builtin_elementwise_fma(a, b, c);
+}
diff --git a/libclc/generic/lib/math/clc_fma.cl b/libclc/clc/lib/generic/math/clc_sw_fma.cl
similarity index 79%
rename from libclc/generic/lib/math/clc_fma.cl
rename to libclc/clc/lib/generic/math/clc_sw_fma.cl
index 33f5072425d98..f1ce17b0348c4 100644
--- a/libclc/generic/lib/math/clc_fma.cl
+++ b/libclc/clc/lib/generic/math/clc_sw_fma.cl
@@ -20,11 +20,14 @@
  * THE SOFTWARE.
  */
 
-#include <clc/clc.h>
+#include <clc/clc_as_type.h>
 #include <clc/clcmacro.h>
+#include <clc/float/definitions.h>
 #include <clc/integer/clc_abs.h>
 #include <clc/integer/clc_clz.h>
-#include <clc/math/clc_subnormal_config.h>
+#include <clc/integer/definitions.h>
+#include <clc/internal/clc.h>
+#include <clc/math/clc_mad.h>
 #include <clc/math/math.h>
 #include <clc/relational/clc_isinf.h>
 #include <clc/relational/clc_isnan.h>
@@ -39,33 +42,36 @@ struct fp {
 _CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
   /* special cases */
   if (__clc_isnan(a) || __clc_isnan(b) || __clc_isnan(c) || __clc_isinf(a) ||
-      __clc_isinf(b))
-    return mad(a, b, c);
+      __clc_isinf(b)) {
+    return __clc_mad(a, b, c);
+  }
 
   /* If only c is inf, and both a,b are regular numbers, the result is c*/
-  if (__clc_isinf(c))
+  if (__clc_isinf(c)) {
     return c;
+  }
 
   a = __clc_flush_denormal_if_not_supported(a);
   b = __clc_flush_denormal_if_not_supported(b);
   c = __clc_flush_denormal_if_not_supported(c);
 
-  if (c == 0)
+  if (c == 0) {
     return a * b;
+  }
 
   struct fp st_a, st_b, st_c;
 
-  st_a.exponent = a == .0f ? 0 : ((as_uint(a) & 0x7f800000) >> 23) - 127;
-  st_b.exponent = b == .0f ? 0 : ((as_uint(b) & 0x7f800000) >> 23) - 127;
-  st_c.exponent = c == .0f ? 0 : ((as_uint(c) & 0x7f800000) >> 23) - 127;
+  st_a.exponent = a == .0f ? 0 : ((__clc_as_uint(a) & 0x7f800000) >> 23) - 127;
+  st_b.exponent = b == .0f ? 0 : ((__clc_as_uint(b) & 0x7f800000) >> 23) - 127;
+  st_c.exponent = c == .0f ? 0 : ((__clc_as_uint(c) & 0x7f800000) >> 23) - 127;
 
-  st_a.mantissa = a == .0f ? 0 : (as_uint(a) & 0x7fffff) | 0x800000;
-  st_b.mantissa = b == .0f ? 0 : (as_uint(b) & 0x7fffff) | 0x800000;
-  st_c.mantissa = c == .0f ? 0 : (as_uint(c) & 0x7fffff) | 0x800000;
+  st_a.mantissa = a == .0f ? 0 : (__clc_as_uint(a) & 0x7fffff) | 0x800000;
+  st_b.mantissa = b == .0f ? 0 : (__clc_as_uint(b) & 0x7fffff) | 0x800000;
+  st_c.mantissa = c == .0f ? 0 : (__clc_as_uint(c) & 0x7fffff) | 0x800000;
 
-  st_a.sign = as_uint(a) & 0x80000000;
-  st_b.sign = as_uint(b) & 0x80000000;
-  st_c.sign = as_uint(c) & 0x80000000;
+  st_a.sign = __clc_as_uint(a) & 0x80000000;
+  st_b.sign = __clc_as_uint(b) & 0x80000000;
+  st_c.sign = __clc_as_uint(c) & 0x80000000;
 
   // Multiplication.
   // Move the product to the highest bits to maximize precision
@@ -137,8 +143,9 @@ _CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
   ulong grs_bits = (0x4ul << (C_ADJUST - 3 + overflow_bits));
 
   // round to nearest even
-  if ((trunc_bits > grs_bits) || (trunc_bits == grs_bits && last_bit != 0))
+  if ((trunc_bits > grs_bits) || (trunc_bits == grs_bits && last_bit != 0)) {
     st_fma.mantissa += (1ul << (C_ADJUST + overflow_bits));
+  }
 
   // Shift mantissa back to bit 23
   st_fma.mantissa = (st_fma.mantissa >> (C_ADJUST + overflow_bits));
@@ -149,19 +156,23 @@ _CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
     st_fma.mantissa >>= 1;
   }
 
-  if (st_fma.mantissa == 0)
+  if (st_fma.mantissa == 0) {
     return .0f;
+  }
 
   // Flating point range limit
-  if (st_fma.exponent > 127)
-    return as_float(as_uint(INFINITY) | st_fma.sign);
+  if (st_fma.exponent > 127) {
+    return __clc_as_float(__clc_as_uint(INFINITY) | st_fma.sign);
+  }
 
   // Flush denormals
-  if (st_fma.exponent <= -127)
-    return as_float(st_fma.sign);
+  if (st_fma.exponent <= -127) {
+    return __clc_as_float(st_fma.sign);
+  }
 
-  return as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) |
-                  ((uint)st_fma.mantissa & 0x7fffff));
+  return __clc_as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) |
+                        ((uint)st_fma.mantissa & 0x7fffff));
 }
+
 _CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_sw_fma, float,
                        float, float)
diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES
new file mode 100644
index 0000000000000..cd6e0b2ea7088
--- /dev/null
+++ b/libclc/clc/lib/spirv/SOURCES
@@ -0,0 +1 @@
+math/clc_runtime_has_hw_fma32.cl
diff --git a/libclc/clc/lib/spirv/math/clc_runtime_has_hw_fma32.cl b/libclc/clc/lib/spirv/math/clc_runtime_has_hw_fma32.cl
new file mode 100644
index 0000000000000..8e3cf4f87fbb0
--- /dev/null
+++ b/libclc/clc/lib/spirv/math/clc_runtime_has_hw_fma32.cl
@@ -0,0 +1 @@
+bool __clc_runtime_has_hw_fma32() { return false; }
diff --git a/libclc/clspv/lib/math/fma.cl b/libclc/clspv/lib/math/fma.cl
index 73c6e158601d9..98af17c56fd34 100644
--- a/libclc/clspv/lib/math/fma.cl
+++ b/libclc/clspv/lib/math/fma.cl
@@ -1,274 +1,8 @@
-/*
- * Copyright (c) 2014 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-// This version is derived from the generic fma software implementation
-// (__clc_sw_fma), but avoids the use of ulong in favor of uint2. The logic has
-// been updated as appropriate.
-
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
-#include <clc/math/math.h>
-
-struct fp {
-  uint2 mantissa;
-  int exponent;
-  uint sign;
-};
-
-static uint2 u2_set(uint hi, uint lo) {
-  uint2 res;
-  res.lo = lo;
-  res.hi = hi;
-  return res;
-}
-
-static uint2 u2_set_u(uint val) { return u2_set(0, val); }
-
-static uint2 u2_mul(uint a, uint b) {
-  uint2 res;
-  res.hi = mul_hi(a, b);
-  res.lo = a * b;
-  return res;
-}
-
-static uint2 u2_sll(uint2 val, uint shift) {
-  if (shift == 0)
-    return val;
-  if (shift < 32) {
-    val.hi <<= shift;
-    val.hi |= val.lo >> (32 - shift);
-    val.lo <<= shift;
-  } else {
-    val.hi = val.lo << (shift - 32);
-    val.lo = 0;
-  }
-  return val;
-}
-
-static uint2 u2_srl(uint2 val, uint shift) {
-  if (shift == 0)
-    return val;
-  if (shift < 32) {
-    val.lo >>= shift;
-    val.lo |= val.hi << (32 - shift);
-    val.hi >>= shift;
-  } else {
-    val.lo = val.hi >> (shift - 32);
-    val.hi = 0;
-  }
-  return val;
-}
-
-static uint2 u2_or(uint2 a, uint b) {
-  a.lo |= b;
-  return a;
-}
-
-static uint2 u2_and(uint2 a, uint2 b) {
-  a.lo &= b.lo;
-  a.hi &= b.hi;
-  return a;
-}
-
-static uint2 u2_add(uint2 a, uint2 b) {
-  uint carry = (hadd(a.lo, b.lo) >> 31) & 0x1;
-  a.lo += b.lo;
-  a.hi += b.hi + carry;
-  return a;
-}
-
-static uint2 u2_add_u(uint2 a, uint b) { return u2_add(a, u2_set_u(b)); }
-
-static uint2 u2_inv(uint2 a) {
-  a.lo = ~a.lo;
-  a.hi = ~a.hi;
-  return u2_add_u(a, 1);
-}
-
-static uint u2_clz(uint2 a) {
-  uint leading_zeroes = clz(a.hi);
-  if (leading_zeroes == 32) {
-    leading_zeroes += clz(a.lo);
-  }
-  return leading_zeroes;
-}
-
-static bool u2_eq(uint2 a, uint2 b) { return a.lo == b.lo && a.hi == b.hi; }
-
-static bool u2_zero(uint2 a) { return u2_eq(a, u2_set_u(0)); }
-
-static bool u2_gt(uint2 a, uint2 b) {
-  return a.hi > b.hi || (a.hi == b.hi && a.lo > b.lo);
-}
+#include <clc/internal/math/clc_sw_fma.h>
 
-_CLC_DEF _CLC_OVERLOAD float fma(float a, float b, float c) {
-  /* special cases */
-  if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b)) {
-    return mad(a, b, c);
-  }
-
-  /* If only c is inf, and both a,b are regular numbers, the result is c*/
-  if (isinf(c)) {
-    return c;
-  }
-
-  a = __clc_flush_denormal_if_not_supported(a);
-  b = __clc_flush_denormal_if_not_supported(b);
-  c = __clc_flush_denormal_if_not_supported(c);
-
-  if (a == 0.0f || b == 0.0f) {
-    return c;
-  }
-
-  if (c == 0) {
-    return a * b;
-  }
-
-  struct fp st_a, st_b, st_c;
-
-  st_a.exponent = a == .0f ? 0 : ((as_uint(a) & 0x7f800000) >> 23) - 127;
-  st_b.exponent = b == .0f ? 0 : ((as_uint(b) & 0x7f800000) >> 23) - 127;
-  st_c.exponent = c == .0f ? 0 : ((as_uint(c) & 0x7f800000) >> 23) - 127;
-
-  st_a.mantissa = u2_set_u(a == .0f ? 0 : (as_uint(a) & 0x7fffff) | 0x800000);
-  st_b.mantissa = u2_set_u(b == .0f ? 0 : (as_uint(b) & 0x7fffff) | 0x800000);
-  st_c.mantissa = u2_set_u(c == .0f ? 0 : (as_uint(c) & 0x7fffff) | 0x800000);
-
-  st_a.sign = as_uint(a) & 0x80000000;
-  st_b.sign = as_uint(b) & 0x80000000;
-  st_c.sign = as_uint(c) & 0x80000000;
-
-  // Multiplication.
-  // Move the product to the highest bits to maximize precision
-  // mantissa is 24 bits => product is 48 bits, 2bits non-fraction.
-  // Add one bit for future addition overflow,
-  // add another bit to detect subtraction underflow
-  struct fp st_mul;
-  st_mul.sign = st_a.sign ^ st_b.sign;
-  st_mul.mantissa = u2_sll(u2_mul(st_a.mantissa.lo, st_b.mantissa.lo), 14);
-  st_mul.exponent =
-      !u2_zero(st_mul.mantissa) ? st_a.exponent + st_b.exponent : 0;
-
-  // FIXME: Detecting a == 0 || b == 0 above crashed GCN isel
-  if (st_mul.exponent == 0 && u2_zero(st_mul.mantissa))
-    return c;
-
-// Mantissa is 23 fractional bits, shift it the same way as product mantissa
-#define C_ADJUST 37ul
-
-  // both exponents are bias adjusted
-  int exp_diff = st_mul.exponent - st_c.exponent;
-
-  st_c.mantissa = u2_sll(st_c.mantissa, C_ADJUST);
-  uint2 cutoff_bits = u2_set_u(0);
-  uint2 cutoff_mask = u2_add(u2_sll(u2_set_u(1), abs(exp_diff)),
-                             u2_set(0xffffffff, 0xffffffff));
-  if (exp_diff > 0) {
-    cutoff_bits =
-        exp_diff >= 64 ? st_c.mantissa : u2_and(st_c.mantissa, cutoff_mask);
-    st_c.mantissa =
-        exp_diff >= 64 ? u2_set_u(0) : u2_srl(st_c.mantissa, exp_diff);
-  } else {
-    cutoff_bits = -exp_diff >= 64 ? st_mul.mantissa
-                                  : u2_and(st_mul.mantissa, cutoff_mask);
-    st_mul.mantissa =
-        -exp_diff >= 64 ? u2_set_u(0) : u2_srl(st_mul.mantissa, -exp_diff);
-  }
-
-  struct fp st_fma;
-  st_fma.sign = st_mul.sign;
-  st_fma.exponent = max(st_mul.exponent, st_c.exponent);
-  if (st_c.sign == st_mul.sign) {
-    st_fma.mantissa = u2_add(st_mul.mantissa, st_c.mantissa);
-  } else {
-    // cutoff bits borrow one
-    st_fma.mantissa =
-        u2_add(u2_add(st_mul.mantissa, u2_inv(st_c.mantissa)),
-               (!u2_zero(cutoff_bits) && (st_mul.exponent > st_c.exponent)
-                    ? u2_set(0xffffffff, 0xffffffff)
-                    : u2_set_u(0)));
-  }
-
-  // underflow: st_c.sign != st_mul.sign, and magnitude switches the sign
-  if (u2_gt(st_fma.mantissa, u2_set(0x7fffffff, 0xffffffff))) {
-    st_fma.mantissa = u2_inv(st_fma.mantissa);
-    st_fma.sign = st_mul.sign ^ 0x80000000;
-  }
-
-  // detect overflow/underflow
-  int overflow_bits = 3 - u2_clz(st_fma.mantissa);
-
-  // adjust exponent
-  st_fma.exponent += overflow_bits;
-
-  // handle underflow
-  if (overflow_bits < 0) {
-    st_fma.mantissa = u2_sll(st_fma.mantissa, -overflow_bits);
-    overflow_bits = 0;
-  }
-
-  // rounding
-  uint2 trunc_mask = u2_add(u2_sll(u2_set_u(1), C_ADJUST + overflow_bits),
-                            u2_set(0xffffffff, 0xffffffff));
-  uint2 trunc_bits =
-      u2_or(u2_and(st_fma.mantissa, trunc_mask), !u2_zero(cutoff_bits));
-  uint2 last_bit =
-      u2_and(st_fma.mantissa, u2_sll(u2_set_u(1), C_ADJUST + overflow_bits));
-  uint2 grs_bits = u2_sll(u2_set_u(4), C_ADJUST - 3 + overflow_bits);
-
-  // round to nearest even
-  if (u2_gt(trunc_bits, grs_bits) ||
-      (u2_eq(trunc_bits, grs_bits) && !u2_zero(last_bit))) {
-    st_fma.mantissa =
-        u2_add(st_fma.mantissa, u2_sll(u2_set_u(1), C_ADJUST + overflow_bits));
-  }
-
-  // Shift mantissa back to bit 23
-  st_fma.mantissa = u2_srl(st_fma.mantissa, C_ADJUST + overflow_bits);
-
-  // Detect rounding overflow
-  if (u2_gt(st_fma.mantissa, u2_set_u(0xffffff))) {
-    ++st_fma.exponent;
-    st_fma.mantissa = u2_srl(st_fma.mantissa, 1);
-  }
-
-  if (u2_zero(st_fma.mantissa)) {
-    return 0.0f;
-  }
-
-  // Flating point range limit
-  if (st_fma.exponent > 127) {
-    return as_float(as_uint(INFINITY) | st_fma.sign);
-  }
-
-  // Flush denormals
-  if (st_fma.exponent <= -127) {
-    return as_float(st_fma.sign);
-  }
-
-  return as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) |
-                  ((uint)st_fma.mantissa.lo & 0x7fffff));
-}
-_CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, fma, float, float, float)
+_CLC_DEFINE_TERNARY_BUILTIN(float, fma, __clc_sw_fma, float, float, float)
 
 #ifdef cl_khr_fp16
 
diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES
index a62c87902a6a7..bea78adf2cf08 100644
--- a/libclc/generic/lib/SOURCES
+++ b/libclc/generic/lib/SOURCES
@@ -106,7 +106,6 @@ math/exp10.cl
 math/fabs.cl
 math/fdim.cl
 math/floor.cl
-math/clc_fma.cl
 math/fma.cl
 math/fmax.cl
 math/fmin.cl
diff --git a/libclc/generic/lib/math/fma.cl b/libclc/generic/lib/math/fma.cl
index 00d5857fb897b..121895be81494 100644
--- a/libclc/generic/lib/math/fma.cl
+++ b/libclc/generic/lib/math/fma.cl
@@ -1,7 +1,20 @@
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/math.h>
 
-#include "math/clc_fma.h"
+_CLC_DEFINE_TERNARY_BUILTIN(float, fma, __clc_fma, float, float, float)
 
-#define __CLC_BODY <fma.inc>
-#include <clc/math/gentype.inc>
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEFINE_TERNARY_BUILTIN(double, fma, __clc_fma, double, double, double)
+
+#endif
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_TERNARY_BUILTIN(half, fma, __clc_fma, half, half, half)
+
+#endif
diff --git a/libclc/generic/lib/math/fma.inc b/libclc/generic/lib/math/fma.inc
deleted file mode 100644
index 654208fac21ac..0000000000000
--- a/libclc/generic/lib/math/fma.inc
+++ /dev/null
@@ -1,7 +0,0 @@
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE fma(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c) {
-#if __CLC_FPSIZE == 32 && HAVE_HW_FMA32() == 0
-	return __clc_sw_fma(a, b, c);
-#else
-	return __clc_fma(a, b, c);
-#endif
-}
diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl
index 441bad2be432f..9e2d375443888 100644
--- a/libclc/generic/lib/math/sincos_helpers.cl
+++ b/libclc/generic/lib/math/sincos_helpers.cl
@@ -121,7 +121,7 @@ _CLC_DEF float __clc_tanf_piby4(float x, int regn) {
 
 _CLC_DEF void __clc_fullMulS(private float *hi, private float *lo, float a,
                              float b, float bh, float bt) {
-  if (HAVE_HW_FMA32()) {
+  if (__CLC_HAVE_HW_FMA32()) {
     float ph = a * b;
     *hi = ph;
     *lo = fma(a, b, -ph);
@@ -310,7 +310,7 @@ _CLC_DEF int __clc_argReductionLargeS(private float *r, private float *rr,
 
   float rh, rt;
 
-  if (HAVE_HW_FMA32()) {
+  if (__CLC_HAVE_HW_FMA32()) {
     rh = q1 * pio2h;
     rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh)));
   } else {
diff --git a/libclc/spirv/lib/SOURCES b/libclc/spirv/lib/SOURCES
index 854cba614c8bf..82416a9e69bfc 100644
--- a/libclc/spirv/lib/SOURCES
+++ b/libclc/spirv/lib/SOURCES
@@ -41,7 +41,6 @@ subnormal_config.cl
 ../../generic/lib/math/exp2.cl
 ../../generic/lib/math/clc_exp10.cl
 ../../generic/lib/math/exp10.cl
-../../generic/lib/math/clc_fma.cl
 math/fma.cl
 ../../generic/lib/math/clc_fmod.cl
 ../../generic/lib/math/fmod.cl
diff --git a/libclc/spirv/lib/math/fma.cl b/libclc/spirv/lib/math/fma.cl
index 79142425e52d2..1fe7a67d0568a 100644
--- a/libclc/spirv/lib/math/fma.cl
+++ b/libclc/spirv/lib/math/fma.cl
@@ -1,11 +1,5 @@
 #include <clc/clc.h>
-#include <math/clc_fma.h>
+#include <clc/clcmacro.h>
+#include <clc/internal/math/clc_sw_fma.h>
 
-#define __CLC_BODY <fma.inc>
-#define __FLOAT_ONLY
-#include <clc/math/gentype.inc>
-
-bool __clc_runtime_has_hw_fma32()
-{
-    return false;
-}
+_CLC_DEFINE_TERNARY_BUILTIN(float, fma, __clc_sw_fma, float, float, float)
diff --git a/libclc/spirv/lib/math/fma.inc b/libclc/spirv/lib/math/fma.inc
deleted file mode 100644
index 0f12c565758ff..0000000000000
--- a/libclc/spirv/lib/math/fma.inc
+++ /dev/null
@@ -1,3 +0,0 @@
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE fma(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c) {
-	return __clc_sw_fma(a, b, c);
-}

>From f06c74aac2ee8fc5f18b7722941e7c5d9439380b Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser at codeplay.com>
Date: Thu, 6 Feb 2025 08:44:30 +0000
Subject: [PATCH 2/2] [libclc] Update uses of fma to __clc_fma in CLC functions

---
 libclc/generic/lib/math/clc_exp10.cl      | 27 +++++++------
 libclc/generic/lib/math/clc_fmod.cl       |  5 ++-
 libclc/generic/lib/math/clc_hypot.cl      |  3 +-
 libclc/generic/lib/math/clc_pow.cl        | 47 ++++++++++++----------
 libclc/generic/lib/math/clc_pown.cl       | 47 ++++++++++++----------
 libclc/generic/lib/math/clc_powr.cl       | 47 ++++++++++++----------
 libclc/generic/lib/math/clc_remainder.cl  |  7 ++--
 libclc/generic/lib/math/clc_remquo.cl     |  7 ++--
 libclc/generic/lib/math/clc_rootn.cl      | 49 +++++++++++++----------
 libclc/generic/lib/math/sincos_helpers.cl | 39 ++++++++++--------
 10 files changed, 156 insertions(+), 122 deletions(-)

diff --git a/libclc/generic/lib/math/clc_exp10.cl b/libclc/generic/lib/math/clc_exp10.cl
index 4f839a9815ac0..7c93e23de48bf 100644
--- a/libclc/generic/lib/math/clc_exp10.cl
+++ b/libclc/generic/lib/math/clc_exp10.cl
@@ -23,6 +23,7 @@
 #include <clc/clc.h>
 #include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
@@ -123,23 +124,25 @@ _CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x) {
   int j = n & 0x3f;
   int m = n >> 6;
 
-  double r =
-      R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));
+  double r = R_LN10 * __clc_fma(-R_LOG10_2_BY_64_TL, dn,
+                                __clc_fma(-R_LOG10_2_BY_64_LD, dn, x));
 
   // 6 term tail of Taylor expansion of e^r
   double z2 =
-      r *
-      fma(r,
-          fma(r,
-              fma(r,
-                  fma(r, fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
-                      0x1.5555555555555p-5),
-                  0x1.5555555555555p-3),
-              0x1.0000000000000p-1),
-          1.0);
+      r * __clc_fma(
+              r,
+              __clc_fma(r,
+                        __clc_fma(r,
+                                  __clc_fma(r,
+                                            __clc_fma(r, 0x1.6c16c16c16c17p-10,
+                                                      0x1.1111111111111p-7),
+                                            0x1.5555555555555p-5),
+                                  0x1.5555555555555p-3),
+                        0x1.0000000000000p-1),
+              1.0);
 
   double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
-  z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;
+  z2 = __clc_fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;
 
   int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));
 
diff --git a/libclc/generic/lib/math/clc_fmod.cl b/libclc/generic/lib/math/clc_fmod.cl
index 31a5d4dc05c03..b33dae23550c5 100644
--- a/libclc/generic/lib/math/clc_fmod.cl
+++ b/libclc/generic/lib/math/clc_fmod.cl
@@ -25,6 +25,7 @@
 #include <clc/clcmacro.h>
 #include <clc/integer/clc_clz.h>
 #include <clc/math/clc_floor.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/math/math.h>
@@ -124,7 +125,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) {
 
     // Compute w * t in quad precision
     p = w * t;
-    pp = fma(w, t, -p);
+    pp = __clc_fma(w, t, -p);
 
     // Subtract w * t from dx
     v = dx - p;
@@ -144,7 +145,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) {
   int todd = lt & 1;
 
   p = w * t;
-  pp = fma(w, t, -p);
+  pp = __clc_fma(w, t, -p);
   v = dx - p;
   dx = v + (((dx - v) - p) - pp);
   i = dx < 0.0;
diff --git a/libclc/generic/lib/math/clc_hypot.cl b/libclc/generic/lib/math/clc_hypot.cl
index fd2e87b4a1ed8..5e6a99b70f22a 100644
--- a/libclc/generic/lib/math/clc_hypot.cl
+++ b/libclc/generic/lib/math/clc_hypot.cl
@@ -23,6 +23,7 @@
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
 #include <clc/integer/clc_abs.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
@@ -80,7 +81,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) {
   double ay = y * preadjust;
 
   // The post adjust may overflow, but this can't be avoided in any case
-  double r = sqrt(fma(ax, ax, ay * ay)) * postadjust;
+  double r = sqrt(__clc_fma(ax, ax, ay * ay)) * postadjust;
 
   // If the difference in exponents between x and y is large
   double s = x + y;
diff --git a/libclc/generic/lib/math/clc_pow.cl b/libclc/generic/lib/math/clc_pow.cl
index fce9573c39bac..43794d9afea10 100644
--- a/libclc/generic/lib/math/clc_pow.cl
+++ b/libclc/generic/lib/math/clc_pow.cl
@@ -24,6 +24,7 @@
 #include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
@@ -283,26 +284,29 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) {
     double log_t = tv.s1;
     double f_inv = (log_h + log_t) * f;
     double r1 = __clc_as_double(__clc_as_long(f_inv) & 0xfffffffff8000000L);
-    double r2 = fma(-F, r1, f) * (log_h + log_t);
+    double r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
     double r = r1 + r2;
 
-    double poly = fma(
-        r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+    double poly = __clc_fma(
+        r,
+        __clc_fma(r,
+                  __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
+                  1.0 / 4.0),
         1.0 / 3.0);
     poly = poly * r * r * r;
 
     double hr1r1 = 0.5 * r1 * r1;
     double poly0h = r1 + hr1r1;
     double poly0t = r1 - poly0h + hr1r1;
-    poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+    poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
 
     tv = USE_TABLE(powlog_tbl, index);
     log_h = tv.s0;
     log_t = tv.s1;
 
-    double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+    double resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
     double resT = resT_t - poly0h;
-    double resH = fma(xexp, real_log2_lead, log_h);
+    double resH = __clc_fma(xexp, real_log2_lead, log_h);
     double resT_h = poly0h;
 
     double H = resT + resH;
@@ -313,9 +317,9 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) {
     double y_head = __clc_as_double(uy & 0xfffffffff8000000L);
     double y_tail = y - y_head;
 
-    double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
-    v = fma(y_head, H, temp);
-    vt = fma(y_head, H, -v) + temp;
+    double temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
+    v = __clc_fma(y_head, H, temp);
+    vt = __clc_fma(y_head, H, -v) + temp;
   }
 
   // Now calculate exp of (v,vt)
@@ -339,21 +343,22 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) {
     double f2 = tv.s1;
     double f = f1 + f2;
 
-    double r1 = fma(dn, -lnof2_by_64_head, v);
+    double r1 = __clc_fma(dn, -lnof2_by_64_head, v);
     double r2 = dn * lnof2_by_64_tail;
     double r = (r1 + r2) + vt;
 
-    double q = fma(
-        r,
-        fma(r,
-            fma(r,
-                fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
-                4.16666666662260795726e-02),
-            1.66666666665260878863e-01),
-        5.00000000000000008883e-01);
-    q = fma(r * r, q, r);
-
-    expv = fma(f, q, f2) + f1;
+    double q =
+        __clc_fma(r,
+                  __clc_fma(r,
+                            __clc_fma(r,
+                                      __clc_fma(r, 1.38889490863777199667e-03,
+                                                8.33336798434219616221e-03),
+                                      4.16666666662260795726e-02),
+                            1.66666666665260878863e-01),
+                  5.00000000000000008883e-01);
+    q = __clc_fma(r * r, q, r);
+
+    expv = __clc_fma(f, q, f2) + f1;
     expv = ldexp(expv, m);
 
     expv = v > max_exp_arg ? __clc_as_double(0x7FF0000000000000L) : expv;
diff --git a/libclc/generic/lib/math/clc_pown.cl b/libclc/generic/lib/math/clc_pown.cl
index a613b2998c3f6..3db67791ab169 100644
--- a/libclc/generic/lib/math/clc_pown.cl
+++ b/libclc/generic/lib/math/clc_pown.cl
@@ -24,6 +24,7 @@
 #include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
@@ -267,26 +268,29 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) {
     double log_t = tv.s1;
     double f_inv = (log_h + log_t) * f;
     double r1 = __clc_as_double(__clc_as_long(f_inv) & 0xfffffffff8000000L);
-    double r2 = fma(-F, r1, f) * (log_h + log_t);
+    double r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
     double r = r1 + r2;
 
-    double poly = fma(
-        r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+    double poly = __clc_fma(
+        r,
+        __clc_fma(r,
+                  __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
+                  1.0 / 4.0),
         1.0 / 3.0);
     poly = poly * r * r * r;
 
     double hr1r1 = 0.5 * r1 * r1;
     double poly0h = r1 + hr1r1;
     double poly0t = r1 - poly0h + hr1r1;
-    poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+    poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
 
     tv = USE_TABLE(powlog_tbl, index);
     log_h = tv.s0;
     log_t = tv.s1;
 
-    double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+    double resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
     double resT = resT_t - poly0h;
-    double resH = fma(xexp, real_log2_lead, log_h);
+    double resH = __clc_fma(xexp, real_log2_lead, log_h);
     double resT_h = poly0h;
 
     double H = resT + resH;
@@ -303,9 +307,9 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) {
     double y_tail1 = (double)nyt;
     y_tail = mask_2_24 ? y_tail1 : y_tail;
 
-    double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
-    v = fma(y_head, H, temp);
-    vt = fma(y_head, H, -v) + temp;
+    double temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
+    v = __clc_fma(y_head, H, temp);
+    vt = __clc_fma(y_head, H, -v) + temp;
   }
 
   // Now calculate exp of (v,vt)
@@ -329,21 +333,22 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) {
     double f2 = tv.s1;
     double f = f1 + f2;
 
-    double r1 = fma(dn, -lnof2_by_64_head, v);
+    double r1 = __clc_fma(dn, -lnof2_by_64_head, v);
     double r2 = dn * lnof2_by_64_tail;
     double r = (r1 + r2) + vt;
 
-    double q = fma(
-        r,
-        fma(r,
-            fma(r,
-                fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
-                4.16666666662260795726e-02),
-            1.66666666665260878863e-01),
-        5.00000000000000008883e-01);
-    q = fma(r * r, q, r);
-
-    expv = fma(f, q, f2) + f1;
+    double q =
+        __clc_fma(r,
+                  __clc_fma(r,
+                            __clc_fma(r,
+                                      __clc_fma(r, 1.38889490863777199667e-03,
+                                                8.33336798434219616221e-03),
+                                      4.16666666662260795726e-02),
+                            1.66666666665260878863e-01),
+                  5.00000000000000008883e-01);
+    q = __clc_fma(r * r, q, r);
+
+    expv = __clc_fma(f, q, f2) + f1;
     expv = ldexp(expv, m);
 
     expv = v > max_exp_arg ? __clc_as_double(0x7FF0000000000000L) : expv;
diff --git a/libclc/generic/lib/math/clc_powr.cl b/libclc/generic/lib/math/clc_powr.cl
index 7876acaee89a6..ccfb7a0111229 100644
--- a/libclc/generic/lib/math/clc_powr.cl
+++ b/libclc/generic/lib/math/clc_powr.cl
@@ -24,6 +24,7 @@
 #include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
@@ -270,26 +271,29 @@ _CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) {
     double log_t = tv.s1;
     double f_inv = (log_h + log_t) * f;
     double r1 = __clc_as_double(__clc_as_long(f_inv) & 0xfffffffff8000000L);
-    double r2 = fma(-F, r1, f) * (log_h + log_t);
+    double r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
     double r = r1 + r2;
 
-    double poly = fma(
-        r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+    double poly = __clc_fma(
+        r,
+        __clc_fma(r,
+                  __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
+                  1.0 / 4.0),
         1.0 / 3.0);
     poly = poly * r * r * r;
 
     double hr1r1 = 0.5 * r1 * r1;
     double poly0h = r1 + hr1r1;
     double poly0t = r1 - poly0h + hr1r1;
-    poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+    poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
 
     tv = USE_TABLE(powlog_tbl, index);
     log_h = tv.s0;
     log_t = tv.s1;
 
-    double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+    double resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
     double resT = resT_t - poly0h;
-    double resH = fma(xexp, real_log2_lead, log_h);
+    double resH = __clc_fma(xexp, real_log2_lead, log_h);
     double resT_h = poly0h;
 
     double H = resT + resH;
@@ -300,9 +304,9 @@ _CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) {
     double y_head = __clc_as_double(uy & 0xfffffffff8000000L);
     double y_tail = y - y_head;
 
-    double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
-    v = fma(y_head, H, temp);
-    vt = fma(y_head, H, -v) + temp;
+    double temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
+    v = __clc_fma(y_head, H, temp);
+    vt = __clc_fma(y_head, H, -v) + temp;
   }
 
   // Now calculate exp of (v,vt)
@@ -326,21 +330,22 @@ _CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) {
     double f2 = tv.s1;
     double f = f1 + f2;
 
-    double r1 = fma(dn, -lnof2_by_64_head, v);
+    double r1 = __clc_fma(dn, -lnof2_by_64_head, v);
     double r2 = dn * lnof2_by_64_tail;
     double r = (r1 + r2) + vt;
 
-    double q = fma(
-        r,
-        fma(r,
-            fma(r,
-                fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
-                4.16666666662260795726e-02),
-            1.66666666665260878863e-01),
-        5.00000000000000008883e-01);
-    q = fma(r * r, q, r);
-
-    expv = fma(f, q, f2) + f1;
+    double q =
+        __clc_fma(r,
+                  __clc_fma(r,
+                            __clc_fma(r,
+                                      __clc_fma(r, 1.38889490863777199667e-03,
+                                                8.33336798434219616221e-03),
+                                      4.16666666662260795726e-02),
+                            1.66666666665260878863e-01),
+                  5.00000000000000008883e-01);
+    q = __clc_fma(r * r, q, r);
+
+    expv = __clc_fma(f, q, f2) + f1;
     expv = ldexp(expv, m);
 
     expv = v > max_exp_arg ? __clc_as_double(0x7FF0000000000000L) : expv;
diff --git a/libclc/generic/lib/math/clc_remainder.cl b/libclc/generic/lib/math/clc_remainder.cl
index 6302b9776782f..d517e36b53277 100644
--- a/libclc/generic/lib/math/clc_remainder.cl
+++ b/libclc/generic/lib/math/clc_remainder.cl
@@ -25,6 +25,7 @@
 #include <clc/clcmacro.h>
 #include <clc/integer/clc_clz.h>
 #include <clc/math/clc_floor.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/math/math.h>
@@ -136,7 +137,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) {
 
     // Compute w * t in quad precision
     p = w * t;
-    pp = fma(w, t, -p);
+    pp = __clc_fma(w, t, -p);
 
     // Subtract w * t from dx
     v = dx - p;
@@ -156,7 +157,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) {
   int todd = lt & 1;
 
   p = w * t;
-  pp = fma(w, t, -p);
+  pp = __clc_fma(w, t, -p);
   v = dx - p;
   dx = v + (((dx - v) - p) - pp);
   i = dx < 0.0;
@@ -197,7 +198,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) {
   c &= (yexp<1023 & 2.0 * dx> dy) | (dx > 0.5 * dy);
   // we could use a conversion here instead since qsgn = +-1
   p = qsgn == 1 ? -1.0 : 1.0;
-  t = fma(y, p, x);
+  t = __clc_fma(y, p, x);
   ret = c ? t : ret;
 
   // We don't need anything special for |x| == 0
diff --git a/libclc/generic/lib/math/clc_remquo.cl b/libclc/generic/lib/math/clc_remquo.cl
index 699517e180708..d2a5845bc2684 100644
--- a/libclc/generic/lib/math/clc_remquo.cl
+++ b/libclc/generic/lib/math/clc_remquo.cl
@@ -25,6 +25,7 @@
 #include <clc/clcmacro.h>
 #include <clc/integer/clc_clz.h>
 #include <clc/math/clc_floor.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/math/math.h>
@@ -176,7 +177,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y,
 
     // Compute w * t in quad precision
     p = w * t;
-    pp = fma(w, t, -p);
+    pp = __clc_fma(w, t, -p);
 
     // Subtract w * t from dx
     v = dx - p;
@@ -196,7 +197,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y,
   int todd = lt & 1;
 
   p = w * t;
-  pp = fma(w, t, -p);
+  pp = __clc_fma(w, t, -p);
   v = dx - p;
   dx = v + (((dx - v) - p) - pp);
   i = dx < 0.0;
@@ -242,7 +243,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y,
   quo = c ? qsgn : quo;
   // we could use a conversion here instead since qsgn = +-1
   p = qsgn == 1 ? -1.0 : 1.0;
-  t = fma(y, p, x);
+  t = __clc_fma(y, p, x);
   ret = c ? t : ret;
 
   // We don't need anything special for |x| == 0
diff --git a/libclc/generic/lib/math/clc_rootn.cl b/libclc/generic/lib/math/clc_rootn.cl
index dabaa2a4f3f2a..42bb69f96836a 100644
--- a/libclc/generic/lib/math/clc_rootn.cl
+++ b/libclc/generic/lib/math/clc_rootn.cl
@@ -24,6 +24,7 @@
 #include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
@@ -271,26 +272,29 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) {
     double log_t = tv.s1;
     double f_inv = (log_h + log_t) * f;
     double r1 = __clc_as_double(__clc_as_long(f_inv) & 0xfffffffff8000000L);
-    double r2 = fma(-F, r1, f) * (log_h + log_t);
+    double r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
     double r = r1 + r2;
 
-    double poly = fma(
-        r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+    double poly = __clc_fma(
+        r,
+        __clc_fma(r,
+                  __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
+                  1.0 / 4.0),
         1.0 / 3.0);
     poly = poly * r * r * r;
 
     double hr1r1 = 0.5 * r1 * r1;
     double poly0h = r1 + hr1r1;
     double poly0t = r1 - poly0h + hr1r1;
-    poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+    poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
 
     tv = USE_TABLE(powlog_tbl, index);
     log_h = tv.s0;
     log_t = tv.s1;
 
-    double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+    double resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
     double resT = resT_t - poly0h;
-    double resH = fma(xexp, real_log2_lead, log_h);
+    double resH = __clc_fma(xexp, real_log2_lead, log_h);
     double resT_h = poly0h;
 
     double H = resT + resH;
@@ -303,11 +307,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) {
 
     double fnyh = __clc_as_double(__clc_as_long(dny) & 0xfffffffffff00000);
     double fnyt = (double)(ny - (int)fnyh);
-    y_tail = fma(-fnyt, y_head, fma(-fnyh, y_head, 1.0)) / dny;
+    y_tail = __clc_fma(-fnyt, y_head, __clc_fma(-fnyh, y_head, 1.0)) / dny;
 
-    double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
-    v = fma(y_head, H, temp);
-    vt = fma(y_head, H, -v) + temp;
+    double temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
+    v = __clc_fma(y_head, H, temp);
+    vt = __clc_fma(y_head, H, -v) + temp;
   }
 
   // Now calculate exp of (v,vt)
@@ -331,21 +335,22 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) {
     double f2 = tv.s1;
     double f = f1 + f2;
 
-    double r1 = fma(dn, -lnof2_by_64_head, v);
+    double r1 = __clc_fma(dn, -lnof2_by_64_head, v);
     double r2 = dn * lnof2_by_64_tail;
     double r = (r1 + r2) + vt;
 
-    double q = fma(
-        r,
-        fma(r,
-            fma(r,
-                fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
-                4.16666666662260795726e-02),
-            1.66666666665260878863e-01),
-        5.00000000000000008883e-01);
-    q = fma(r * r, q, r);
-
-    expv = fma(f, q, f2) + f1;
+    double q =
+        __clc_fma(r,
+                  __clc_fma(r,
+                            __clc_fma(r,
+                                      __clc_fma(r, 1.38889490863777199667e-03,
+                                                8.33336798434219616221e-03),
+                                      4.16666666662260795726e-02),
+                            1.66666666665260878863e-01),
+                  5.00000000000000008883e-01);
+    q = __clc_fma(r * r, q, r);
+
+    expv = __clc_fma(f, q, f2) + f1;
     expv = ldexp(expv, m);
 
     expv = v > max_exp_arg ? __clc_as_double(0x7FF0000000000000L) : expv;
diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl
index 9e2d375443888..57c914fea5a89 100644
--- a/libclc/generic/lib/math/sincos_helpers.cl
+++ b/libclc/generic/lib/math/sincos_helpers.cl
@@ -24,6 +24,7 @@
 #include <clc/clc.h>
 #include <clc/integer/clc_clz.h>
 #include <clc/integer/clc_mul_hi.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/math/math.h>
@@ -124,7 +125,7 @@ _CLC_DEF void __clc_fullMulS(private float *hi, private float *lo, float a,
   if (__CLC_HAVE_HW_FMA32()) {
     float ph = a * b;
     *hi = ph;
-    *lo = fma(a, b, -ph);
+    *lo = __clc_fma(a, b, -ph);
   } else {
     float ah = as_float(as_uint(a) & 0xfffff000U);
     float at = a - ah;
@@ -312,7 +313,7 @@ _CLC_DEF int __clc_argReductionLargeS(private float *r, private float *rr,
 
   if (__CLC_HAVE_HW_FMA32()) {
     rh = q1 * pio2h;
-    rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh)));
+    rt = __clc_fma(q0, pio2h, __clc_fma(q1, pio2t, __clc_fma(q1, pio2h, -rh)));
   } else {
     float q1h = as_float(as_uint(q1) & 0xfffff000);
     float q1t = q1 - q1h;
@@ -349,7 +350,7 @@ _CLC_DEF void __clc_remainder_piby2_medium(double x, private double *r,
                                            private int *regn) {
   // How many pi/2 is x a multiple of?
   const double two_by_pi = 0x1.45f306dc9c883p-1;
-  double dnpi2 = __clc_trunc(fma(x, two_by_pi, 0.5));
+  double dnpi2 = __clc_trunc(__clc_fma(x, two_by_pi, 0.5));
 
   const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
   const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
@@ -357,11 +358,11 @@ _CLC_DEF void __clc_remainder_piby2_medium(double x, private double *r,
 
   // Compute product of npi2 with 159 bits of 2/pi
   double p_hh = piby2_h * dnpi2;
-  double p_ht = fma(piby2_h, dnpi2, -p_hh);
+  double p_ht = __clc_fma(piby2_h, dnpi2, -p_hh);
   double p_mh = piby2_m * dnpi2;
-  double p_mt = fma(piby2_m, dnpi2, -p_mh);
+  double p_mt = __clc_fma(piby2_m, dnpi2, -p_mh);
   double p_th = piby2_t * dnpi2;
-  double p_tt = fma(piby2_t, dnpi2, -p_th);
+  double p_tt = __clc_fma(piby2_t, dnpi2, -p_th);
 
   // Reduce to 159 bits
   double ph = p_hh;
@@ -469,13 +470,13 @@ _CLC_DEF void __clc_remainder_piby2_large(double x, private double *r,
 
   // Exact multiply
   double f0h = p0 * x;
-  double f0l = fma(p0, x, -f0h);
+  double f0l = __clc_fma(p0, x, -f0h);
   double f1h = p1 * x;
-  double f1l = fma(p1, x, -f1h);
+  double f1l = __clc_fma(p1, x, -f1h);
   double f2h = p2 * x;
-  double f2l = fma(p2, x, -f2h);
+  double f2l = __clc_fma(p2, x, -f2h);
   double f3h = p3 * x;
-  double f3l = fma(p3, x, -f3h);
+  double f3l = __clc_fma(p3, x, -f3h);
 
   // Accumulate product into 4 doubles
   double s, t;
@@ -533,7 +534,7 @@ _CLC_DEF void __clc_remainder_piby2_large(double x, private double *r,
   const double p2t = 4967757600021510.0 / 0x1.0p+106;
 
   double rhi = f3 * p2h;
-  double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
+  double rlo = __clc_fma(f2, p2h, __clc_fma(f3, p2t, __clc_fma(f3, p2h, -rhi)));
 
   *r = rhi + rlo;
   *rr = rlo - (*r - rhi);
@@ -581,15 +582,21 @@ _CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
   double r = 0.5 * x2;
   double t = 1.0 - r;
 
-  double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
+  double sp = __clc_fma(
+      __clc_fma(__clc_fma(__clc_fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
 
   double cp =
-      t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2),
-                  x2, cc1),
-              x2 * x2, fma(x, xx, (1.0 - t) - r));
+      t +
+      __clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(cc6, x2, cc5),
+                                                        x2, cc4),
+                                              x2, cc3),
+                                    x2, cc2),
+                          x2, cc1),
+                x2 * x2, __clc_fma(x, xx, (1.0 - t) - r));
 
   double2 ret;
-  ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5 * xx), x2, -xx));
+  ret.lo =
+      x - __clc_fma(-x3, sc1, __clc_fma(__clc_fma(-x3, sp, 0.5 * xx), x2, -xx));
   ret.hi = cp;
 
   return ret;