[libclc] [libclc] Move mad to the CLC library (PR #123607)

Mon Jan 20 04:58:53 PST 2025

https://github.com/frasercrmck created https://github.com/llvm/llvm-project/pull/123607

All targets build `__clc_mad` -- even SPIR-V targets -- since it compiles to the optimal `llvm.fmuladd` intrinsic. There is no change to the bytecode generated for non-SPIR-V targets.

The `mix` builtin, which is implemented as a wrapper around `mad`, is left as an OpenCL-layer wrapper of `__clc_mad`. I don't know if it's worth having a specific CLC version of `mix`.

The changes to the other CLC files/functions are moving uses of `mad` to `__clc_mad`, and reformatting. There is an additional instance of `trunc` becoming `__clc_trunc`, which was missed before.

>From 97f51f4ee193d364a3f1a343bf8f0df7eda04174 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser at codeplay.com>
Date: Mon, 20 Jan 2025 12:45:26 +0000
Subject: [PATCH] [libclc] Move mad to the CLC library

The mix builtin, which is implemented as a wrapper around mad, is left
as an OpenCL-layer wrapper of __clc_mad. I don't know if it's worth
having a specific CLC version of mix.

All targets build __clc_mad -- even SPIR-V targets -- since it compiles
to the optimal llvm.fmuladd intrinsic. There is no change to the
bytecode generated for non-SPIR-V targets.

The changes to the other CLC files/functions are moving uses of mad to
__clc_mad, and reformatting. There is an additional instance of trunc
becoming __clc_trunc, which was missed before.
---
 libclc/clc/include/clc/clcmacro.h             |   27 +
 libclc/clc/include/clc/math/clc_mad.h         |   12 +
 libclc/clc/include/clc/math/ternary_decl.inc  |    3 +
 libclc/clc/lib/clspv/SOURCES                  |    1 +
 libclc/clc/lib/generic/SOURCES                |    1 +
 libclc/clc/lib/generic/math/clc_mad.cl        |    4 +
 libclc/clc/lib/generic/math/clc_mad.inc       |    4 +
 libclc/clc/lib/spirv/SOURCES                  |    1 +
 libclc/clc/lib/spirv64/SOURCES                |    1 +
 .../generic/include/clc/math/ternary_decl.inc |    1 -
 libclc/generic/lib/common/mix.cl              |    1 +
 libclc/generic/lib/common/mix.inc             |   10 +-
 libclc/generic/lib/math/clc_exp10.cl          |  188 ++--
 libclc/generic/lib/math/clc_hypot.cl          |    3 +-
 libclc/generic/lib/math/clc_pow.cl            |  666 +++++------
 libclc/generic/lib/math/clc_pown.cl           |  610 +++++-----
 libclc/generic/lib/math/clc_powr.cl           |  632 ++++++-----
 libclc/generic/lib/math/clc_rootn.cl          |  609 +++++-----
 libclc/generic/lib/math/mad.cl                |   19 +-
 libclc/generic/lib/math/mad.inc               |    3 -
 libclc/generic/lib/math/sincos_helpers.cl     | 1000 +++++++++--------
 libclc/generic/lib/math/sincospiF_piby4.h     |   57 +-
 22 files changed, 2013 insertions(+), 1840 deletions(-)
 create mode 100644 libclc/clc/include/clc/math/clc_mad.h
 create mode 100644 libclc/clc/include/clc/math/ternary_decl.inc
 create mode 100644 libclc/clc/lib/generic/math/clc_mad.cl
 create mode 100644 libclc/clc/lib/generic/math/clc_mad.inc
 delete mode 100644 libclc/generic/include/clc/math/ternary_decl.inc
 delete mode 100644 libclc/generic/lib/math/mad.inc

diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h
index 3c3a69f4f848bb..44928b2e428bfa 100644
--- a/libclc/clc/include/clc/clcmacro.h
+++ b/libclc/clc/include/clc/clcmacro.h
@@ -184,6 +184,33 @@
     return BUILTIN(x);                                                         \
   }
 
+#define _CLC_DEFINE_TERNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE,    \
+                                    ARG2_TYPE, ARG3_TYPE)                      \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y,           \
+                                           ARG3_TYPE z) {                      \
+    return BUILTIN(x, y, z);                                                   \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y,  \
+                                              ARG3_TYPE##2 z) {                \
+    return BUILTIN(x, y, z);                                                   \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y,  \
+                                              ARG3_TYPE##3 z) {                \
+    return BUILTIN(x, y, z);                                                   \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y,  \
+                                              ARG3_TYPE##4 z) {                \
+    return BUILTIN(x, y, z);                                                   \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y,  \
+                                              ARG3_TYPE##8 z) {                \
+    return BUILTIN(x, y, z);                                                   \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE##16 FUNCTION(                                \
+      ARG1_TYPE##16 x, ARG2_TYPE##16 y, ARG3_TYPE##16 z) {                     \
+    return BUILTIN(x, y, z);                                                   \
+  }
+
 #ifdef cl_khr_fp16
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
diff --git a/libclc/clc/include/clc/math/clc_mad.h b/libclc/clc/include/clc/math/clc_mad.h
new file mode 100644
index 00000000000000..3eb718e87f3705
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_mad.h
@@ -0,0 +1,12 @@
+#ifndef __CLC_MATH_CLC_MAD_H__
+#define __CLC_MATH_CLC_MAD_H__
+
+#define __CLC_BODY <clc/math/ternary_decl.inc>
+#define __CLC_FUNCTION __clc_mad
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_MAD_H__
diff --git a/libclc/clc/include/clc/math/ternary_decl.inc b/libclc/clc/include/clc/math/ternary_decl.inc
new file mode 100644
index 00000000000000..6c1507b3fcf74b
--- /dev/null
+++ b/libclc/clc/include/clc/math/ternary_decl.inc
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
+                                                     __CLC_GENTYPE b,
+                                                     __CLC_GENTYPE c);
diff --git a/libclc/clc/lib/clspv/SOURCES b/libclc/clc/lib/clspv/SOURCES
index e6573f586080cf..209dc0ca61e2b8 100644
--- a/libclc/clc/lib/clspv/SOURCES
+++ b/libclc/clc/lib/clspv/SOURCES
@@ -1,6 +1,7 @@
 ../generic/math/clc_ceil.cl
 ../generic/math/clc_fabs.cl
 ../generic/math/clc_floor.cl
+../generic/math/clc_mad.cl
 ../generic/math/clc_rint.cl
 ../generic/math/clc_trunc.cl
 ../generic/shared/clc_clamp.cl
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index d74bff20ba87ba..877a0a390a7452 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -7,6 +7,7 @@ integer/clc_abs_diff.cl
 math/clc_ceil.cl
 math/clc_fabs.cl
 math/clc_floor.cl
+math/clc_mad.cl
 math/clc_rint.cl
 math/clc_trunc.cl
 relational/clc_all.cl
diff --git a/libclc/clc/lib/generic/math/clc_mad.cl b/libclc/clc/lib/generic/math/clc_mad.cl
new file mode 100644
index 00000000000000..58618cf2477177
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_mad.cl
@@ -0,0 +1,4 @@
+#include <clc/internal/clc.h>
+
+#define __CLC_BODY <clc_mad.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/math/clc_mad.inc b/libclc/clc/lib/generic/math/clc_mad.inc
new file mode 100644
index 00000000000000..c2cf7f65c00f14
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_mad.inc
@@ -0,0 +1,4 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_mad(__CLC_GENTYPE a, __CLC_GENTYPE b,
+                                               __CLC_GENTYPE c) {
+  return a * b + c;
+}
diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES
index ac855ea5184ede..905afa03d8a56c 100644
--- a/libclc/clc/lib/spirv/SOURCES
+++ b/libclc/clc/lib/spirv/SOURCES
@@ -5,6 +5,7 @@
 ../generic/math/clc_ceil.cl
 ../generic/math/clc_fabs.cl
 ../generic/math/clc_floor.cl
+../generic/math/clc_mad.cl
 ../generic/math/clc_rint.cl
 ../generic/math/clc_trunc.cl
 ../generic/shared/clc_clamp.cl
diff --git a/libclc/clc/lib/spirv64/SOURCES b/libclc/clc/lib/spirv64/SOURCES
index ac855ea5184ede..905afa03d8a56c 100644
--- a/libclc/clc/lib/spirv64/SOURCES
+++ b/libclc/clc/lib/spirv64/SOURCES
@@ -5,6 +5,7 @@
 ../generic/math/clc_ceil.cl
 ../generic/math/clc_fabs.cl
 ../generic/math/clc_floor.cl
+../generic/math/clc_mad.cl
 ../generic/math/clc_rint.cl
 ../generic/math/clc_trunc.cl
 ../generic/shared/clc_clamp.cl
diff --git a/libclc/generic/include/clc/math/ternary_decl.inc b/libclc/generic/include/clc/math/ternary_decl.inc
deleted file mode 100644
index 0598684ea4069f..00000000000000
--- a/libclc/generic/include/clc/math/ternary_decl.inc
+++ /dev/null
@@ -1 +0,0 @@
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c);
diff --git a/libclc/generic/lib/common/mix.cl b/libclc/generic/lib/common/mix.cl
index 7f3d5b61497b2e..756e8619f1b3f2 100644
--- a/libclc/generic/lib/common/mix.cl
+++ b/libclc/generic/lib/common/mix.cl
@@ -1,4 +1,5 @@
 #include <clc/clc.h>
+#include <clc/math/clc_mad.h>
 
 #define __CLC_BODY <mix.inc>
 #include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/common/mix.inc b/libclc/generic/lib/common/mix.inc
index 1e8b936149bbfc..fd45a810b0ed9d 100644
--- a/libclc/generic/lib/common/mix.inc
+++ b/libclc/generic/lib/common/mix.inc
@@ -1,9 +1,11 @@
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE a) {
-  return mad( y - x, a, x );
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y,
+                                         __CLC_GENTYPE a) {
+  return __clc_mad(y - x, a, x);
 }
 
 #ifndef __CLC_SCALAR
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_SCALAR_GENTYPE a) {
-    return mix(x, y, (__CLC_GENTYPE)a);
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y,
+                                         __CLC_SCALAR_GENTYPE a) {
+  return mix(x, y, (__CLC_GENTYPE)a);
 }
 #endif
diff --git a/libclc/generic/lib/math/clc_exp10.cl b/libclc/generic/lib/math/clc_exp10.cl
index 6ea8743e39c5f7..572aa396942b7d 100644
--- a/libclc/generic/lib/math/clc_exp10.cl
+++ b/libclc/generic/lib/math/clc_exp10.cl
@@ -22,6 +22,7 @@
 
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
+#include <clc/math/clc_mad.h>
 #include <clc/relational/clc_isnan.h>
 
 #include "config.h"
@@ -53,98 +54,109 @@
 //
 //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
 
-_CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x)
-{
-    const float X_MAX =  0x1.344134p+5f; // 128*log2/log10 : 38.53183944498959
-    const float X_MIN = -0x1.66d3e8p+5f; // -149*log2/log10 : -44.8534693539332
-
-    const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f; // 64*log10/log2 : 212.6033980727912
-    const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f; // log2/(64 * log10) lead : 0.004699707
-    const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f; // log2/(64 * log10) tail : 0.00000388665057
-    const float R_LN10 = 0x1.26bb1cp+1f;
-
-    int return_nan = __clc_isnan(x);
-    int return_inf = x > X_MAX;
-    int return_zero = x < X_MIN;
-
-    int n = convert_int(x * R_64_BY_LOG10_2);
-
-    float fn = (float)n;
-    int j = n & 0x3f;
-    int m = n >> 6;
-    int m2 = m << EXPSHIFTBITS_SP32;
-    float r;
-
-    r = R_LN10 * mad(fn, -R_LOG10_2_BY_64_TL, mad(fn, -R_LOG10_2_BY_64_LD, x));
-
-    // Truncated Taylor series for e^r
-    float z2 = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
-
-    float two_to_jby64 = USE_TABLE(exp_tbl, j);
-    z2 = mad(two_to_jby64, z2, two_to_jby64);
-
-    float z2s = z2 * as_float(0x1 << (m + 149));
-    float z2n = as_float(as_int(z2) + m2);
-    z2 = m <= -126 ? z2s : z2n;
-
-
-    z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2;
-    z2 = return_zero ? 0.0f : z2;
-    z2 = return_nan ? x : z2;
-    return z2;
+_CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x) {
+  // 128*log2/log10 : 38.53183944498959
+  const float X_MAX = 0x1.344134p+5f;
+  // -149*log2/log10 : -44.8534693539332
+  const float X_MIN = -0x1.66d3e8p+5f;
+  // 64*log10/log2 : 212.6033980727912
+  const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f;
+  // log2/(64 * log10) lead : 0.004699707
+  const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f;
+  // log2/(64 * log10) tail : 0.00000388665057
+  const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f;
+  const float R_LN10 = 0x1.26bb1cp+1f;
+
+  int return_nan = __clc_isnan(x);
+  int return_inf = x > X_MAX;
+  int return_zero = x < X_MIN;
+
+  int n = convert_int(x * R_64_BY_LOG10_2);
+
+  float fn = (float)n;
+  int j = n & 0x3f;
+  int m = n >> 6;
+  int m2 = m << EXPSHIFTBITS_SP32;
+  float r;
+
+  r = R_LN10 *
+      __clc_mad(fn, -R_LOG10_2_BY_64_TL, __clc_mad(fn, -R_LOG10_2_BY_64_LD, x));
+
+  // Truncated Taylor series for e^r
+  float z2 = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f),
+                                 r, 0x1.000000p-1f),
+                       r * r, r);
+
+  float two_to_jby64 = USE_TABLE(exp_tbl, j);
+  z2 = __clc_mad(two_to_jby64, z2, two_to_jby64);
+
+  float z2s = z2 * as_float(0x1 << (m + 149));
+  float z2n = as_float(as_int(z2) + m2);
+  z2 = m <= -126 ? z2s : z2n;
+
+  z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2;
+  z2 = return_zero ? 0.0f : z2;
+  z2 = return_nan ? x : z2;
+  return z2;
 }
 _CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_exp10, float)
 
 #ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x)
-{
-    const double X_MAX = 0x1.34413509f79ffp+8; // 1024*ln(2)/ln(10)
-    const double X_MIN = -0x1.434e6420f4374p+8; // -1074*ln(2)/ln(10)
-
-    const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7; // 64*ln(10)/ln(2)
-    const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8; // head ln(2)/(64*ln(10))
-    const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37; // tail ln(2)/(64*ln(10))
-    const double R_LN10 = 0x1.26bb1bbb55516p+1; // ln(10)
-
-    int n = convert_int(x * R_64_BY_LOG10_2);
-
-    double dn = (double)n;
-
-    int j = n & 0x3f;
-    int m = n >> 6;
-
-    double r = R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));
-
-    // 6 term tail of Taylor expansion of e^r
-    double z2 = r * fma(r,
-	                fma(r,
-		            fma(r,
-			        fma(r,
-			            fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
-			            0x1.5555555555555p-5),
-			        0x1.5555555555555p-3),
-		            0x1.0000000000000p-1),
-		        1.0);
-
-    double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
-    z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;
-
-    int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));
-
-	int n1 = m >> 2;
-	int n2 = m-n1;
-	double z3= z2 * as_double(((long)n1 + 1023) << 52);
-	z3 *= as_double(((long)n2 + 1023) << 52);
-
-    z2 = ldexp(z2, m);
-    z2 = small_value ? z3: z2;
-
-    z2 = __clc_isnan(x) ? x : z2;
-
-    z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2;
-    z2 = x < X_MIN ? 0.0 : z2;
-
-    return z2;
+_CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x) {
+  // 1024*ln(2)/ln(10)
+  const double X_MAX = 0x1.34413509f79ffp+8;
+  // -1074*ln(2)/ln(10)
+  const double X_MIN = -0x1.434e6420f4374p+8;
+  // 64*ln(10)/ln(2)
+  const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7;
+  // head ln(2)/(64*ln(10))
+  const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8;
+  // tail ln(2)/(64*ln(10))
+  const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37;
+  // ln(10)
+  const double R_LN10 = 0x1.26bb1bbb55516p+1;
+
+  int n = convert_int(x * R_64_BY_LOG10_2);
+
+  double dn = (double)n;
+
+  int j = n & 0x3f;
+  int m = n >> 6;
+
+  double r =
+      R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));
+
+  // 6 term tail of Taylor expansion of e^r
+  double z2 =
+      r *
+      fma(r,
+          fma(r,
+              fma(r,
+                  fma(r, fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
+                      0x1.5555555555555p-5),
+                  0x1.5555555555555p-3),
+              0x1.0000000000000p-1),
+          1.0);
+
+  double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+  z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;
+
+  int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));
+
+  int n1 = m >> 2;
+  int n2 = m - n1;
+  double z3 = z2 * as_double(((long)n1 + 1023) << 52);
+  z3 *= as_double(((long)n2 + 1023) << 52);
+
+  z2 = ldexp(z2, m);
+  z2 = small_value ? z3 : z2;
+
+  z2 = __clc_isnan(x) ? x : z2;
+
+  z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2;
+  z2 = x < X_MIN ? 0.0 : z2;
+
+  return z2;
 }
 _CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_exp10, double)
 #endif
diff --git a/libclc/generic/lib/math/clc_hypot.cl b/libclc/generic/lib/math/clc_hypot.cl
index a17e661603fa67..d889969d6d8c23 100644
--- a/libclc/generic/lib/math/clc_hypot.cl
+++ b/libclc/generic/lib/math/clc_hypot.cl
@@ -23,6 +23,7 @@
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
 #include <clc/integer/clc_abs.h>
+#include <clc/math/clc_mad.h>
 #include <clc/relational/clc_isnan.h>
 #include <clc/shared/clc_clamp.h>
 #include <math/clc_hypot.h>
@@ -48,7 +49,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) {
   float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
   float fx = as_float(ux) * fi_exp;
   float fy = as_float(uy) * fi_exp;
-  retval = sqrt(mad(fx, fx, fy * fy)) * fx_exp;
+  retval = sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;
 
   retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval;
   retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32
diff --git a/libclc/generic/lib/math/clc_pow.cl b/libclc/generic/lib/math/clc_pow.cl
index 2e2dade0d6b8fa..4abfaf1c10df42 100644
--- a/libclc/generic/lib/math/clc_pow.cl
+++ b/libclc/generic/lib/math/clc_pow.cl
@@ -23,6 +23,7 @@
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
+#include <clc/math/clc_mad.h>
 
 #include "config.h"
 #include "math.h"
@@ -66,334 +67,351 @@
  ((((expT * poly) + expT) + expH*poly) + expH)
 */
 
-_CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y)
-{
-
-    int ix = as_int(x);
-    int ax = ix & EXSIGNBIT_SP32;
-    int xpos = ix == ax;
-
-    int iy = as_int(y);
-    int ay = iy & EXSIGNBIT_SP32;
-    int ypos = iy == ay;
-
-    /* Extra precise log calculation
-     *  First handle case that x is close to 1
-     */
-    float r = 1.0f - as_float(ax);
-    int near1 = __clc_fabs(r) < 0x1.0p-4f;
-    float r2 = r*r;
-
-    /* Coefficients are just 1/3, 1/4, 1/5 and 1/6 */
-    float poly = mad(r,
-                     mad(r,
-                         mad(r,
-                             mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
-                             0x1.99999ap-3f),
-                         0x1.000000p-2f),
-                     0x1.555556p-2f);
-
-    poly *= r2*r;
-
-    float lth_near1 = -r2 * 0.5f;
-    float ltt_near1 = -poly;
-    float lt_near1 = lth_near1 + ltt_near1;
-    float lh_near1 = -r;
-    float l_near1 = lh_near1 + lt_near1;
-
-    /* Computations for x not near 1 */
-    int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
-    float mf = (float)m;
-    int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
-    float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
-    int c = m == -127;
-    int ixn = c ? ixs : ax;
-    float mfn = c ? mfs : mf;
-
-    int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
-    /* F - Y */
-    float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32));
-
-    indx = indx >> 16;
-    float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
-    float rh = f * tv.s0;
-    float rt = f * tv.s1;
-    r = rh + rt;
-
-    poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r);
-    poly += (rh - r) + rt;
-
-    const float LOG2_HEAD = 0x1.62e000p-1f;  /* 0.693115234 */
-    const float LOG2_TAIL = 0x1.0bfbe8p-15f; /* 0.0000319461833 */
-    tv = USE_TABLE(loge_tbl, indx);
-    float lth = -r;
-    float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1;
-    float lt = lth + ltt;
-    float lh = mad(mfn, LOG2_HEAD, tv.s0);
-    float l = lh + lt;
-
-    /* Select near 1 or not */
-    lth = near1 ? lth_near1 : lth;
-    ltt = near1 ? ltt_near1 : ltt;
-    lt = near1 ? lt_near1 : lt;
-    lh = near1 ? lh_near1 : lh;
-    l = near1 ? l_near1 : l;
-
-    float gh = as_float(as_int(l) & 0xfffff000);
-    float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
-    float yh = as_float(iy & 0xfffff000);
-
-    float yt = y - yh;
-
-    float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt));
-    float ylogx = mad(yh, gh, ylogx_s);
-    float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s;
-
-    /* Extra precise exp of ylogx */
-    const float R_64_BY_LOG2 = 0x1.715476p+6f; /* 64/log2 : 92.332482616893657 */
-    int n = convert_int(ylogx * R_64_BY_LOG2);
-    float nf = (float) n;
-
-    int j = n & 0x3f;
-    m = n >> 6;
-    int m2 = m << EXPSHIFTBITS_SP32;
-
-    const float R_LOG2_BY_64_LD = 0x1.620000p-7f;  /* log2/64 lead: 0.0108032227 */
-    const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; /* log2/64 tail: 0.0000272020388 */
-    r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t;
-
-    /* Truncated Taylor series for e^r */
-    poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
-
-    tv = USE_TABLE(exp_tbl_ep, j);
-
-    float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0;
-    float sexpylogx = expylogx * as_float(0x1 << (m + 149));
-    float texpylogx = as_float(as_int(expylogx) + m2);
-    expylogx = m < -125 ? sexpylogx : texpylogx;
-
-    /* Result is +-Inf if (ylogx + ylogx_t) > 128*log2 */
-    expylogx = (ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f) ? as_float(PINFBITPATT_SP32) : expylogx;
-
-    /* Result is 0 if ylogx < -149*log2 */
-    expylogx = ylogx <  -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
-    /* Classify y:
-     *   inty = 0 means not an integer.
-     *   inty = 1 means odd integer.
-     *   inty = 2 means even integer.
-     */
-
-    int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
-    int mask = (1 << (24 - yexp)) - 1;
-    int yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
-    int inty = yodd ? 1 : 2;
-    inty = (iy & mask) != 0 ? 0 : inty;
-    inty = yexp < 1 ? 0 : inty;
-    inty = yexp > 24 ? 2 : inty;
-
-    float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
-    expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
-    int ret = as_int(expylogx);
-
-    /* Corner case handling */
-    ret = (!xpos & (inty == 0)) ? QNANBITPATT_SP32 : ret;
-    ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
-    ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret;
-    ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret;
-    ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
-    int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
-    ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
-    ret = ((ax == 0) & !ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret;
-    int xzero = xpos ? 0 : 0x80000000;
-    ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
-    ret = ((ax == 0) & ypos & (inty != 1)) ? 0 : ret;
-    ret = ((ax == 0) & (iy == NINFBITPATT_SP32)) ? PINFBITPATT_SP32 : ret;
-    ret = ((ix == 0xbf800000) & (ay == PINFBITPATT_SP32)) ? 0x3f800000 : ret;
-    ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret;
-    ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty != 1)) ? 0 : ret;
-    ret = ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret;
-    ret = ((ix == NINFBITPATT_SP32) & ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret;
-    ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
-    ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
-    ret = (ax > PINFBITPATT_SP32) ? ix : ret;
-    ret = (ay > PINFBITPATT_SP32) ? iy : ret;
-    ret = ay == 0 ? 0x3f800000 : ret;
-    ret = ix == 0x3f800000 ? 0x3f800000 : ret;
-
-    return as_float(ret);
+_CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y) {
+
+  int ix = as_int(x);
+  int ax = ix & EXSIGNBIT_SP32;
+  int xpos = ix == ax;
+
+  int iy = as_int(y);
+  int ay = iy & EXSIGNBIT_SP32;
+  int ypos = iy == ay;
+
+  /* Extra precise log calculation
+   *  First handle case that x is close to 1
+   */
+  float r = 1.0f - as_float(ax);
+  int near1 = __clc_fabs(r) < 0x1.0p-4f;
+  float r2 = r * r;
+
+  /* Coefficients are just 1/3, 1/4, 1/5 and 1/6 */
+  float poly = __clc_mad(
+      r,
+      __clc_mad(r,
+                __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
+                          0x1.99999ap-3f),
+                0x1.000000p-2f),
+      0x1.555556p-2f);
+
+  poly *= r2 * r;
+
+  float lth_near1 = -r2 * 0.5f;
+  float ltt_near1 = -poly;
+  float lt_near1 = lth_near1 + ltt_near1;
+  float lh_near1 = -r;
+  float l_near1 = lh_near1 + lt_near1;
+
+  /* Computations for x not near 1 */
+  int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+  float mf = (float)m;
+  int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
+  float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
+  int c = m == -127;
+  int ixn = c ? ixs : ax;
+  float mfn = c ? mfs : mf;
+
+  int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
+
+  /* F - Y */
+  float f = as_float(0x3f000000 | indx) -
+            as_float(0x3f000000 | (ixn & MANTBITS_SP32));
+
+  indx = indx >> 16;
+  float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
+  float rh = f * tv.s0;
+  float rt = f * tv.s1;
+  r = rh + rt;
+
+  poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
+         (r * r);
+  poly += (rh - r) + rt;
+
+  const float LOG2_HEAD = 0x1.62e000p-1f;  /* 0.693115234 */
+  const float LOG2_TAIL = 0x1.0bfbe8p-15f; /* 0.0000319461833 */
+  tv = USE_TABLE(loge_tbl, indx);
+  float lth = -r;
+  float ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + tv.s1;
+  float lt = lth + ltt;
+  float lh = __clc_mad(mfn, LOG2_HEAD, tv.s0);
+  float l = lh + lt;
+
+  /* Select near 1 or not */
+  lth = near1 ? lth_near1 : lth;
+  ltt = near1 ? ltt_near1 : ltt;
+  lt = near1 ? lt_near1 : lt;
+  lh = near1 ? lh_near1 : lh;
+  l = near1 ? l_near1 : l;
+
+  float gh = as_float(as_int(l) & 0xfffff000);
+  float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
+
+  float yh = as_float(iy & 0xfffff000);
+
+  float yt = y - yh;
+
+  float ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
+  float ylogx = __clc_mad(yh, gh, ylogx_s);
+  float ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
+
+  /* Extra precise exp of ylogx */
+  /* 64/log2 : 92.332482616893657 */
+  const float R_64_BY_LOG2 = 0x1.715476p+6f;
+  int n = convert_int(ylogx * R_64_BY_LOG2);
+  float nf = (float)n;
+
+  int j = n & 0x3f;
+  m = n >> 6;
+  int m2 = m << EXPSHIFTBITS_SP32;
+
+  /* log2/64 lead: 0.0108032227 */
+  const float R_LOG2_BY_64_LD = 0x1.620000p-7f;
+  /* log2/64 tail: 0.0000272020388 */
+  const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
+  r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
+      ylogx_t;
+
+  /* Truncated Taylor series for e^r */
+  poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
+                             0x1.000000p-1f),
+                   r * r, r);
+
+  tv = USE_TABLE(exp_tbl_ep, j);
+
+  float expylogx =
+      __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0;
+  float sexpylogx = expylogx * as_float(0x1 << (m + 149));
+  float texpylogx = as_float(as_int(expylogx) + m2);
+  expylogx = m < -125 ? sexpylogx : texpylogx;
+
+  /* Result is +-Inf if (ylogx + ylogx_t) > 128*log2 */
+  expylogx = (ylogx > 0x1.62e430p+6f) |
+                     (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)
+                 ? as_float(PINFBITPATT_SP32)
+                 : expylogx;
+
+  /* Result is 0 if ylogx < -149*log2 */
+  expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
+
+  /* Classify y:
+   *   inty = 0 means not an integer.
+   *   inty = 1 means odd integer.
+   *   inty = 2 means even integer.
+   */
+
+  int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
+  int mask = (1 << (24 - yexp)) - 1;
+  int yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
+  int inty = yodd ? 1 : 2;
+  inty = (iy & mask) != 0 ? 0 : inty;
+  inty = yexp < 1 ? 0 : inty;
+  inty = yexp > 24 ? 2 : inty;
+
+  float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
+  expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
+  int ret = as_int(expylogx);
+
+  /* Corner case handling */
+  ret = (!xpos & (inty == 0)) ? QNANBITPATT_SP32 : ret;
+  ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+  ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret;
+  ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret;
+  ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+  int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
+  ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
+  ret = ((ax == 0) & !ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret;
+  int xzero = xpos ? 0 : 0x80000000;
+  ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
+  ret = ((ax == 0) & ypos & (inty != 1)) ? 0 : ret;
+  ret = ((ax == 0) & (iy == NINFBITPATT_SP32)) ? PINFBITPATT_SP32 : ret;
+  ret = ((ix == 0xbf800000) & (ay == PINFBITPATT_SP32)) ? 0x3f800000 : ret;
+  ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret;
+  ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty != 1)) ? 0 : ret;
+  ret =
+      ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret;
+  ret =
+      ((ix == NINFBITPATT_SP32) & ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret;
+  ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
+  ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
+  ret = (ax > PINFBITPATT_SP32) ? ix : ret;
+  ret = (ay > PINFBITPATT_SP32) ? iy : ret;
+  ret = ay == 0 ? 0x3f800000 : ret;
+  ret = ix == 0x3f800000 ? 0x3f800000 : ret;
+
+  return as_float(ret);
 }
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_pow, float, float)
 
 #ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y)
-{
-    const double real_log2_tail = 5.76999904754328540596e-08;
-    const double real_log2_lead = 6.93147122859954833984e-01;
-
-    long ux = as_long(x);
-    long ax = ux & (~SIGNBIT_DP64);
-    int xpos = ax == ux;
-
-    long uy = as_long(y);
-    long ay = uy & (~SIGNBIT_DP64);
-    int ypos = ay == uy;
-
-    // Extended precision log
-    double v, vt;
-    {
-        int exp = (int)(ax >> 52) - 1023;
-        int mask_exp_1023 = exp == -1023;
-        double xexp = (double) exp;
-        long mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
-        long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
-        exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
-        double xexp1 = (double) exp;
-        long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
-        xexp = mask_exp_1023 ? xexp1 : xexp;
-        mantissa = mask_exp_1023 ? mantissa1 : mantissa;
-
-        long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1);
-        int index = rax >> 44;
-
-        double F = as_double(rax | 0x3FE0000000000000L);
-        double Y = as_double(mantissa | 0x3FE0000000000000L);
-        double f = F - Y;
-        double2 tv = USE_TABLE(log_f_inv_tbl, index);
-        double log_h = tv.s0;
-        double log_t = tv.s1;
-        double f_inv = (log_h + log_t) * f;
-        double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
-        double r2 = fma(-F, r1, f) * (log_h + log_t);
-        double r = r1 + r2;
-
-        double poly = fma(r,
-                          fma(r,
-                              fma(r,
-                                  fma(r, 1.0/7.0, 1.0/6.0),
-                                  1.0/5.0),
-                              1.0/4.0),
-                          1.0/3.0);
-        poly = poly * r * r * r;
-
-        double hr1r1 = 0.5*r1*r1;
-        double poly0h = r1 + hr1r1;
-        double poly0t = r1 - poly0h + hr1r1;
-        poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t;
-
-        tv = USE_TABLE(powlog_tbl, index);
-        log_h = tv.s0;
-        log_t = tv.s1;
-
-        double resT_t = fma(xexp, real_log2_tail, + log_t) - poly;
-        double resT = resT_t - poly0h;
-        double resH = fma(xexp, real_log2_lead, log_h);
-        double resT_h = poly0h;
-
-        double H = resT + resH;
-        double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
-        double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
-        H = H_h;
-
-        double y_head = as_double(uy & 0xfffffffff8000000L);
-        double y_tail = y - y_head;
-
-        double temp = fma(y_tail, H, fma(y_head, T, y_tail*T));
-        v = fma(y_head, H, temp);
-        vt = fma(y_head, H, -v) + temp;
-    }
-
-    // Now calculate exp of (v,vt)
-
-    double expv;
-    {
-        const double max_exp_arg = 709.782712893384;
-        const double min_exp_arg = -745.1332191019411;
-        const double sixtyfour_by_lnof2 = 92.33248261689366;
-        const double lnof2_by_64_head = 0.010830424260348081;
-        const double lnof2_by_64_tail = -4.359010638708991e-10;
-
-        double temp = v * sixtyfour_by_lnof2;
-        int n = (int)temp;
-        double dn = (double)n;
-        int j = n & 0x0000003f;
-        int m = n >> 6;
-
-        double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
-        double f1 = tv.s0;
-        double f2 = tv.s1;
-        double f = f1 + f2;
-
-        double r1 = fma(dn, -lnof2_by_64_head, v);
-        double r2 = dn * lnof2_by_64_tail;
-        double r = (r1 + r2) + vt;
-
-        double q = fma(r,
-                       fma(r,
-                           fma(r,
-                               fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
-                               4.16666666662260795726e-02),
-                           1.66666666665260878863e-01),
-                       5.00000000000000008883e-01);
-        q = fma(r*r, q, r);
-
-        expv = fma(f, q, f2) + f1;
-	      expv = ldexp(expv, m);
-
-        expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
-        expv = v < min_exp_arg ? 0.0 : expv;
-    }
-
-    // See whether y is an integer.
-    // inty = 0 means not an integer.
-    // inty = 1 means odd integer.
-    // inty = 2 means even integer.
-
-    int inty;
-    {
-        int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
-        inty = yexp < 1 ? 0 : 2;
-        inty = yexp > 53 ? 2 : inty;
-        long mask = (1L << (53 - yexp)) - 1L;
-        int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2;
-        inty1 = (ay & mask) != 0 ? 0 : inty1;
-        inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty;
-    }
-
-    expv *= (inty == 1) & !xpos ? -1.0 : 1.0;
-
-    long ret = as_long(expv);
-
-    // Now all the edge cases
-    ret = !xpos & (inty == 0) ? QNANBITPATT_DP64 : ret;
-    ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
-    ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret;
-    ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret;
-    ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
-    long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
-    ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
-    ret = ((ax == 0L) & !ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret;
-    long xzero = xpos ? 0L : 0x8000000000000000L;
-    ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
-    ret = ((ax == 0L) & ypos & (inty != 1)) ? 0L : ret;
-    ret = ((ax == 0L) & (uy == NINFBITPATT_DP64)) ? PINFBITPATT_DP64 : ret;
-    ret = ((ux == 0xbff0000000000000L) & (ay == PINFBITPATT_DP64)) ? 0x3ff0000000000000L : ret;
-    ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L : ret;
-    ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty != 1)) ? 0L : ret;
-    ret = ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret;
-    ret = ((ux == NINFBITPATT_DP64) & ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret;
-    ret = (ux == PINFBITPATT_DP64) & !ypos ? 0L : ret;
-    ret = (ux == PINFBITPATT_DP64) & ypos ? PINFBITPATT_DP64 : ret;
-    ret = ax > PINFBITPATT_DP64 ? ux : ret;
-    ret = ay > PINFBITPATT_DP64 ? uy : ret;
-    ret = ay == 0L ? 0x3ff0000000000000L : ret;
-    ret = ux == 0x3ff0000000000000L ? 0x3ff0000000000000L : ret;
-
-    return as_double(ret);
+_CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) {
+  const double real_log2_tail = 5.76999904754328540596e-08;
+  const double real_log2_lead = 6.93147122859954833984e-01;
+
+  long ux = as_long(x);
+  long ax = ux & (~SIGNBIT_DP64);
+  int xpos = ax == ux;
+
+  long uy = as_long(y);
+  long ay = uy & (~SIGNBIT_DP64);
+  int ypos = ay == uy;
+
+  // Extended precision log
+  double v, vt;
+  {
+    int exp = (int)(ax >> 52) - 1023;
+    int mask_exp_1023 = exp == -1023;
+    double xexp = (double)exp;
+    long mantissa = ax & 0x000FFFFFFFFFFFFFL;
+
+    long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
+    exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
+    double xexp1 = (double)exp;
+    long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
+
+    xexp = mask_exp_1023 ? xexp1 : xexp;
+    mantissa = mask_exp_1023 ? mantissa1 : mantissa;
+
+    long rax = (mantissa & 0x000ff00000000000) +
+               ((mantissa & 0x0000080000000000) << 1);
+    int index = rax >> 44;
+
+    double F = as_double(rax | 0x3FE0000000000000L);
+    double Y = as_double(mantissa | 0x3FE0000000000000L);
+    double f = F - Y;
+    double2 tv = USE_TABLE(log_f_inv_tbl, index);
+    double log_h = tv.s0;
+    double log_t = tv.s1;
+    double f_inv = (log_h + log_t) * f;
+    double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
+    double r2 = fma(-F, r1, f) * (log_h + log_t);
+    double r = r1 + r2;
+
+    double poly = fma(
+        r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+        1.0 / 3.0);
+    poly = poly * r * r * r;
+
+    double hr1r1 = 0.5 * r1 * r1;
+    double poly0h = r1 + hr1r1;
+    double poly0t = r1 - poly0h + hr1r1;
+    poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+
+    tv = USE_TABLE(powlog_tbl, index);
+    log_h = tv.s0;
+    log_t = tv.s1;
+
+    double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+    double resT = resT_t - poly0h;
+    double resH = fma(xexp, real_log2_lead, log_h);
+    double resT_h = poly0h;
+
+    double H = resT + resH;
+    double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
+    double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
+    H = H_h;
+
+    double y_head = as_double(uy & 0xfffffffff8000000L);
+    double y_tail = y - y_head;
+
+    double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
+    v = fma(y_head, H, temp);
+    vt = fma(y_head, H, -v) + temp;
+  }
+
+  // Now calculate exp of (v,vt)
+
+  double expv;
+  {
+    const double max_exp_arg = 709.782712893384;
+    const double min_exp_arg = -745.1332191019411;
+    const double sixtyfour_by_lnof2 = 92.33248261689366;
+    const double lnof2_by_64_head = 0.010830424260348081;
+    const double lnof2_by_64_tail = -4.359010638708991e-10;
+
+    double temp = v * sixtyfour_by_lnof2;
+    int n = (int)temp;
+    double dn = (double)n;
+    int j = n & 0x0000003f;
+    int m = n >> 6;
+
+    double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+    double f1 = tv.s0;
+    double f2 = tv.s1;
+    double f = f1 + f2;
+
+    double r1 = fma(dn, -lnof2_by_64_head, v);
+    double r2 = dn * lnof2_by_64_tail;
+    double r = (r1 + r2) + vt;
+
+    double q = fma(
+        r,
+        fma(r,
+            fma(r,
+                fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+                4.16666666662260795726e-02),
+            1.66666666665260878863e-01),
+        5.00000000000000008883e-01);
+    q = fma(r * r, q, r);
+
+    expv = fma(f, q, f2) + f1;
+    expv = ldexp(expv, m);
+
+    expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
+    expv = v < min_exp_arg ? 0.0 : expv;
+  }
+
+  // See whether y is an integer.
+  // inty = 0 means not an integer.
+  // inty = 1 means odd integer.
+  // inty = 2 means even integer.
+
+  int inty;
+  {
+    int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
+    inty = yexp < 1 ? 0 : 2;
+    inty = yexp > 53 ? 2 : inty;
+    long mask = (1L << (53 - yexp)) - 1L;
+    int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2;
+    inty1 = (ay & mask) != 0 ? 0 : inty1;
+    inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty;
+  }
+
+  expv *= (inty == 1) & !xpos ? -1.0 : 1.0;
+
+  long ret = as_long(expv);
+
+  // Now all the edge cases
+  ret = !xpos & (inty == 0) ? QNANBITPATT_DP64 : ret;
+  ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64
+                                                          : ret;
+  ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret;
+  ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret;
+  ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64
+                                                          : ret;
+  long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
+  ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
+  ret = ((ax == 0L) & !ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret;
+  long xzero = xpos ? 0L : 0x8000000000000000L;
+  ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
+  ret = ((ax == 0L) & ypos & (inty != 1)) ? 0L : ret;
+  ret = ((ax == 0L) & (uy == NINFBITPATT_DP64)) ? PINFBITPATT_DP64 : ret;
+  ret = ((ux == 0xbff0000000000000L) & (ay == PINFBITPATT_DP64))
+            ? 0x3ff0000000000000L
+            : ret;
+  ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L
+                                                         : ret;
+  ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty != 1)) ? 0L : ret;
+  ret =
+      ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret;
+  ret =
+      ((ux == NINFBITPATT_DP64) & ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret;
+  ret = (ux == PINFBITPATT_DP64) & !ypos ? 0L : ret;
+  ret = (ux == PINFBITPATT_DP64) & ypos ? PINFBITPATT_DP64 : ret;
+  ret = ax > PINFBITPATT_DP64 ? ux : ret;
+  ret = ay > PINFBITPATT_DP64 ? uy : ret;
+  ret = ay == 0L ? 0x3ff0000000000000L : ret;
+  ret = ux == 0x3ff0000000000000L ? 0x3ff0000000000000L : ret;
+
+  return as_double(ret);
 }
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_pow, double, double)
 #endif
diff --git a/libclc/generic/lib/math/clc_pown.cl b/libclc/generic/lib/math/clc_pown.cl
index 031bf9b25e6a1a..c0208926646027 100644
--- a/libclc/generic/lib/math/clc_pown.cl
+++ b/libclc/generic/lib/math/clc_pown.cl
@@ -23,6 +23,7 @@
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
+#include <clc/math/clc_mad.h>
 
 #include "config.h"
 #include "math.h"
@@ -64,308 +65,321 @@
 // At the end of exp, do
 // ((((expT * poly) + expT) + expH*poly) + expH)
 
-_CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny)
-{
-    float y = (float)ny;
-
-    int ix = as_int(x);
-    int ax = ix & EXSIGNBIT_SP32;
-    int xpos = ix == ax;
-
-    int iy = as_int(y);
-    int ay = iy & EXSIGNBIT_SP32;
-    int ypos = iy == ay;
-
-    // Extra precise log calculation
-    // First handle case that x is close to 1
-    float r = 1.0f - as_float(ax);
-    int near1 = __clc_fabs(r) < 0x1.0p-4f;
-    float r2 = r*r;
-
-    // Coefficients are just 1/3, 1/4, 1/5 and 1/6
-    float poly = mad(r,
-                     mad(r,
-                         mad(r,
-                             mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
-                             0x1.99999ap-3f),
-                         0x1.000000p-2f),
-                     0x1.555556p-2f);
-
-    poly *= r2*r;
-
-    float lth_near1 = -r2 * 0.5f;
-    float ltt_near1 = -poly;
-    float lt_near1 = lth_near1 + ltt_near1;
-    float lh_near1 = -r;
-    float l_near1 = lh_near1 + lt_near1;
-
-    // Computations for x not near 1
-    int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
-    float mf = (float)m;
-    int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
-    float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
-    int c = m == -127;
-    int ixn = c ? ixs : ax;
-    float mfn = c ? mfs : mf;
-
-    int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
-    // F - Y
-    float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32));
-
-    indx = indx >> 16;
-    float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
-    float rh = f * tv.s0;
-    float rt = f * tv.s1;
-    r = rh + rt;
-
-    poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r);
-    poly += (rh - r) + rt;
-
-    const float LOG2_HEAD = 0x1.62e000p-1f;  // 0.693115234
-    const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
-    tv = USE_TABLE(loge_tbl, indx);
-    float lth = -r;
-    float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1;
-    float lt = lth + ltt;
-    float lh = mad(mfn, LOG2_HEAD, tv.s0);
-    float l = lh + lt;
-
-    // Select near 1 or not
-    lth = near1 ? lth_near1 : lth;
-    ltt = near1 ? ltt_near1 : ltt;
-    lt = near1 ? lt_near1 : lt;
-    lh = near1 ? lh_near1 : lh;
-    l = near1 ? l_near1 : l;
-
-    float gh = as_float(as_int(l) & 0xfffff000);
-    float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
-    float yh = as_float(iy & 0xfffff000);
-
-    float yt = (float)(ny - (int)yh);
-
-    float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt));
-    float ylogx = mad(yh, gh, ylogx_s);
-    float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s;
-
-    // Extra precise exp of ylogx
-    const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657
-    int n = convert_int(ylogx * R_64_BY_LOG2);
-    float nf = (float) n;
-
-    int j = n & 0x3f;
-    m = n >> 6;
-    int m2 = m << EXPSHIFTBITS_SP32;
-
-    const float R_LOG2_BY_64_LD = 0x1.620000p-7f;  // log2/64 lead: 0.0108032227
-    const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388
-    r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t;
-
-    // Truncated Taylor series for e^r
-    poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
-
-    tv = USE_TABLE(exp_tbl_ep, j);
-
-    float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0;
-    float sexpylogx = expylogx * as_float(0x1 << (m + 149));
-    float texpylogx = as_float(as_int(expylogx) + m2);
-    expylogx = m < -125 ? sexpylogx : texpylogx;
-
-    // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
-    expylogx = ((ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) ? as_float(PINFBITPATT_SP32) : expylogx;
-
-    // Result is 0 if ylogx < -149*log2
-    expylogx = ylogx <  -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
-    // Classify y:
-    //   inty = 0 means not an integer.
-    //   inty = 1 means odd integer.
-    //   inty = 2 means even integer.
-
-    int inty = 2 - (ny & 1);
-
-    float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
-    expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
-    int ret = as_int(expylogx);
-
-    // Corner case handling
-    int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
-    ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
-    ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret;
-    ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret;
-    int xzero = !xpos ? 0x80000000 : 0L;
-    ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
-    ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret;
-    ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty != 1)) ? 0 : ret;
-    ret = ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret;
-    ret = ((ix == NINFBITPATT_SP32) & ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret;
-    ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
-    ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
-    ret = ax > PINFBITPATT_SP32 ? ix : ret;
-    ret = ny == 0 ? 0x3f800000 : ret;
-
-    return as_float(ret);
+_CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny) {
+  float y = (float)ny;
+
+  int ix = as_int(x);
+  int ax = ix & EXSIGNBIT_SP32;
+  int xpos = ix == ax;
+
+  int iy = as_int(y);
+  int ay = iy & EXSIGNBIT_SP32;
+  int ypos = iy == ay;
+
+  // Extra precise log calculation
+  // First handle case that x is close to 1
+  float r = 1.0f - as_float(ax);
+  int near1 = __clc_fabs(r) < 0x1.0p-4f;
+  float r2 = r * r;
+
+  // Coefficients are just 1/3, 1/4, 1/5 and 1/6
+  float poly = __clc_mad(
+      r,
+      __clc_mad(r,
+                __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
+                          0x1.99999ap-3f),
+                0x1.000000p-2f),
+      0x1.555556p-2f);
+
+  poly *= r2 * r;
+
+  float lth_near1 = -r2 * 0.5f;
+  float ltt_near1 = -poly;
+  float lt_near1 = lth_near1 + ltt_near1;
+  float lh_near1 = -r;
+  float l_near1 = lh_near1 + lt_near1;
+
+  // Computations for x not near 1
+  int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+  float mf = (float)m;
+  int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
+  float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
+  int c = m == -127;
+  int ixn = c ? ixs : ax;
+  float mfn = c ? mfs : mf;
+
+  int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
+
+  // F - Y
+  float f = as_float(0x3f000000 | indx) -
+            as_float(0x3f000000 | (ixn & MANTBITS_SP32));
+
+  indx = indx >> 16;
+  float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
+  float rh = f * tv.s0;
+  float rt = f * tv.s1;
+  r = rh + rt;
+
+  poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
+         (r * r);
+  poly += (rh - r) + rt;
+
+  const float LOG2_HEAD = 0x1.62e000p-1f;  // 0.693115234
+  const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
+  tv = USE_TABLE(loge_tbl, indx);
+  float lth = -r;
+  float ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + tv.s1;
+  float lt = lth + ltt;
+  float lh = __clc_mad(mfn, LOG2_HEAD, tv.s0);
+  float l = lh + lt;
+
+  // Select near 1 or not
+  lth = near1 ? lth_near1 : lth;
+  ltt = near1 ? ltt_near1 : ltt;
+  lt = near1 ? lt_near1 : lt;
+  lh = near1 ? lh_near1 : lh;
+  l = near1 ? l_near1 : l;
+
+  float gh = as_float(as_int(l) & 0xfffff000);
+  float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
+
+  float yh = as_float(iy & 0xfffff000);
+
+  float yt = (float)(ny - (int)yh);
+
+  float ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
+  float ylogx = __clc_mad(yh, gh, ylogx_s);
+  float ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
+
+  // Extra precise exp of ylogx
+  // 64/log2 : 92.332482616893657
+  const float R_64_BY_LOG2 = 0x1.715476p+6f;
+  int n = convert_int(ylogx * R_64_BY_LOG2);
+  float nf = (float)n;
+
+  int j = n & 0x3f;
+  m = n >> 6;
+  int m2 = m << EXPSHIFTBITS_SP32;
+
+  // log2/64 lead: 0.0108032227
+  const float R_LOG2_BY_64_LD = 0x1.620000p-7f;
+  // log2/64 tail: 0.0000272020388
+  const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
+  r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
+      ylogx_t;
+
+  // Truncated Taylor series for e^r
+  poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
+                             0x1.000000p-1f),
+                   r * r, r);
+
+  tv = USE_TABLE(exp_tbl_ep, j);
+
+  float expylogx =
+      __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0;
+  float sexpylogx = expylogx * as_float(0x1 << (m + 149));
+  float texpylogx = as_float(as_int(expylogx) + m2);
+  expylogx = m < -125 ? sexpylogx : texpylogx;
+
+  // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
+  expylogx = ((ylogx > 0x1.62e430p+6f) |
+              (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f))
+                 ? as_float(PINFBITPATT_SP32)
+                 : expylogx;
+
+  // Result is 0 if ylogx < -149*log2
+  expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
+
+  // Classify y:
+  //   inty = 0 means not an integer.
+  //   inty = 1 means odd integer.
+  //   inty = 2 means even integer.
+
+  int inty = 2 - (ny & 1);
+
+  float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
+  expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
+  int ret = as_int(expylogx);
+
+  // Corner case handling
+  int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
+  ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
+  ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret;
+  ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret;
+  int xzero = !xpos ? 0x80000000 : 0L;
+  ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
+  ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret;
+  ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty != 1)) ? 0 : ret;
+  ret =
+      ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret;
+  ret =
+      ((ix == NINFBITPATT_SP32) & ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret;
+  ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
+  ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
+  ret = ax > PINFBITPATT_SP32 ? ix : ret;
+  ret = ny == 0 ? 0x3f800000 : ret;
+
+  return as_float(ret);
 }
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_pown, float, int)
 
 #ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny)
-{
-    const double real_log2_tail = 5.76999904754328540596e-08;
-    const double real_log2_lead = 6.93147122859954833984e-01;
-
-    double y = (double) ny;
-
-    long ux = as_long(x);
-    long ax = ux & (~SIGNBIT_DP64);
-    int xpos = ax == ux;
-
-    long uy = as_long(y);
-    long ay = uy & (~SIGNBIT_DP64);
-    int ypos = ay == uy;
-
-    // Extended precision log
-    double v, vt;
-    {
-        int exp = (int)(ax >> 52) - 1023;
-        int mask_exp_1023 = exp == -1023;
-        double xexp = (double) exp;
-        long mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
-        long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
-        exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
-        double xexp1 = (double) exp;
-        long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
-        xexp = mask_exp_1023 ? xexp1 : xexp;
-        mantissa = mask_exp_1023 ? mantissa1 : mantissa;
-
-        long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1);
-        int index = rax >> 44;
-
-        double F = as_double(rax | 0x3FE0000000000000L);
-        double Y = as_double(mantissa | 0x3FE0000000000000L);
-        double f = F - Y;
-        double2 tv = USE_TABLE(log_f_inv_tbl, index);
-        double log_h = tv.s0;
-        double log_t = tv.s1;
-        double f_inv = (log_h + log_t) * f;
-        double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
-        double r2 = fma(-F, r1, f) * (log_h + log_t);
-        double r = r1 + r2;
-
-        double poly = fma(r,
-                          fma(r,
-                              fma(r,
-                                  fma(r, 1.0/7.0, 1.0/6.0),
-                                  1.0/5.0),
-                              1.0/4.0),
-                          1.0/3.0);
-        poly = poly * r * r * r;
-
-        double hr1r1 = 0.5*r1*r1;
-        double poly0h = r1 + hr1r1;
-        double poly0t = r1 - poly0h + hr1r1;
-        poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t;
-
-        tv = USE_TABLE(powlog_tbl, index);
-        log_h = tv.s0;
-        log_t = tv.s1;
-
-        double resT_t = fma(xexp, real_log2_tail, + log_t) - poly;
-        double resT = resT_t - poly0h;
-        double resH = fma(xexp, real_log2_lead, log_h);
-        double resT_h = poly0h;
-
-        double H = resT + resH;
-        double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
-        double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
-        H = H_h;
-
-        double y_head = as_double(uy & 0xfffffffff8000000L);
-        double y_tail = y - y_head;
-
-        int mask_2_24 = ay > 0x4170000000000000; // 2^24
-        int nyh = convert_int(y_head);
-        int nyt = ny - nyh;
-        double y_tail1 = (double)nyt;
-        y_tail = mask_2_24 ? y_tail1 : y_tail;
-
-        double temp = fma(y_tail, H, fma(y_head, T, y_tail*T));
-        v = fma(y_head, H, temp);
-        vt = fma(y_head, H, -v) + temp;
-    }
-
-    // Now calculate exp of (v,vt)
-
-    double expv;
-    {
-        const double max_exp_arg = 709.782712893384;
-        const double min_exp_arg = -745.1332191019411;
-        const double sixtyfour_by_lnof2 = 92.33248261689366;
-        const double lnof2_by_64_head = 0.010830424260348081;
-        const double lnof2_by_64_tail = -4.359010638708991e-10;
-
-        double temp = v * sixtyfour_by_lnof2;
-        int n = (int)temp;
-        double dn = (double)n;
-        int j = n & 0x0000003f;
-        int m = n >> 6;
-
-        double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
-        double f1 = tv.s0;
-        double f2 = tv.s1;
-        double f = f1 + f2;
-
-        double r1 = fma(dn, -lnof2_by_64_head, v);
-        double r2 = dn * lnof2_by_64_tail;
-        double r = (r1 + r2) + vt;
-
-        double q = fma(r,
-                       fma(r,
-                           fma(r,
-                               fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
-                               4.16666666662260795726e-02),
-                           1.66666666665260878863e-01),
-                       5.00000000000000008883e-01);
-        q = fma(r*r, q, r);
-
-        expv = fma(f, q, f2) + f1;
-	      expv = ldexp(expv, m);
-
-        expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
-        expv = v < min_exp_arg ? 0.0 : expv;
-    }
-
-    // See whether y is an integer.
-    // inty = 0 means not an integer.
-    // inty = 1 means odd integer.
-    // inty = 2 means even integer.
-
-    int inty = 2 - (ny & 1);
-
-    expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
-
-    long ret = as_long(expv);
-
-    // Now all the edge cases
-    long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
-    ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
-    ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret;
-    ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret;
-    long xzero = !xpos ? 0x8000000000000000L : 0L;
-    ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
-    ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L : ret;
-    ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty != 1)) ? 0L : ret;
-    ret = ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret;
-    ret = ((ux == NINFBITPATT_DP64) & ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret;
-    ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret;
-    ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret;
-    ret = ax > PINFBITPATT_DP64 ? ux : ret;
-    ret = ny == 0 ? 0x3ff0000000000000L : ret;
-
-    return as_double(ret);
+_CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) {
+  const double real_log2_tail = 5.76999904754328540596e-08;
+  const double real_log2_lead = 6.93147122859954833984e-01;
+
+  double y = (double)ny;
+
+  long ux = as_long(x);
+  long ax = ux & (~SIGNBIT_DP64);
+  int xpos = ax == ux;
+
+  long uy = as_long(y);
+  long ay = uy & (~SIGNBIT_DP64);
+  int ypos = ay == uy;
+
+  // Extended precision log
+  double v, vt;
+  {
+    int exp = (int)(ax >> 52) - 1023;
+    int mask_exp_1023 = exp == -1023;
+    double xexp = (double)exp;
+    long mantissa = ax & 0x000FFFFFFFFFFFFFL;
+
+    long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
+    exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
+    double xexp1 = (double)exp;
+    long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
+
+    xexp = mask_exp_1023 ? xexp1 : xexp;
+    mantissa = mask_exp_1023 ? mantissa1 : mantissa;
+
+    long rax = (mantissa & 0x000ff00000000000) +
+               ((mantissa & 0x0000080000000000) << 1);
+    int index = rax >> 44;
+
+    double F = as_double(rax | 0x3FE0000000000000L);
+    double Y = as_double(mantissa | 0x3FE0000000000000L);
+    double f = F - Y;
+    double2 tv = USE_TABLE(log_f_inv_tbl, index);
+    double log_h = tv.s0;
+    double log_t = tv.s1;
+    double f_inv = (log_h + log_t) * f;
+    double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
+    double r2 = fma(-F, r1, f) * (log_h + log_t);
+    double r = r1 + r2;
+
+    double poly = fma(
+        r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+        1.0 / 3.0);
+    poly = poly * r * r * r;
+
+    double hr1r1 = 0.5 * r1 * r1;
+    double poly0h = r1 + hr1r1;
+    double poly0t = r1 - poly0h + hr1r1;
+    poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+
+    tv = USE_TABLE(powlog_tbl, index);
+    log_h = tv.s0;
+    log_t = tv.s1;
+
+    double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+    double resT = resT_t - poly0h;
+    double resH = fma(xexp, real_log2_lead, log_h);
+    double resT_h = poly0h;
+
+    double H = resT + resH;
+    double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
+    double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
+    H = H_h;
+
+    double y_head = as_double(uy & 0xfffffffff8000000L);
+    double y_tail = y - y_head;
+
+    int mask_2_24 = ay > 0x4170000000000000; // 2^24
+    int nyh = convert_int(y_head);
+    int nyt = ny - nyh;
+    double y_tail1 = (double)nyt;
+    y_tail = mask_2_24 ? y_tail1 : y_tail;
+
+    double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
+    v = fma(y_head, H, temp);
+    vt = fma(y_head, H, -v) + temp;
+  }
+
+  // Now calculate exp of (v,vt)
+
+  double expv;
+  {
+    const double max_exp_arg = 709.782712893384;
+    const double min_exp_arg = -745.1332191019411;
+    const double sixtyfour_by_lnof2 = 92.33248261689366;
+    const double lnof2_by_64_head = 0.010830424260348081;
+    const double lnof2_by_64_tail = -4.359010638708991e-10;
+
+    double temp = v * sixtyfour_by_lnof2;
+    int n = (int)temp;
+    double dn = (double)n;
+    int j = n & 0x0000003f;
+    int m = n >> 6;
+
+    double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+    double f1 = tv.s0;
+    double f2 = tv.s1;
+    double f = f1 + f2;
+
+    double r1 = fma(dn, -lnof2_by_64_head, v);
+    double r2 = dn * lnof2_by_64_tail;
+    double r = (r1 + r2) + vt;
+
+    double q = fma(
+        r,
+        fma(r,
+            fma(r,
+                fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+                4.16666666662260795726e-02),
+            1.66666666665260878863e-01),
+        5.00000000000000008883e-01);
+    q = fma(r * r, q, r);
+
+    expv = fma(f, q, f2) + f1;
+    expv = ldexp(expv, m);
+
+    expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
+    expv = v < min_exp_arg ? 0.0 : expv;
+  }
+
+  // See whether y is an integer.
+  // inty = 0 means not an integer.
+  // inty = 1 means odd integer.
+  // inty = 2 means even integer.
+
+  int inty = 2 - (ny & 1);
+
+  expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
+
+  long ret = as_long(expv);
+
+  // Now all the edge cases
+  long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
+  ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
+  ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret;
+  ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret;
+  long xzero = !xpos ? 0x8000000000000000L : 0L;
+  ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
+  ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L
+                                                         : ret;
+  ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty != 1)) ? 0L : ret;
+  ret =
+      ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret;
+  ret =
+      ((ux == NINFBITPATT_DP64) & ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret;
+  ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret;
+  ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret;
+  ret = ax > PINFBITPATT_DP64 ? ux : ret;
+  ret = ny == 0 ? 0x3ff0000000000000L : ret;
+
+  return as_double(ret);
 }
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_pown, double, int)
 #endif
diff --git a/libclc/generic/lib/math/clc_powr.cl b/libclc/generic/lib/math/clc_powr.cl
index c431f529f3b97d..9516be34456b8d 100644
--- a/libclc/generic/lib/math/clc_powr.cl
+++ b/libclc/generic/lib/math/clc_powr.cl
@@ -23,6 +23,7 @@
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
+#include <clc/math/clc_mad.h>
 
 #include "config.h"
 #include "math.h"
@@ -64,319 +65,332 @@
 // At the end of exp, do
 // ((((expT * poly) + expT) + expH*poly) + expH)
 
-_CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y)
-{
-    int ix = as_int(x);
-    int ax = ix & EXSIGNBIT_SP32;
-    int xpos = ix == ax;
-
-    int iy = as_int(y);
-    int ay = iy & EXSIGNBIT_SP32;
-    int ypos = iy == ay;
-
-    // Extra precise log calculation
-    // First handle case that x is close to 1
-    float r = 1.0f - as_float(ax);
-    int near1 = __clc_fabs(r) < 0x1.0p-4f;
-    float r2 = r*r;
-
-    // Coefficients are just 1/3, 1/4, 1/5 and 1/6
-    float poly = mad(r,
-                     mad(r,
-                         mad(r,
-                             mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
-                             0x1.99999ap-3f),
-                         0x1.000000p-2f),
-                     0x1.555556p-2f);
-
-    poly *= r2*r;
-
-    float lth_near1 = -r2 * 0.5f;
-    float ltt_near1 = -poly;
-    float lt_near1 = lth_near1 + ltt_near1;
-    float lh_near1 = -r;
-    float l_near1 = lh_near1 + lt_near1;
-
-    // Computations for x not near 1
-    int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
-    float mf = (float)m;
-    int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
-    float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
-    int c = m == -127;
-    int ixn = c ? ixs : ax;
-    float mfn = c ? mfs : mf;
-
-    int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
-    // F - Y
-    float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32));
-
-    indx = indx >> 16;
-    float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
-    float rh = f * tv.s0;
-    float rt = f * tv.s1;
-    r = rh + rt;
-
-    poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r);
-    poly += (rh - r) + rt;
-
-    const float LOG2_HEAD = 0x1.62e000p-1f;  // 0.693115234
-    const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
-    tv = USE_TABLE(loge_tbl, indx);
-    float lth = -r;
-    float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1;
-    float lt = lth + ltt;
-    float lh = mad(mfn, LOG2_HEAD, tv.s0);
-    float l = lh + lt;
-
-    // Select near 1 or not
-    lth = near1 ? lth_near1 : lth;
-    ltt = near1 ? ltt_near1 : ltt;
-    lt = near1 ? lt_near1 : lt;
-    lh = near1 ? lh_near1 : lh;
-    l = near1 ? l_near1 : l;
-
-    float gh = as_float(as_int(l) & 0xfffff000);
-    float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
-    float yh = as_float(iy & 0xfffff000);
-
-    float yt = y - yh;
-
-    float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt));
-    float ylogx = mad(yh, gh, ylogx_s);
-    float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s;
-
-    // Extra precise exp of ylogx
-    const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657
-    int n = convert_int(ylogx * R_64_BY_LOG2);
-    float nf = (float) n;
-
-    int j = n & 0x3f;
-    m = n >> 6;
-    int m2 = m << EXPSHIFTBITS_SP32;
-
-    const float R_LOG2_BY_64_LD = 0x1.620000p-7f;  // log2/64 lead: 0.0108032227
-    const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388
-    r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t;
-
-    // Truncated Taylor series for e^r
-    poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
-
-    tv = USE_TABLE(exp_tbl_ep, j);
-
-    float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0;
-    float sexpylogx = expylogx * as_float(0x1 << (m + 149));
-    float texpylogx = as_float(as_int(expylogx) + m2);
-    expylogx = m < -125 ? sexpylogx : texpylogx;
-
-    // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
-    expylogx = ((ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) ? as_float(PINFBITPATT_SP32) : expylogx;
-
-    // Result is 0 if ylogx < -149*log2
-    expylogx = ylogx <  -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
-    // Classify y:
-    //   inty = 0 means not an integer.
-    //   inty = 1 means odd integer.
-    //   inty = 2 means even integer.
-
-    int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
-    int mask = (1 << (24 - yexp)) - 1;
-    int yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
-    int inty = yodd ? 1 : 2;
-    inty = (iy & mask) != 0 ? 0 : inty;
-    inty = yexp < 1 ? 0 : inty;
-    inty = yexp > 24 ? 2 : inty;
-
-    float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
-    expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
-    int ret = as_int(expylogx);
-
-    // Corner case handling
-    ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
-    ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret;
-    ret = ax == 0x3f800000 & ay < PINFBITPATT_SP32 ? 0x3f800000 : ret;
-    ret = ax == 0x3f800000 & ay == PINFBITPATT_SP32 ? QNANBITPATT_SP32 : ret;
-    ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret;
-    ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
-    ret = ((ix < PINFBITPATT_SP32) & (ay == 0)) ? 0x3f800000 : ret;
-    ret = ((ax == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
-    ret = ((ax == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
-    ret = ((ax == PINFBITPATT_SP32) & (iy == PINFBITPATT_SP32)) ? PINFBITPATT_SP32 : ret;
-    ret = ((ax == PINFBITPATT_SP32) & (ay == 0)) ? QNANBITPATT_SP32 : ret;
-    ret = ((ax == 0) & !ypos) ? PINFBITPATT_SP32 : ret;
-    ret = ((ax == 0) & ypos) ? 0 : ret;
-    ret = ((ax == 0) & (ay == 0)) ? QNANBITPATT_SP32 : ret;
-    ret = ((ax != 0) & !xpos) ? QNANBITPATT_SP32 : ret;
-    ret = ax > PINFBITPATT_SP32 ? ix : ret;
-    ret = ay > PINFBITPATT_SP32 ? iy : ret;
-
-    return as_float(ret);
+_CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y) {
+  int ix = as_int(x);
+  int ax = ix & EXSIGNBIT_SP32;
+  int xpos = ix == ax;
+
+  int iy = as_int(y);
+  int ay = iy & EXSIGNBIT_SP32;
+  int ypos = iy == ay;
+
+  // Extra precise log calculation
+  // First handle case that x is close to 1
+  float r = 1.0f - as_float(ax);
+  int near1 = __clc_fabs(r) < 0x1.0p-4f;
+  float r2 = r * r;
+
+  // Coefficients are just 1/3, 1/4, 1/5 and 1/6
+  float poly = __clc_mad(
+      r,
+      __clc_mad(r,
+                __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
+                          0x1.99999ap-3f),
+                0x1.000000p-2f),
+      0x1.555556p-2f);
+
+  poly *= r2 * r;
+
+  float lth_near1 = -r2 * 0.5f;
+  float ltt_near1 = -poly;
+  float lt_near1 = lth_near1 + ltt_near1;
+  float lh_near1 = -r;
+  float l_near1 = lh_near1 + lt_near1;
+
+  // Computations for x not near 1
+  int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+  float mf = (float)m;
+  int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
+  float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
+  int c = m == -127;
+  int ixn = c ? ixs : ax;
+  float mfn = c ? mfs : mf;
+
+  int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
+
+  // F - Y
+  float f = as_float(0x3f000000 | indx) -
+            as_float(0x3f000000 | (ixn & MANTBITS_SP32));
+
+  indx = indx >> 16;
+  float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
+  float rh = f * tv.s0;
+  float rt = f * tv.s1;
+  r = rh + rt;
+
+  poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
+         (r * r);
+  poly += (rh - r) + rt;
+
+  const float LOG2_HEAD = 0x1.62e000p-1f;  // 0.693115234
+  const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
+  tv = USE_TABLE(loge_tbl, indx);
+  float lth = -r;
+  float ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + tv.s1;
+  float lt = lth + ltt;
+  float lh = __clc_mad(mfn, LOG2_HEAD, tv.s0);
+  float l = lh + lt;
+
+  // Select near 1 or not
+  lth = near1 ? lth_near1 : lth;
+  ltt = near1 ? ltt_near1 : ltt;
+  lt = near1 ? lt_near1 : lt;
+  lh = near1 ? lh_near1 : lh;
+  l = near1 ? l_near1 : l;
+
+  float gh = as_float(as_int(l) & 0xfffff000);
+  float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
+
+  float yh = as_float(iy & 0xfffff000);
+
+  float yt = y - yh;
+
+  float ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
+  float ylogx = __clc_mad(yh, gh, ylogx_s);
+  float ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
+
+  // Extra precise exp of ylogx
+  // 64/log2 : 92.332482616893657
+  const float R_64_BY_LOG2 = 0x1.715476p+6f;
+  int n = convert_int(ylogx * R_64_BY_LOG2);
+  float nf = (float)n;
+
+  int j = n & 0x3f;
+  m = n >> 6;
+  int m2 = m << EXPSHIFTBITS_SP32;
+  // log2/64 lead: 0.0108032227
+  const float R_LOG2_BY_64_LD = 0x1.620000p-7f;
+  // log2/64 tail: 0.0000272020388
+  const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
+  r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
+      ylogx_t;
+
+  // Truncated Taylor series for e^r
+  poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
+                             0x1.000000p-1f),
+                   r * r, r);
+
+  tv = USE_TABLE(exp_tbl_ep, j);
+
+  float expylogx =
+      __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0;
+  float sexpylogx = expylogx * as_float(0x1 << (m + 149));
+  float texpylogx = as_float(as_int(expylogx) + m2);
+  expylogx = m < -125 ? sexpylogx : texpylogx;
+
+  // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
+  expylogx = ((ylogx > 0x1.62e430p+6f) |
+              (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f))
+                 ? as_float(PINFBITPATT_SP32)
+                 : expylogx;
+
+  // Result is 0 if ylogx < -149*log2
+  expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
+
+  // Classify y:
+  //   inty = 0 means not an integer.
+  //   inty = 1 means odd integer.
+  //   inty = 2 means even integer.
+
+  int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
+  int mask = (1 << (24 - yexp)) - 1;
+  int yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
+  int inty = yodd ? 1 : 2;
+  inty = (iy & mask) != 0 ? 0 : inty;
+  inty = yexp < 1 ? 0 : inty;
+  inty = yexp > 24 ? 2 : inty;
+
+  float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
+  expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
+  int ret = as_int(expylogx);
+
+  // Corner case handling
+  ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+  ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret;
+  ret = ax == 0x3f800000 & ay < PINFBITPATT_SP32 ? 0x3f800000 : ret;
+  ret = ax == 0x3f800000 & ay == PINFBITPATT_SP32 ? QNANBITPATT_SP32 : ret;
+  ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret;
+  ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+  ret = ((ix < PINFBITPATT_SP32) & (ay == 0)) ? 0x3f800000 : ret;
+  ret = ((ax == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
+  ret = ((ax == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
+  ret = ((ax == PINFBITPATT_SP32) & (iy == PINFBITPATT_SP32)) ? PINFBITPATT_SP32
+                                                              : ret;
+  ret = ((ax == PINFBITPATT_SP32) & (ay == 0)) ? QNANBITPATT_SP32 : ret;
+  ret = ((ax == 0) & !ypos) ? PINFBITPATT_SP32 : ret;
+  ret = ((ax == 0) & ypos) ? 0 : ret;
+  ret = ((ax == 0) & (ay == 0)) ? QNANBITPATT_SP32 : ret;
+  ret = ((ax != 0) & !xpos) ? QNANBITPATT_SP32 : ret;
+  ret = ax > PINFBITPATT_SP32 ? ix : ret;
+  ret = ay > PINFBITPATT_SP32 ? iy : ret;
+
+  return as_float(ret);
 }
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_powr, float, float)
 
 #ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y)
-{
-    const double real_log2_tail = 5.76999904754328540596e-08;
-    const double real_log2_lead = 6.93147122859954833984e-01;
-
-    long ux = as_long(x);
-    long ax = ux & (~SIGNBIT_DP64);
-    int xpos = ax == ux;
-
-    long uy = as_long(y);
-    long ay = uy & (~SIGNBIT_DP64);
-    int ypos = ay == uy;
-
-    // Extended precision log
-    double v, vt;
-    {
-        int exp = (int)(ax >> 52) - 1023;
-        int mask_exp_1023 = exp == -1023;
-        double xexp = (double) exp;
-        long mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
-        long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
-        exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
-        double xexp1 = (double) exp;
-        long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
-        xexp = mask_exp_1023 ? xexp1 : xexp;
-        mantissa = mask_exp_1023 ? mantissa1 : mantissa;
-
-        long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1);
-        int index = rax >> 44;
-
-        double F = as_double(rax | 0x3FE0000000000000L);
-        double Y = as_double(mantissa | 0x3FE0000000000000L);
-        double f = F - Y;
-        double2 tv = USE_TABLE(log_f_inv_tbl, index);
-        double log_h = tv.s0;
-        double log_t = tv.s1;
-        double f_inv = (log_h + log_t) * f;
-        double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
-        double r2 = fma(-F, r1, f) * (log_h + log_t);
-        double r = r1 + r2;
-
-        double poly = fma(r,
-                          fma(r,
-                              fma(r,
-                                  fma(r, 1.0/7.0, 1.0/6.0),
-                                  1.0/5.0),
-                              1.0/4.0),
-                          1.0/3.0);
-        poly = poly * r * r * r;
-
-        double hr1r1 = 0.5*r1*r1;
-        double poly0h = r1 + hr1r1;
-        double poly0t = r1 - poly0h + hr1r1;
-        poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t;
-
-        tv = USE_TABLE(powlog_tbl, index);
-        log_h = tv.s0;
-        log_t = tv.s1;
-
-        double resT_t = fma(xexp, real_log2_tail, + log_t) - poly;
-        double resT = resT_t - poly0h;
-        double resH = fma(xexp, real_log2_lead, log_h);
-        double resT_h = poly0h;
-
-        double H = resT + resH;
-        double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
-        double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
-        H = H_h;
-
-        double y_head = as_double(uy & 0xfffffffff8000000L);
-        double y_tail = y - y_head;
-
-        double temp = fma(y_tail, H, fma(y_head, T, y_tail*T));
-        v = fma(y_head, H, temp);
-        vt = fma(y_head, H, -v) + temp;
-    }
-
-    // Now calculate exp of (v,vt)
-
-    double expv;
-    {
-        const double max_exp_arg = 709.782712893384;
-        const double min_exp_arg = -745.1332191019411;
-        const double sixtyfour_by_lnof2 = 92.33248261689366;
-        const double lnof2_by_64_head = 0.010830424260348081;
-        const double lnof2_by_64_tail = -4.359010638708991e-10;
-
-        double temp = v * sixtyfour_by_lnof2;
-        int n = (int)temp;
-        double dn = (double)n;
-        int j = n & 0x0000003f;
-        int m = n >> 6;
-
-        double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
-        double f1 = tv.s0;
-        double f2 = tv.s1;
-        double f = f1 + f2;
-
-        double r1 = fma(dn, -lnof2_by_64_head, v);
-        double r2 = dn * lnof2_by_64_tail;
-        double r = (r1 + r2) + vt;
-
-        double q = fma(r,
-                       fma(r,
-                           fma(r,
-                               fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
-                               4.16666666662260795726e-02),
-                           1.66666666665260878863e-01),
-                       5.00000000000000008883e-01);
-        q = fma(r*r, q, r);
-
-        expv = fma(f, q, f2) + f1;
-	      expv = ldexp(expv, m);
-
-        expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
-        expv = v < min_exp_arg ? 0.0 : expv;
-    }
-
-    // See whether y is an integer.
-    // inty = 0 means not an integer.
-    // inty = 1 means odd integer.
-    // inty = 2 means even integer.
-
-    int inty;
-    {
-        int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
-        inty = yexp < 1 ? 0 : 2;
-        inty = yexp > 53 ? 2 : inty;
-        long mask = (1L << (53 - yexp)) - 1L;
-        int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2;
-        inty1 = (ay & mask) != 0 ? 0 : inty1;
-        inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty;
-    }
-
-    expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
-
-    long ret = as_long(expv);
-
-    // Now all the edge cases
-    ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
-    ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret;
-    ret = ax == 0x3ff0000000000000L & ay < PINFBITPATT_DP64 ? 0x3ff0000000000000L : ret;
-    ret = ax == 0x3ff0000000000000L & ay == PINFBITPATT_DP64 ? QNANBITPATT_DP64 : ret;
-    ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret;
-    ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
-    ret = ux < PINFBITPATT_DP64 & ay == 0L ? 0x3ff0000000000000L : ret;
-    ret = ((ax == PINFBITPATT_DP64) & !ypos) ? 0L : ret;
-    ret = ((ax == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret;
-    ret = ((ax == PINFBITPATT_DP64) & (uy == PINFBITPATT_DP64)) ? PINFBITPATT_DP64 : ret;
-    ret = ((ax == PINFBITPATT_DP64) & (ay == 0L)) ? QNANBITPATT_DP64 : ret;
-    ret = ((ax == 0L) & !ypos) ? PINFBITPATT_DP64 : ret;
-    ret = ((ax == 0L) & ypos) ? 0L : ret;
-    ret = ((ax == 0L) & (ay == 0L)) ? QNANBITPATT_DP64 : ret;
-    ret = ((ax != 0L) & !xpos) ? QNANBITPATT_DP64 : ret;
-    ret = ax > PINFBITPATT_DP64 ? ux : ret;
-    ret = ay > PINFBITPATT_DP64 ? uy : ret;
-
-    return as_double(ret);
+_CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) {
+  const double real_log2_tail = 5.76999904754328540596e-08;
+  const double real_log2_lead = 6.93147122859954833984e-01;
+
+  long ux = as_long(x);
+  long ax = ux & (~SIGNBIT_DP64);
+  int xpos = ax == ux;
+
+  long uy = as_long(y);
+  long ay = uy & (~SIGNBIT_DP64);
+  int ypos = ay == uy;
+
+  // Extended precision log
+  double v, vt;
+  {
+    int exp = (int)(ax >> 52) - 1023;
+    int mask_exp_1023 = exp == -1023;
+    double xexp = (double)exp;
+    long mantissa = ax & 0x000FFFFFFFFFFFFFL;
+
+    long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
+    exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
+    double xexp1 = (double)exp;
+    long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
+
+    xexp = mask_exp_1023 ? xexp1 : xexp;
+    mantissa = mask_exp_1023 ? mantissa1 : mantissa;
+
+    long rax = (mantissa & 0x000ff00000000000) +
+               ((mantissa & 0x0000080000000000) << 1);
+    int index = rax >> 44;
+
+    double F = as_double(rax | 0x3FE0000000000000L);
+    double Y = as_double(mantissa | 0x3FE0000000000000L);
+    double f = F - Y;
+    double2 tv = USE_TABLE(log_f_inv_tbl, index);
+    double log_h = tv.s0;
+    double log_t = tv.s1;
+    double f_inv = (log_h + log_t) * f;
+    double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
+    double r2 = fma(-F, r1, f) * (log_h + log_t);
+    double r = r1 + r2;
+
+    double poly = fma(
+        r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+        1.0 / 3.0);
+    poly = poly * r * r * r;
+
+    double hr1r1 = 0.5 * r1 * r1;
+    double poly0h = r1 + hr1r1;
+    double poly0t = r1 - poly0h + hr1r1;
+    poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+
+    tv = USE_TABLE(powlog_tbl, index);
+    log_h = tv.s0;
+    log_t = tv.s1;
+
+    double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+    double resT = resT_t - poly0h;
+    double resH = fma(xexp, real_log2_lead, log_h);
+    double resT_h = poly0h;
+
+    double H = resT + resH;
+    double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
+    double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
+    H = H_h;
+
+    double y_head = as_double(uy & 0xfffffffff8000000L);
+    double y_tail = y - y_head;
+
+    double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
+    v = fma(y_head, H, temp);
+    vt = fma(y_head, H, -v) + temp;
+  }
+
+  // Now calculate exp of (v,vt)
+
+  double expv;
+  {
+    const double max_exp_arg = 709.782712893384;
+    const double min_exp_arg = -745.1332191019411;
+    const double sixtyfour_by_lnof2 = 92.33248261689366;
+    const double lnof2_by_64_head = 0.010830424260348081;
+    const double lnof2_by_64_tail = -4.359010638708991e-10;
+
+    double temp = v * sixtyfour_by_lnof2;
+    int n = (int)temp;
+    double dn = (double)n;
+    int j = n & 0x0000003f;
+    int m = n >> 6;
+
+    double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+    double f1 = tv.s0;
+    double f2 = tv.s1;
+    double f = f1 + f2;
+
+    double r1 = fma(dn, -lnof2_by_64_head, v);
+    double r2 = dn * lnof2_by_64_tail;
+    double r = (r1 + r2) + vt;
+
+    double q = fma(
+        r,
+        fma(r,
+            fma(r,
+                fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+                4.16666666662260795726e-02),
+            1.66666666665260878863e-01),
+        5.00000000000000008883e-01);
+    q = fma(r * r, q, r);
+
+    expv = fma(f, q, f2) + f1;
+    expv = ldexp(expv, m);
+
+    expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
+    expv = v < min_exp_arg ? 0.0 : expv;
+  }
+
+  // See whether y is an integer.
+  // inty = 0 means not an integer.
+  // inty = 1 means odd integer.
+  // inty = 2 means even integer.
+
+  int inty;
+  {
+    int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
+    inty = yexp < 1 ? 0 : 2;
+    inty = yexp > 53 ? 2 : inty;
+    long mask = (1L << (53 - yexp)) - 1L;
+    int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2;
+    inty1 = (ay & mask) != 0 ? 0 : inty1;
+    inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty;
+  }
+
+  expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
+
+  long ret = as_long(expv);
+
+  // Now all the edge cases
+  ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64
+                                                          : ret;
+  ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret;
+  ret = ax == 0x3ff0000000000000L & ay < PINFBITPATT_DP64 ? 0x3ff0000000000000L
+                                                          : ret;
+  ret = ax == 0x3ff0000000000000L & ay == PINFBITPATT_DP64 ? QNANBITPATT_DP64
+                                                           : ret;
+  ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret;
+  ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64
+                                                          : ret;
+  ret = ux < PINFBITPATT_DP64 & ay == 0L ? 0x3ff0000000000000L : ret;
+  ret = ((ax == PINFBITPATT_DP64) & !ypos) ? 0L : ret;
+  ret = ((ax == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret;
+  ret = ((ax == PINFBITPATT_DP64) & (uy == PINFBITPATT_DP64)) ? PINFBITPATT_DP64
+                                                              : ret;
+  ret = ((ax == PINFBITPATT_DP64) & (ay == 0L)) ? QNANBITPATT_DP64 : ret;
+  ret = ((ax == 0L) & !ypos) ? PINFBITPATT_DP64 : ret;
+  ret = ((ax == 0L) & ypos) ? 0L : ret;
+  ret = ((ax == 0L) & (ay == 0L)) ? QNANBITPATT_DP64 : ret;
+  ret = ((ax != 0L) & !xpos) ? QNANBITPATT_DP64 : ret;
+  ret = ax > PINFBITPATT_DP64 ? ux : ret;
+  ret = ay > PINFBITPATT_DP64 ? uy : ret;
+
+  return as_double(ret);
 }
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_powr, double, double)
 #endif
diff --git a/libclc/generic/lib/math/clc_rootn.cl b/libclc/generic/lib/math/clc_rootn.cl
index eee9c9fcaa2d42..70ae02ac2370c9 100644
--- a/libclc/generic/lib/math/clc_rootn.cl
+++ b/libclc/generic/lib/math/clc_rootn.cl
@@ -23,6 +23,7 @@
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
+#include <clc/math/clc_mad.h>
 
 #include "config.h"
 #include "math.h"
@@ -64,308 +65,320 @@
 // At the end of exp, do
 // ((((expT * poly) + expT) + expH*poly) + expH)
 
-_CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny)
-{
-    float y = MATH_RECIP((float)ny);
-
-    int ix = as_int(x);
-    int ax = ix & EXSIGNBIT_SP32;
-    int xpos = ix == ax;
-
-    int iy = as_int(y);
-    int ay = iy & EXSIGNBIT_SP32;
-    int ypos = iy == ay;
-
-    // Extra precise log calculation
-    // First handle case that x is close to 1
-    float r = 1.0f - as_float(ax);
-    int near1 = __clc_fabs(r) < 0x1.0p-4f;
-    float r2 = r*r;
-
-    // Coefficients are just 1/3, 1/4, 1/5 and 1/6
-    float poly = mad(r,
-                     mad(r,
-                         mad(r,
-                             mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
-                             0x1.99999ap-3f),
-                         0x1.000000p-2f),
-                     0x1.555556p-2f);
-
-    poly *= r2*r;
-
-    float lth_near1 = -r2 * 0.5f;
-    float ltt_near1 = -poly;
-    float lt_near1 = lth_near1 + ltt_near1;
-    float lh_near1 = -r;
-    float l_near1 = lh_near1 + lt_near1;
-
-    // Computations for x not near 1
-    int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
-    float mf = (float)m;
-    int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
-    float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
-    int c = m == -127;
-    int ixn = c ? ixs : ax;
-    float mfn = c ? mfs : mf;
-
-    int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
-    // F - Y
-    float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32));
-
-    indx = indx >> 16;
-    float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
-    float rh = f * tv.s0;
-    float rt = f * tv.s1;
-    r = rh + rt;
-
-    poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r);
-    poly += (rh - r) + rt;
-
-    const float LOG2_HEAD = 0x1.62e000p-1f;  // 0.693115234
-    const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
-    tv = USE_TABLE(loge_tbl, indx);
-    float lth = -r;
-    float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1;
-    float lt = lth + ltt;
-    float lh = mad(mfn, LOG2_HEAD, tv.s0);
-    float l = lh + lt;
-
-    // Select near 1 or not
-    lth = near1 ? lth_near1 : lth;
-    ltt = near1 ? ltt_near1 : ltt;
-    lt = near1 ? lt_near1 : lt;
-    lh = near1 ? lh_near1 : lh;
-    l = near1 ? l_near1 : l;
-
-    float gh = as_float(as_int(l) & 0xfffff000);
-    float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
-    float yh = as_float(iy & 0xfffff000);
-
-    float fny = (float)ny;
-    float fnyh = as_float(as_int(fny) & 0xfffff000);
-    float fnyt = (float)(ny - (int)fnyh);
-    float yt = MATH_DIVIDE(mad(-fnyt, yh, mad(-fnyh, yh, 1.0f)), fny);
-
-    float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt));
-    float ylogx = mad(yh, gh, ylogx_s);
-    float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s;
-
-    // Extra precise exp of ylogx
-    const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657
-    int n = convert_int(ylogx * R_64_BY_LOG2);
-    float nf = (float) n;
-
-    int j = n & 0x3f;
-    m = n >> 6;
-    int m2 = m << EXPSHIFTBITS_SP32;
-
-    const float R_LOG2_BY_64_LD = 0x1.620000p-7f;  // log2/64 lead: 0.0108032227
-    const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388
-    r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t;
-
-    // Truncated Taylor series for e^r
-    poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
-
-    tv = USE_TABLE(exp_tbl_ep, j);
-
-    float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0;
-    float sexpylogx = __clc_fp32_subnormals_supported() ? expylogx * as_float(0x1 << (m + 149)) : 0.0f;
-
-    float texpylogx = as_float(as_int(expylogx) + m2);
-    expylogx = m < -125 ? sexpylogx : texpylogx;
-
-    // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
-    expylogx = ((ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) ? as_float(PINFBITPATT_SP32) : expylogx;
-
-    // Result is 0 if ylogx < -149*log2
-    expylogx = ylogx <  -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
-    // Classify y:
-    //   inty = 0 means not an integer.
-    //   inty = 1 means odd integer.
-    //   inty = 2 means even integer.
-
-    int inty = 2 - (ny & 1);
-
-    float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
-    expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
-    int ret = as_int(expylogx);
-
-    // Corner case handling
-    ret = (!xpos & (inty == 2)) ? QNANBITPATT_SP32 : ret;
-    int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
-    ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
-    ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret;
-    ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret;
-    int xzero = xpos ? 0 : 0x80000000;
-    ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
-    ret = ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret;
-    ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret;
-    ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
-    ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
-    ret = ax > PINFBITPATT_SP32 ? ix : ret;
-    ret = ny == 0 ? QNANBITPATT_SP32 : ret;
-
-    return as_float(ret);
+_CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny) {
+  float y = MATH_RECIP((float)ny);
+
+  int ix = as_int(x);
+  int ax = ix & EXSIGNBIT_SP32;
+  int xpos = ix == ax;
+
+  int iy = as_int(y);
+  int ay = iy & EXSIGNBIT_SP32;
+  int ypos = iy == ay;
+
+  // Extra precise log calculation
+  // First handle case that x is close to 1
+  float r = 1.0f - as_float(ax);
+  int near1 = __clc_fabs(r) < 0x1.0p-4f;
+  float r2 = r * r;
+
+  // Coefficients are just 1/3, 1/4, 1/5 and 1/6
+  float poly = __clc_mad(
+      r,
+      __clc_mad(r,
+                __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
+                          0x1.99999ap-3f),
+                0x1.000000p-2f),
+      0x1.555556p-2f);
+
+  poly *= r2 * r;
+
+  float lth_near1 = -r2 * 0.5f;
+  float ltt_near1 = -poly;
+  float lt_near1 = lth_near1 + ltt_near1;
+  float lh_near1 = -r;
+  float l_near1 = lh_near1 + lt_near1;
+
+  // Computations for x not near 1
+  int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+  float mf = (float)m;
+  int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
+  float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
+  int c = m == -127;
+  int ixn = c ? ixs : ax;
+  float mfn = c ? mfs : mf;
+
+  int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
+
+  // F - Y
+  float f = as_float(0x3f000000 | indx) -
+            as_float(0x3f000000 | (ixn & MANTBITS_SP32));
+
+  indx = indx >> 16;
+  float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
+  float rh = f * tv.s0;
+  float rt = f * tv.s1;
+  r = rh + rt;
+
+  poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
+         (r * r);
+  poly += (rh - r) + rt;
+
+  const float LOG2_HEAD = 0x1.62e000p-1f;  // 0.693115234
+  const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
+  tv = USE_TABLE(loge_tbl, indx);
+  float lth = -r;
+  float ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + tv.s1;
+  float lt = lth + ltt;
+  float lh = __clc_mad(mfn, LOG2_HEAD, tv.s0);
+  float l = lh + lt;
+
+  // Select near 1 or not
+  lth = near1 ? lth_near1 : lth;
+  ltt = near1 ? ltt_near1 : ltt;
+  lt = near1 ? lt_near1 : lt;
+  lh = near1 ? lh_near1 : lh;
+  l = near1 ? l_near1 : l;
+
+  float gh = as_float(as_int(l) & 0xfffff000);
+  float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
+
+  float yh = as_float(iy & 0xfffff000);
+
+  float fny = (float)ny;
+  float fnyh = as_float(as_int(fny) & 0xfffff000);
+  float fnyt = (float)(ny - (int)fnyh);
+  float yt = MATH_DIVIDE(__clc_mad(-fnyt, yh, __clc_mad(-fnyh, yh, 1.0f)), fny);
+
+  float ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
+  float ylogx = __clc_mad(yh, gh, ylogx_s);
+  float ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
+
+  // Extra precise exp of ylogx
+  const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657
+  int n = convert_int(ylogx * R_64_BY_LOG2);
+  float nf = (float)n;
+
+  int j = n & 0x3f;
+  m = n >> 6;
+  int m2 = m << EXPSHIFTBITS_SP32;
+
+  // log2/64 lead: 0.0108032227
+  const float R_LOG2_BY_64_LD = 0x1.620000p-7f;
+  // log2/64 tail: 0.0000272020388
+  const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
+  r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
+      ylogx_t;
+
+  // Truncated Taylor series for e^r
+  poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
+                             0x1.000000p-1f),
+                   r * r, r);
+
+  tv = USE_TABLE(exp_tbl_ep, j);
+
+  float expylogx =
+      __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0;
+  float sexpylogx = __clc_fp32_subnormals_supported()
+                        ? expylogx * as_float(0x1 << (m + 149))
+                        : 0.0f;
+
+  float texpylogx = as_float(as_int(expylogx) + m2);
+  expylogx = m < -125 ? sexpylogx : texpylogx;
+
+  // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
+  expylogx = ((ylogx > 0x1.62e430p+6f) |
+              (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f))
+                 ? as_float(PINFBITPATT_SP32)
+                 : expylogx;
+
+  // Result is 0 if ylogx < -149*log2
+  expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
+
+  // Classify y:
+  //   inty = 0 means not an integer.
+  //   inty = 1 means odd integer.
+  //   inty = 2 means even integer.
+
+  int inty = 2 - (ny & 1);
+
+  float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
+  expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
+  int ret = as_int(expylogx);
+
+  // Corner case handling
+  ret = (!xpos & (inty == 2)) ? QNANBITPATT_SP32 : ret;
+  int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
+  ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
+  ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret;
+  ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret;
+  int xzero = xpos ? 0 : 0x80000000;
+  ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
+  ret =
+      ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret;
+  ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret;
+  ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
+  ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
+  ret = ax > PINFBITPATT_SP32 ? ix : ret;
+  ret = ny == 0 ? QNANBITPATT_SP32 : ret;
+
+  return as_float(ret);
 }
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_rootn, float, int)
 
 #ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny)
-{
-    const double real_log2_tail = 5.76999904754328540596e-08;
-    const double real_log2_lead = 6.93147122859954833984e-01;
-
-    double dny = (double)ny;
-    double y = 1.0 / dny;
-
-    long ux = as_long(x);
-    long ax = ux & (~SIGNBIT_DP64);
-    int xpos = ax == ux;
-
-    long uy = as_long(y);
-    long ay = uy & (~SIGNBIT_DP64);
-    int ypos = ay == uy;
-
-    // Extended precision log
-    double v, vt;
-    {
-        int exp = (int)(ax >> 52) - 1023;
-        int mask_exp_1023 = exp == -1023;
-        double xexp = (double) exp;
-        long mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
-        long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
-        exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
-        double xexp1 = (double) exp;
-        long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
-        xexp = mask_exp_1023 ? xexp1 : xexp;
-        mantissa = mask_exp_1023 ? mantissa1 : mantissa;
-
-        long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1);
-        int index = rax >> 44;
-
-        double F = as_double(rax | 0x3FE0000000000000L);
-        double Y = as_double(mantissa | 0x3FE0000000000000L);
-        double f = F - Y;
-        double2 tv = USE_TABLE(log_f_inv_tbl, index);
-        double log_h = tv.s0;
-        double log_t = tv.s1;
-        double f_inv = (log_h + log_t) * f;
-        double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
-        double r2 = fma(-F, r1, f) * (log_h + log_t);
-        double r = r1 + r2;
-
-        double poly = fma(r,
-                          fma(r,
-                              fma(r,
-                                  fma(r, 1.0/7.0, 1.0/6.0),
-                                  1.0/5.0),
-                              1.0/4.0),
-                          1.0/3.0);
-        poly = poly * r * r * r;
-
-        double hr1r1 = 0.5*r1*r1;
-        double poly0h = r1 + hr1r1;
-        double poly0t = r1 - poly0h + hr1r1;
-        poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t;
-
-        tv = USE_TABLE(powlog_tbl, index);
-        log_h = tv.s0;
-        log_t = tv.s1;
-
-        double resT_t = fma(xexp, real_log2_tail, + log_t) - poly;
-        double resT = resT_t - poly0h;
-        double resH = fma(xexp, real_log2_lead, log_h);
-        double resT_h = poly0h;
-
-        double H = resT + resH;
-        double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
-        double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
-        H = H_h;
-
-        double y_head = as_double(uy & 0xfffffffff8000000L);
-        double y_tail = y - y_head;
-
-        double fnyh = as_double(as_long(dny) & 0xfffffffffff00000);
-        double fnyt = (double)(ny - (int)fnyh);
-        y_tail = fma(-fnyt, y_head, fma(-fnyh, y_head, 1.0))/ dny;
-
-        double temp = fma(y_tail, H, fma(y_head, T, y_tail*T));
-        v = fma(y_head, H, temp);
-        vt = fma(y_head, H, -v) + temp;
-    }
-
-    // Now calculate exp of (v,vt)
-
-    double expv;
-    {
-        const double max_exp_arg = 709.782712893384;
-        const double min_exp_arg = -745.1332191019411;
-        const double sixtyfour_by_lnof2 = 92.33248261689366;
-        const double lnof2_by_64_head = 0.010830424260348081;
-        const double lnof2_by_64_tail = -4.359010638708991e-10;
-
-        double temp = v * sixtyfour_by_lnof2;
-        int n = (int)temp;
-        double dn = (double)n;
-        int j = n & 0x0000003f;
-        int m = n >> 6;
-
-        double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
-        double f1 = tv.s0;
-        double f2 = tv.s1;
-        double f = f1 + f2;
-
-        double r1 = fma(dn, -lnof2_by_64_head, v);
-        double r2 = dn * lnof2_by_64_tail;
-        double r = (r1 + r2) + vt;
-
-        double q = fma(r,
-                       fma(r,
-                           fma(r,
-                               fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
-                               4.16666666662260795726e-02),
-                           1.66666666665260878863e-01),
-                       5.00000000000000008883e-01);
-        q = fma(r*r, q, r);
-
-        expv = fma(f, q, f2) + f1;
-	      expv = ldexp(expv, m);
-
-        expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
-        expv = v < min_exp_arg ? 0.0 : expv;
-    }
-
-    // See whether y is an integer.
-    // inty = 0 means not an integer.
-    // inty = 1 means odd integer.
-    // inty = 2 means even integer.
-
-    int inty = 2 - (ny & 1);
-
-    expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
-
-    long ret = as_long(expv);
-
-    // Now all the edge cases
-    ret = (!xpos & (inty == 2)) ? QNANBITPATT_DP64 : ret;
-    long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
-    ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
-    ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret;
-    ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret;
-    long xzero = xpos ? 0L : 0x8000000000000000L;
-    ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
-    ret = ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret;
-    ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L : ret;
-    ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret;
-    ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret;
-    ret = ax > PINFBITPATT_DP64 ? ux : ret;
-    ret = ny == 0 ? QNANBITPATT_DP64 : ret;
-    return as_double(ret);
+_CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) {
+  const double real_log2_tail = 5.76999904754328540596e-08;
+  const double real_log2_lead = 6.93147122859954833984e-01;
+
+  double dny = (double)ny;
+  double y = 1.0 / dny;
+
+  long ux = as_long(x);
+  long ax = ux & (~SIGNBIT_DP64);
+  int xpos = ax == ux;
+
+  long uy = as_long(y);
+  long ay = uy & (~SIGNBIT_DP64);
+  int ypos = ay == uy;
+
+  // Extended precision log
+  double v, vt;
+  {
+    int exp = (int)(ax >> 52) - 1023;
+    int mask_exp_1023 = exp == -1023;
+    double xexp = (double)exp;
+    long mantissa = ax & 0x000FFFFFFFFFFFFFL;
+
+    long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
+    exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
+    double xexp1 = (double)exp;
+    long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
+
+    xexp = mask_exp_1023 ? xexp1 : xexp;
+    mantissa = mask_exp_1023 ? mantissa1 : mantissa;
+
+    long rax = (mantissa & 0x000ff00000000000) +
+               ((mantissa & 0x0000080000000000) << 1);
+    int index = rax >> 44;
+
+    double F = as_double(rax | 0x3FE0000000000000L);
+    double Y = as_double(mantissa | 0x3FE0000000000000L);
+    double f = F - Y;
+    double2 tv = USE_TABLE(log_f_inv_tbl, index);
+    double log_h = tv.s0;
+    double log_t = tv.s1;
+    double f_inv = (log_h + log_t) * f;
+    double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
+    double r2 = fma(-F, r1, f) * (log_h + log_t);
+    double r = r1 + r2;
+
+    double poly = fma(
+        r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+        1.0 / 3.0);
+    poly = poly * r * r * r;
+
+    double hr1r1 = 0.5 * r1 * r1;
+    double poly0h = r1 + hr1r1;
+    double poly0t = r1 - poly0h + hr1r1;
+    poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+
+    tv = USE_TABLE(powlog_tbl, index);
+    log_h = tv.s0;
+    log_t = tv.s1;
+
+    double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+    double resT = resT_t - poly0h;
+    double resH = fma(xexp, real_log2_lead, log_h);
+    double resT_h = poly0h;
+
+    double H = resT + resH;
+    double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
+    double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
+    H = H_h;
+
+    double y_head = as_double(uy & 0xfffffffff8000000L);
+    double y_tail = y - y_head;
+
+    double fnyh = as_double(as_long(dny) & 0xfffffffffff00000);
+    double fnyt = (double)(ny - (int)fnyh);
+    y_tail = fma(-fnyt, y_head, fma(-fnyh, y_head, 1.0)) / dny;
+
+    double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
+    v = fma(y_head, H, temp);
+    vt = fma(y_head, H, -v) + temp;
+  }
+
+  // Now calculate exp of (v,vt)
+
+  double expv;
+  {
+    const double max_exp_arg = 709.782712893384;
+    const double min_exp_arg = -745.1332191019411;
+    const double sixtyfour_by_lnof2 = 92.33248261689366;
+    const double lnof2_by_64_head = 0.010830424260348081;
+    const double lnof2_by_64_tail = -4.359010638708991e-10;
+
+    double temp = v * sixtyfour_by_lnof2;
+    int n = (int)temp;
+    double dn = (double)n;
+    int j = n & 0x0000003f;
+    int m = n >> 6;
+
+    double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+    double f1 = tv.s0;
+    double f2 = tv.s1;
+    double f = f1 + f2;
+
+    double r1 = fma(dn, -lnof2_by_64_head, v);
+    double r2 = dn * lnof2_by_64_tail;
+    double r = (r1 + r2) + vt;
+
+    double q = fma(
+        r,
+        fma(r,
+            fma(r,
+                fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+                4.16666666662260795726e-02),
+            1.66666666665260878863e-01),
+        5.00000000000000008883e-01);
+    q = fma(r * r, q, r);
+
+    expv = fma(f, q, f2) + f1;
+    expv = ldexp(expv, m);
+
+    expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
+    expv = v < min_exp_arg ? 0.0 : expv;
+  }
+
+  // See whether y is an integer.
+  // inty = 0 means not an integer.
+  // inty = 1 means odd integer.
+  // inty = 2 means even integer.
+
+  int inty = 2 - (ny & 1);
+
+  expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
+
+  long ret = as_long(expv);
+
+  // Now all the edge cases
+  ret = (!xpos & (inty == 2)) ? QNANBITPATT_DP64 : ret;
+  long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
+  ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
+  ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret;
+  ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret;
+  long xzero = xpos ? 0L : 0x8000000000000000L;
+  ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
+  ret =
+      ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret;
+  ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L
+                                                         : ret;
+  ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret;
+  ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret;
+  ret = ax > PINFBITPATT_DP64 ? ux : ret;
+  ret = ny == 0 ? QNANBITPATT_DP64 : ret;
+  return as_double(ret);
 }
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_rootn, double, int)
 #endif
diff --git a/libclc/generic/lib/math/mad.cl b/libclc/generic/lib/math/mad.cl
index 86bc70d94bea1c..94012ab3df25a9 100644
--- a/libclc/generic/lib/math/mad.cl
+++ b/libclc/generic/lib/math/mad.cl
@@ -1,4 +1,19 @@
 #include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/math/clc_mad.h>
 
-#define __CLC_BODY <mad.inc>
-#include <clc/math/gentype.inc>
+_CLC_DEFINE_TERNARY_BUILTIN(float, mad, __clc_mad, float, float, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEFINE_TERNARY_BUILTIN(double, mad, __clc_mad, double, double, double)
+
+#endif
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_TERNARY_BUILTIN(half, mad, __clc_mad, half, half, half)
+
+#endif
diff --git a/libclc/generic/lib/math/mad.inc b/libclc/generic/lib/math/mad.inc
deleted file mode 100644
index d32c7839d1b970..00000000000000
--- a/libclc/generic/lib/math/mad.inc
+++ /dev/null
@@ -1,3 +0,0 @@
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mad(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c) {
-  return a * b + c;
-}
diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl
index 0adecf6978bcab..e291e81ed980de 100644
--- a/libclc/generic/lib/math/sincos_helpers.cl
+++ b/libclc/generic/lib/math/sincos_helpers.cl
@@ -21,307 +21,319 @@
  */
 
 #include <clc/clc.h>
+#include <clc/math/clc_mad.h>
+#include <clc/math/clc_trunc.h>
 #include <clc/shared/clc_max.h>
 
 #include "math.h"
-#include "tables.h"
 #include "sincos_helpers.h"
+#include "tables.h"
 
-#define bitalign(hi, lo, shift) \
-  ((hi) << (32 - (shift))) | ((lo) >> (shift));
+#define bitalign(hi, lo, shift) ((hi) << (32 - (shift))) | ((lo) >> (shift));
 
-#define bytealign(src0, src1, src2) \
-  ((uint) (((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3)*8)))
+#define bytealign(src0, src1, src2)                                            \
+  ((uint)(((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3) * 8)))
 
 _CLC_DEF float __clc_sinf_piby4(float x, float y) {
-    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
-    // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
-    // = x * f(w)
-    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
-    // We use a minimax approximation of (f(w) - 1) / w
-    // because this produces an expansion in even powers of x.
-
-    const float c1 = -0.1666666666e0f;
-    const float c2 = 0.8333331876e-2f;
-    const float c3 = -0.198400874e-3f;
-    const float c4 = 0.272500015e-5f;
-    const float c5 = -2.5050759689e-08f; // 0xb2d72f34
-    const float c6 = 1.5896910177e-10f;	 // 0x2f2ec9d3
-
-    float z = x * x;
-    float v = z * x;
-    float r = mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2);
-    float ret = x - mad(v, -c1, mad(z, mad(y, 0.5f, -v*r), -y));
-
-    return ret;
+  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+  // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+  // = x * f(w)
+  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+  // We use a minimax approximation of (f(w) - 1) / w
+  // because this produces an expansion in even powers of x.
+
+  const float c1 = -0.1666666666e0f;
+  const float c2 = 0.8333331876e-2f;
+  const float c3 = -0.198400874e-3f;
+  const float c4 = 0.272500015e-5f;
+  const float c5 = -2.5050759689e-08f; // 0xb2d72f34
+  const float c6 = 1.5896910177e-10f;  // 0x2f2ec9d3
+
+  float z = x * x;
+  float v = z * x;
+  float r = __clc_mad(
+      z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3), c2);
+  float ret =
+      x - __clc_mad(v, -c1, __clc_mad(z, __clc_mad(y, 0.5f, -v * r), -y));
+
+  return ret;
 }
 
 _CLC_DEF float __clc_cosf_piby4(float x, float y) {
-    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
-    // = f(w)
-    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
-    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
-    // because this produces an expansion in even powers of x.
-
-    const float c1 = 0.416666666e-1f;
-    const float c2 = -0.138888876e-2f;
-    const float c3 = 0.248006008e-4f;
-    const float c4 = -0.2730101334e-6f;
-    const float c5 = 2.0875723372e-09f;	 // 0x310f74f6
-    const float c6 = -1.1359647598e-11f; // 0xad47d74e
-
-    float z = x * x;
-    float r = z * mad(z, mad(z, mad(z, mad(z, mad(z, c6,  c5), c4), c3), c2), c1);
-
-    // if |x| < 0.3
-    float qx = 0.0f;
-
-    int ix = as_int(x) & EXSIGNBIT_SP32;
-
-    //  0.78125 > |x| >= 0.3
-    float xby4 = as_float(ix - 0x01000000);
-    qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx;
-
-    // x > 0.78125
-    qx = ix > 0x3f480000 ? 0.28125f : qx;
-
-    float hz = mad(z, 0.5f, -qx);
-    float a = 1.0f - qx;
-    float ret = a - (hz - mad(z, r, -x*y));
-    return ret;
+  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+  // = f(w)
+  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+  // because this produces an expansion in even powers of x.
+
+  const float c1 = 0.416666666e-1f;
+  const float c2 = -0.138888876e-2f;
+  const float c3 = 0.248006008e-4f;
+  const float c4 = -0.2730101334e-6f;
+  const float c5 = 2.0875723372e-09f;  // 0x310f74f6
+  const float c6 = -1.1359647598e-11f; // 0xad47d74e
+
+  float z = x * x;
+  float r =
+      z *
+      __clc_mad(
+          z,
+          __clc_mad(z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3),
+                    c2),
+          c1);
+
+  // if |x| < 0.3
+  float qx = 0.0f;
+
+  int ix = as_int(x) & EXSIGNBIT_SP32;
+
+  //  0.78125 > |x| >= 0.3
+  float xby4 = as_float(ix - 0x01000000);
+  qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx;
+
+  // x > 0.78125
+  qx = ix > 0x3f480000 ? 0.28125f : qx;
+
+  float hz = __clc_mad(z, 0.5f, -qx);
+  float a = 1.0f - qx;
+  float ret = a - (hz - __clc_mad(z, r, -x * y));
+  return ret;
 }
 
-_CLC_DEF float __clc_tanf_piby4(float x, int regn)
-{
-    // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4].
-    float r = x * x;
+_CLC_DEF float __clc_tanf_piby4(float x, int regn) {
+  // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4].
+  float r = x * x;
 
-    float a = mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f);
+  float a =
+      __clc_mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f);
 
-    float b = mad(r,
-	          mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f),
-	          1.15588821434688393452299f);
+  float b = __clc_mad(
+      r,
+      __clc_mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f),
+      1.15588821434688393452299f);
 
-    float t = mad(x*r, native_divide(a, b), x);
-    float tr = -MATH_RECIP(t);
+  float t = __clc_mad(x * r, native_divide(a, b), x);
+  float tr = -MATH_RECIP(t);
 
-    return regn & 1 ? tr : t;
+  return regn & 1 ? tr : t;
 }
 
-_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, float bt)
-{
-    if (HAVE_HW_FMA32()) {
-        float ph = a * b;
-        *hi = ph;
-        *lo = fma(a, b, -ph);
-    } else {
-        float ah = as_float(as_uint(a) & 0xfffff000U);
-        float at = a - ah;
-        float ph = a * b;
-        float pt = mad(at, bt, mad(at, bh, mad(ah, bt, mad(ah, bh, -ph))));
-        *hi = ph;
-        *lo = pt;
-    }
+_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh,
+                             float bt) {
+  if (HAVE_HW_FMA32()) {
+    float ph = a * b;
+    *hi = ph;
+    *lo = fma(a, b, -ph);
+  } else {
+    float ah = as_float(as_uint(a) & 0xfffff000U);
+    float at = a - ah;
+    float ph = a * b;
+    float pt = __clc_mad(
+        at, bt, __clc_mad(at, bh, __clc_mad(ah, bt, __clc_mad(ah, bh, -ph))));
+    *hi = ph;
+    *lo = pt;
+  }
 }
 
-_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x)
-{
-    // 72 bits of pi/2
-    const float fpiby2_1 = (float) 0xC90FDA / 0x1.0p+23f;
-    const float fpiby2_1_h = (float) 0xC90 / 0x1.0p+11f;
-    const float fpiby2_1_t = (float) 0xFDA / 0x1.0p+23f;
+_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) {
+  // 72 bits of pi/2
+  const float fpiby2_1 = (float)0xC90FDA / 0x1.0p+23f;
+  const float fpiby2_1_h = (float)0xC90 / 0x1.0p+11f;
+  const float fpiby2_1_t = (float)0xFDA / 0x1.0p+23f;
 
-    const float fpiby2_2 = (float) 0xA22168 / 0x1.0p+47f;
-    const float fpiby2_2_h = (float) 0xA22 / 0x1.0p+35f;
-    const float fpiby2_2_t = (float) 0x168 / 0x1.0p+47f;
+  const float fpiby2_2 = (float)0xA22168 / 0x1.0p+47f;
+  const float fpiby2_2_h = (float)0xA22 / 0x1.0p+35f;
+  const float fpiby2_2_t = (float)0x168 / 0x1.0p+47f;
 
-    const float fpiby2_3 = (float) 0xC234C4 / 0x1.0p+71f;
-    const float fpiby2_3_h = (float) 0xC23 / 0x1.0p+59f;
-    const float fpiby2_3_t = (float) 0x4C4 / 0x1.0p+71f;
+  const float fpiby2_3 = (float)0xC234C4 / 0x1.0p+71f;
+  const float fpiby2_3_h = (float)0xC23 / 0x1.0p+59f;
+  const float fpiby2_3_t = (float)0x4C4 / 0x1.0p+71f;
 
-    const float twobypi = 0x1.45f306p-1f;
+  const float twobypi = 0x1.45f306p-1f;
 
-    float fnpi2 = trunc(mad(x, twobypi, 0.5f));
+  float fnpi2 = __clc_trunc(__clc_mad(x, twobypi, 0.5f));
 
-    // subtract n * pi/2 from x
-    float rhead, rtail;
-    __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
-    float v = x - rhead;
-    float rem = v + (((x - v) - rhead) - rtail);
+  // subtract n * pi/2 from x
+  float rhead, rtail;
+  __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
+  float v = x - rhead;
+  float rem = v + (((x - v) - rhead) - rtail);
 
-    float rhead2, rtail2;
-    __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
-    v = rem - rhead2;
-    rem = v + (((rem - v) - rhead2) - rtail2);
+  float rhead2, rtail2;
+  __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
+  v = rem - rhead2;
+  rem = v + (((rem - v) - rhead2) - rtail2);
 
-    float rhead3, rtail3;
-    __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
-    v = rem - rhead3;
+  float rhead3, rtail3;
+  __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
+  v = rem - rhead3;
 
-    *hi = v + ((rem - v) - rhead3);
-    *lo = -rtail3;
-    return fnpi2;
+  *hi = v + ((rem - v) - rhead3);
+  *lo = -rtail3;
+  return fnpi2;
 }
 
-_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x)
-{
-    float fnpi2 = __clc_removePi2S(r, rr, x);
-    return (int)fnpi2 & 0x3;
+_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) {
+  float fnpi2 = __clc_removePi2S(r, rr, x);
+  return (int)fnpi2 & 0x3;
 }
 
-#define FULL_MUL(A, B, HI, LO) \
-    LO = A * B; \
-    HI = mul_hi(A, B)
-
-#define FULL_MAD(A, B, C, HI, LO) \
-    LO = ((A) * (B) + (C)); \
-    HI = mul_hi(A, B); \
-    HI += LO < C
-
-_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x)
-{
-    int xe = (int)(as_uint(x) >> 23) - 127;
-    uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU);
-
-    // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB
-    const uint b6 = 0xA2F9836EU;
-    const uint b5 = 0x4E441529U;
-    const uint b4 = 0xFC2757D1U;
-    const uint b3 = 0xF534DDC0U;
-    const uint b2 = 0xDB629599U;
-    const uint b1 = 0x3C439041U;
-    const uint b0 = 0xFE5163ABU;
-
-    uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
-
-    FULL_MUL(xm, b0, c0, p0);
-    FULL_MAD(xm, b1, c0, c1, p1);
-    FULL_MAD(xm, b2, c1, c0, p2);
-    FULL_MAD(xm, b3, c0, c1, p3);
-    FULL_MAD(xm, b4, c1, c0, p4);
-    FULL_MAD(xm, b5, c0, c1, p5);
-    FULL_MAD(xm, b6, c1, p7, p6);
-
-    uint fbits = 224 + 23 - xe;
-
-    // shift amount to get 2 lsb of integer part at top 2 bits
-    //   min: 25 (xe=18) max: 134 (xe=127)
-    uint shift = 256U - 2 - fbits;
-
-    // Shift by up to 134/32 = 4 words
-    int c = shift > 31;
-    p7 = c ? p6 : p7;
-    p6 = c ? p5 : p6;
-    p5 = c ? p4 : p5;
-    p4 = c ? p3 : p4;
-    p3 = c ? p2 : p3;
-    p2 = c ? p1 : p2;
-    p1 = c ? p0 : p1;
-    shift -= (-c) & 32;
-
-    c = shift > 31;
-    p7 = c ? p6 : p7;
-    p6 = c ? p5 : p6;
-    p5 = c ? p4 : p5;
-    p4 = c ? p3 : p4;
-    p3 = c ? p2 : p3;
-    p2 = c ? p1 : p2;
-    shift -= (-c) & 32;
-
-    c = shift > 31;
-    p7 = c ? p6 : p7;
-    p6 = c ? p5 : p6;
-    p5 = c ? p4 : p5;
-    p4 = c ? p3 : p4;
-    p3 = c ? p2 : p3;
-    shift -= (-c) & 32;
-
-    c = shift > 31;
-    p7 = c ? p6 : p7;
-    p6 = c ? p5 : p6;
-    p5 = c ? p4 : p5;
-    p4 = c ? p3 : p4;
-    shift -= (-c) & 32;
-
-    // bitalign cannot handle a shift of 32
-    c = shift > 0;
-    shift = 32 - shift;
-    uint t7 = bitalign(p7, p6, shift);
-    uint t6 = bitalign(p6, p5, shift);
-    uint t5 = bitalign(p5, p4, shift);
-    p7 = c ? t7 : p7;
-    p6 = c ? t6 : p6;
-    p5 = c ? t5 : p5;
-
-    // Get 2 lsb of int part and msb of fraction
-    int i = p7 >> 29;
-
-    // Scoot up 2 more bits so only fraction remains
-    p7 = bitalign(p7, p6, 30);
-    p6 = bitalign(p6, p5, 30);
-    p5 = bitalign(p5, p4, 30);
-
-    // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
-    uint flip = i & 1 ? 0xffffffffU : 0U;
-    uint sign = i & 1 ? 0x80000000U : 0U;
-    p7 = p7 ^ flip;
-    p6 = p6 ^ flip;
-    p5 = p5 ^ flip;
-
-    // Find exponent and shift away leading zeroes and hidden bit
-    xe = clz(p7) + 1;
-    shift = 32 - xe;
-    p7 = bitalign(p7, p6, shift);
-    p6 = bitalign(p6, p5, shift);
-
-    // Most significant part of fraction
-    float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9));
-
-    // Shift out bits we captured on q1
-    p7 = bitalign(p7, p6, 32-23);
-
-    // Get 24 more bits of fraction in another float, there are not long strings of zeroes here
-    int xxe = clz(p7) + 1;
-    p7 = bitalign(p7, p6, 32-xxe);
-    float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9));
-
-    // At this point, the fraction q1 + q0 is correct to at least 48 bits
-    // Now we need to multiply the fraction by pi/2
-    // This loses us about 4 bits
-    // pi/2 = C90 FDA A22 168 C23 4C4
-
-    const float pio2h = (float)0xc90fda / 0x1.0p+23f;
-    const float pio2hh = (float)0xc90 / 0x1.0p+11f;
-    const float pio2ht = (float)0xfda / 0x1.0p+23f;
-    const float pio2t = (float)0xa22168 / 0x1.0p+47f;
-
-    float rh, rt;
-
-    if (HAVE_HW_FMA32()) {
-        rh = q1 * pio2h;
-        rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh)));
-    } else {
-        float q1h = as_float(as_uint(q1) & 0xfffff000);
-        float q1t = q1 - q1h;
-        rh = q1 * pio2h;
-        rt = mad(q1t, pio2ht, mad(q1t, pio2hh, mad(q1h, pio2ht, mad(q1h, pio2hh, -rh))));
-        rt = mad(q0, pio2h, mad(q1, pio2t, rt));
-    }
-
-    float t = rh + rt;
-    rt = rt - (t - rh);
-
-    *r = t;
-    *rr = rt;
-    return ((i >> 1) + (i & 1)) & 0x3;
+#define FULL_MUL(A, B, HI, LO)                                                 \
+  LO = A * B;                                                                  \
+  HI = mul_hi(A, B)
+
+#define FULL_MAD(A, B, C, HI, LO)                                              \
+  LO = ((A) * (B) + (C));                                                      \
+  HI = mul_hi(A, B);                                                           \
+  HI += LO < C
+
+_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) {
+  int xe = (int)(as_uint(x) >> 23) - 127;
+  uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU);
+
+  // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041
+  // FE5163AB
+  const uint b6 = 0xA2F9836EU;
+  const uint b5 = 0x4E441529U;
+  const uint b4 = 0xFC2757D1U;
+  const uint b3 = 0xF534DDC0U;
+  const uint b2 = 0xDB629599U;
+  const uint b1 = 0x3C439041U;
+  const uint b0 = 0xFE5163ABU;
+
+  uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
+
+  FULL_MUL(xm, b0, c0, p0);
+  FULL_MAD(xm, b1, c0, c1, p1);
+  FULL_MAD(xm, b2, c1, c0, p2);
+  FULL_MAD(xm, b3, c0, c1, p3);
+  FULL_MAD(xm, b4, c1, c0, p4);
+  FULL_MAD(xm, b5, c0, c1, p5);
+  FULL_MAD(xm, b6, c1, p7, p6);
+
+  uint fbits = 224 + 23 - xe;
+
+  // shift amount to get 2 lsb of integer part at top 2 bits
+  //   min: 25 (xe=18) max: 134 (xe=127)
+  uint shift = 256U - 2 - fbits;
+
+  // Shift by up to 134/32 = 4 words
+  int c = shift > 31;
+  p7 = c ? p6 : p7;
+  p6 = c ? p5 : p6;
+  p5 = c ? p4 : p5;
+  p4 = c ? p3 : p4;
+  p3 = c ? p2 : p3;
+  p2 = c ? p1 : p2;
+  p1 = c ? p0 : p1;
+  shift -= (-c) & 32;
+
+  c = shift > 31;
+  p7 = c ? p6 : p7;
+  p6 = c ? p5 : p6;
+  p5 = c ? p4 : p5;
+  p4 = c ? p3 : p4;
+  p3 = c ? p2 : p3;
+  p2 = c ? p1 : p2;
+  shift -= (-c) & 32;
+
+  c = shift > 31;
+  p7 = c ? p6 : p7;
+  p6 = c ? p5 : p6;
+  p5 = c ? p4 : p5;
+  p4 = c ? p3 : p4;
+  p3 = c ? p2 : p3;
+  shift -= (-c) & 32;
+
+  c = shift > 31;
+  p7 = c ? p6 : p7;
+  p6 = c ? p5 : p6;
+  p5 = c ? p4 : p5;
+  p4 = c ? p3 : p4;
+  shift -= (-c) & 32;
+
+  // bitalign cannot handle a shift of 32
+  c = shift > 0;
+  shift = 32 - shift;
+  uint t7 = bitalign(p7, p6, shift);
+  uint t6 = bitalign(p6, p5, shift);
+  uint t5 = bitalign(p5, p4, shift);
+  p7 = c ? t7 : p7;
+  p6 = c ? t6 : p6;
+  p5 = c ? t5 : p5;
+
+  // Get 2 lsb of int part and msb of fraction
+  int i = p7 >> 29;
+
+  // Scoot up 2 more bits so only fraction remains
+  p7 = bitalign(p7, p6, 30);
+  p6 = bitalign(p6, p5, 30);
+  p5 = bitalign(p5, p4, 30);
+
+  // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
+  uint flip = i & 1 ? 0xffffffffU : 0U;
+  uint sign = i & 1 ? 0x80000000U : 0U;
+  p7 = p7 ^ flip;
+  p6 = p6 ^ flip;
+  p5 = p5 ^ flip;
+
+  // Find exponent and shift away leading zeroes and hidden bit
+  xe = clz(p7) + 1;
+  shift = 32 - xe;
+  p7 = bitalign(p7, p6, shift);
+  p6 = bitalign(p6, p5, shift);
+
+  // Most significant part of fraction
+  float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9));
+
+  // Shift out bits we captured on q1
+  p7 = bitalign(p7, p6, 32 - 23);
+
+  // Get 24 more bits of fraction in another float, there are not long strings
+  // of zeroes here
+  int xxe = clz(p7) + 1;
+  p7 = bitalign(p7, p6, 32 - xxe);
+  float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9));
+
+  // At this point, the fraction q1 + q0 is correct to at least 48 bits
+  // Now we need to multiply the fraction by pi/2
+  // This loses us about 4 bits
+  // pi/2 = C90 FDA A22 168 C23 4C4
+
+  const float pio2h = (float)0xc90fda / 0x1.0p+23f;
+  const float pio2hh = (float)0xc90 / 0x1.0p+11f;
+  const float pio2ht = (float)0xfda / 0x1.0p+23f;
+  const float pio2t = (float)0xa22168 / 0x1.0p+47f;
+
+  float rh, rt;
+
+  if (HAVE_HW_FMA32()) {
+    rh = q1 * pio2h;
+    rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh)));
+  } else {
+    float q1h = as_float(as_uint(q1) & 0xfffff000);
+    float q1t = q1 - q1h;
+    rh = q1 * pio2h;
+    rt = __clc_mad(
+        q1t, pio2ht,
+        __clc_mad(q1t, pio2hh,
+                  __clc_mad(q1h, pio2ht, __clc_mad(q1h, pio2hh, -rh))));
+    rt = __clc_mad(q0, pio2h, __clc_mad(q1, pio2t, rt));
+  }
+
+  float t = rh + rt;
+  rt = rt - (t - rh);
+
+  *r = t;
+  *rr = rt;
+  return ((i >> 1) + (i & 1)) & 0x3;
 }
 
-_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x)
-{
-    if (x < 0x1.0p+23f)
-        return __clc_argReductionSmallS(r, rr, x);
-    else
-        return __clc_argReductionLargeS(r, rr, x);
+_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) {
+  if (x < 0x1.0p+23f)
+    return __clc_argReductionSmallS(r, rr, x);
+  else
+    return __clc_argReductionLargeS(r, rr, x);
 }
 
 #ifdef cl_khr_fp64
@@ -329,39 +341,44 @@ _CLC_DEF int __clc_argReductionS(float *r, float *rr, float x)
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
 // Reduction for medium sized arguments
-_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn) {
-    // How many pi/2 is x a multiple of?
-    const double two_by_pi = 0x1.45f306dc9c883p-1;
-    double dnpi2 = trunc(fma(x, two_by_pi, 0.5));
-
-    const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
-    const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
-    const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
-
-    // Compute product of npi2 with 159 bits of 2/pi
-    double p_hh = piby2_h * dnpi2;
-    double p_ht = fma(piby2_h, dnpi2, -p_hh);
-    double p_mh = piby2_m * dnpi2;
-    double p_mt = fma(piby2_m, dnpi2, -p_mh);
-    double p_th = piby2_t * dnpi2;
-    double p_tt = fma(piby2_t, dnpi2, -p_th);
-
-    // Reduce to 159 bits
-    double ph = p_hh;
-    double pm = p_ht + p_mh;
-    double t = p_mh - (pm - p_ht);
-    double pt = p_th + t + p_mt + p_tt;
-    t = ph + pm; pm = pm - (t - ph); ph = t;
-    t = pm + pt; pt = pt - (t - pm); pm = t;
-
-    // Subtract from x
-    t = x + ph;
-    double qh = t + pm;
-    double qt = pm - (qh - t) + pt;
-
-    *r = qh;
-    *rr = qt;
-    *regn = (int)(long)dnpi2 & 0x3;
+_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr,
+                                           int *regn) {
+  // How many pi/2 is x a multiple of?
+  const double two_by_pi = 0x1.45f306dc9c883p-1;
+  double dnpi2 = __clc_trunc(fma(x, two_by_pi, 0.5));
+
+  const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
+  const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
+  const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
+
+  // Compute product of npi2 with 159 bits of 2/pi
+  double p_hh = piby2_h * dnpi2;
+  double p_ht = fma(piby2_h, dnpi2, -p_hh);
+  double p_mh = piby2_m * dnpi2;
+  double p_mt = fma(piby2_m, dnpi2, -p_mh);
+  double p_th = piby2_t * dnpi2;
+  double p_tt = fma(piby2_t, dnpi2, -p_th);
+
+  // Reduce to 159 bits
+  double ph = p_hh;
+  double pm = p_ht + p_mh;
+  double t = p_mh - (pm - p_ht);
+  double pt = p_th + t + p_mt + p_tt;
+  t = ph + pm;
+  pm = pm - (t - ph);
+  ph = t;
+  t = pm + pt;
+  pt = pt - (t - pm);
+  pm = t;
+
+  // Subtract from x
+  t = x + ph;
+  double qh = t + pm;
+  double qt = pm - (qh - t) + pt;
+
+  *r = qh;
+  *rr = qt;
+  *regn = (int)(long)dnpi2 & 0x3;
 }
 
 // Given positive argument x, reduce it to the range [-pi/4,pi/4] using
@@ -369,195 +386,208 @@ _CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int
 // Return value "regn" tells how many lots of pi/2 were subtracted
 // from x to put it in the range [-pi/4,pi/4], mod 4.
 
-_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn) {
-
-    long ux = as_long(x);
-    int e = (int)(ux >> 52) -  1023;
-    int i = __clc_max(23, (e >> 3) + 17);
-    int j = 150 - i;
-    int j16 = j & ~0xf;
-    double fract_temp;
-
-    // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary
-    uint4 q0 = USE_TABLE(pibits_tbl, j16);
-    uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
-    uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
-
-    int k = (j >> 2) & 0x3;
-    int4 c = (int4)k == (int4)(0, 1, 2, 3);
-
-    uint u0, u1, u2, u3, u4, u5, u6;
-
-    u0 = c.s1 ? q0.s1 : q0.s0;
-    u0 = c.s2 ? q0.s2 : u0;
-    u0 = c.s3 ? q0.s3 : u0;
-
-    u1 = c.s1 ? q0.s2 : q0.s1;
-    u1 = c.s2 ? q0.s3 : u1;
-    u1 = c.s3 ? q1.s0 : u1;
-
-    u2 = c.s1 ? q0.s3 : q0.s2;
-    u2 = c.s2 ? q1.s0 : u2;
-    u2 = c.s3 ? q1.s1 : u2;
-
-    u3 = c.s1 ? q1.s0 : q0.s3;
-    u3 = c.s2 ? q1.s1 : u3;
-    u3 = c.s3 ? q1.s2 : u3;
-
-    u4 = c.s1 ? q1.s1 : q1.s0;
-    u4 = c.s2 ? q1.s2 : u4;
-    u4 = c.s3 ? q1.s3 : u4;
-
-    u5 = c.s1 ? q1.s2 : q1.s1;
-    u5 = c.s2 ? q1.s3 : u5;
-    u5 = c.s3 ? q2.s0 : u5;
-
-    u6 = c.s1 ? q1.s3 : q1.s2;
-    u6 = c.s2 ? q2.s0 : u6;
-    u6 = c.s3 ? q2.s1 : u6;
-
-    uint v0 = bytealign(u1, u0, j);
-    uint v1 = bytealign(u2, u1, j);
-    uint v2 = bytealign(u3, u2, j);
-    uint v3 = bytealign(u4, u3, j);
-    uint v4 = bytealign(u5, u4, j);
-    uint v5 = bytealign(u6, u5, j);
-
-    // Place those 192 bits in 4 48-bit doubles along with correct exponent
-    // If i > 1018 we would get subnormals so we scale p up and x down to get the same product
-    i = 2 + 8*i;
-    x *= i > 1018 ? 0x1.0p-136 : 1.0;
-    i -= i > 1018 ? 136 : 0;
-
-    uint ua = (uint)(1023 + 52 - i) << 20;
-    double a = as_double((uint2)(0, ua));
-    double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
-    ua += 0x03000000U;
-    a = as_double((uint2)(0, ua));
-    double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
-    ua += 0x03000000U;
-    a = as_double((uint2)(0, ua));
-    double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
-    ua += 0x03000000U;
-    a = as_double((uint2)(0, ua));
-    double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
-
-    // Exact multiply
-    double f0h = p0 * x;
-    double f0l = fma(p0, x, -f0h);
-    double f1h = p1 * x;
-    double f1l = fma(p1, x, -f1h);
-    double f2h = p2 * x;
-    double f2l = fma(p2, x, -f2h);
-    double f3h = p3 * x;
-    double f3l = fma(p3, x, -f3h);
-
-    // Accumulate product into 4 doubles
-    double s, t;
-
-    double f3 = f3h + f2h;
-    t = f2h - (f3 - f3h);
-    s = f3l + t;
-    t = t - (s - f3l);
-
-    double f2 = s + f1h;
-    t = f1h - (f2 - s) + t;
-    s = f2l + t;
-    t = t - (s - f2l);
-
-    double f1 = s + f0h;
-    t = f0h - (f1 - s) + t;
-    s = f1l + t;
-
-    double f0 = s + f0l;
-
-    // Strip off unwanted large integer bits
-    f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
-    f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
-
-    // Compute least significant integer bits
-    t = f3 + f2;
-    double di = t - fract(t, &fract_temp);
-    i = (float)di;
-
-    // Shift out remaining integer part
-    f3 -= di;
-    s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
-    s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
-    f1 += f0;
-
-    // Subtract 1 if fraction is >= 0.5, and update regn
-    int g = f3 >= 0.5;
-    i += g;
-    f3 -= (float)g;
-
-    // Shift up bits
-    s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1;
-
-    // Multiply precise fraction by pi/2 to get radians
-    const double p2h = 7074237752028440.0 / 0x1.0p+52;
-    const double p2t = 4967757600021510.0 / 0x1.0p+106;
-
-    double rhi = f3 * p2h;
-    double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
-
-    *r = rhi + rlo;
-    *rr = rlo - (*r - rhi);
-    *regn = i & 0x3;
+_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr,
+                                          int *regn) {
+
+  long ux = as_long(x);
+  int e = (int)(ux >> 52) - 1023;
+  int i = __clc_max(23, (e >> 3) + 17);
+  int j = 150 - i;
+  int j16 = j & ~0xf;
+  double fract_temp;
+
+  // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary
+  // byte boundary
+  uint4 q0 = USE_TABLE(pibits_tbl, j16);
+  uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
+  uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
+
+  int k = (j >> 2) & 0x3;
+  int4 c = (int4)k == (int4)(0, 1, 2, 3);
+
+  uint u0, u1, u2, u3, u4, u5, u6;
+
+  u0 = c.s1 ? q0.s1 : q0.s0;
+  u0 = c.s2 ? q0.s2 : u0;
+  u0 = c.s3 ? q0.s3 : u0;
+
+  u1 = c.s1 ? q0.s2 : q0.s1;
+  u1 = c.s2 ? q0.s3 : u1;
+  u1 = c.s3 ? q1.s0 : u1;
+
+  u2 = c.s1 ? q0.s3 : q0.s2;
+  u2 = c.s2 ? q1.s0 : u2;
+  u2 = c.s3 ? q1.s1 : u2;
+
+  u3 = c.s1 ? q1.s0 : q0.s3;
+  u3 = c.s2 ? q1.s1 : u3;
+  u3 = c.s3 ? q1.s2 : u3;
+
+  u4 = c.s1 ? q1.s1 : q1.s0;
+  u4 = c.s2 ? q1.s2 : u4;
+  u4 = c.s3 ? q1.s3 : u4;
+
+  u5 = c.s1 ? q1.s2 : q1.s1;
+  u5 = c.s2 ? q1.s3 : u5;
+  u5 = c.s3 ? q2.s0 : u5;
+
+  u6 = c.s1 ? q1.s3 : q1.s2;
+  u6 = c.s2 ? q2.s0 : u6;
+  u6 = c.s3 ? q2.s1 : u6;
+
+  uint v0 = bytealign(u1, u0, j);
+  uint v1 = bytealign(u2, u1, j);
+  uint v2 = bytealign(u3, u2, j);
+  uint v3 = bytealign(u4, u3, j);
+  uint v4 = bytealign(u5, u4, j);
+  uint v5 = bytealign(u6, u5, j);
+
+  // Place those 192 bits in 4 48-bit doubles along with correct exponent
+  // If i > 1018 we would get subnormals so we scale p up and x down to get the
+  // same product
+  i = 2 + 8 * i;
+  x *= i > 1018 ? 0x1.0p-136 : 1.0;
+  i -= i > 1018 ? 136 : 0;
+
+  uint ua = (uint)(1023 + 52 - i) << 20;
+  double a = as_double((uint2)(0, ua));
+  double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
+  ua += 0x03000000U;
+  a = as_double((uint2)(0, ua));
+  double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
+  ua += 0x03000000U;
+  a = as_double((uint2)(0, ua));
+  double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
+  ua += 0x03000000U;
+  a = as_double((uint2)(0, ua));
+  double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
+
+  // Exact multiply
+  double f0h = p0 * x;
+  double f0l = fma(p0, x, -f0h);
+  double f1h = p1 * x;
+  double f1l = fma(p1, x, -f1h);
+  double f2h = p2 * x;
+  double f2l = fma(p2, x, -f2h);
+  double f3h = p3 * x;
+  double f3l = fma(p3, x, -f3h);
+
+  // Accumulate product into 4 doubles
+  double s, t;
+
+  double f3 = f3h + f2h;
+  t = f2h - (f3 - f3h);
+  s = f3l + t;
+  t = t - (s - f3l);
+
+  double f2 = s + f1h;
+  t = f1h - (f2 - s) + t;
+  s = f2l + t;
+  t = t - (s - f2l);
+
+  double f1 = s + f0h;
+  t = f0h - (f1 - s) + t;
+  s = f1l + t;
+
+  double f0 = s + f0l;
+
+  // Strip off unwanted large integer bits
+  f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
+  f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
+
+  // Compute least significant integer bits
+  t = f3 + f2;
+  double di = t - fract(t, &fract_temp);
+  i = (float)di;
+
+  // Shift out remaining integer part
+  f3 -= di;
+  s = f3 + f2;
+  t = f2 - (s - f3);
+  f3 = s;
+  f2 = t;
+  s = f2 + f1;
+  t = f1 - (s - f2);
+  f2 = s;
+  f1 = t;
+  f1 += f0;
+
+  // Subtract 1 if fraction is >= 0.5, and update regn
+  int g = f3 >= 0.5;
+  i += g;
+  f3 -= (float)g;
+
+  // Shift up bits
+  s = f3 + f2;
+  t = f2 - (s - f3);
+  f3 = s;
+  f2 = t + f1;
+
+  // Multiply precise fraction by pi/2 to get radians
+  const double p2h = 7074237752028440.0 / 0x1.0p+52;
+  const double p2t = 4967757600021510.0 / 0x1.0p+106;
+
+  double rhi = f3 * p2h;
+  double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
+
+  *r = rhi + rlo;
+  *rr = rlo - (*r - rhi);
+  *regn = i & 0x3;
 }
 
-
 _CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
-    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
-    //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
-    //                      = x * f(w)
-    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
-    // We use a minimax approximation of (f(w) - 1) / w
-    // because this produces an expansion in even powers of x.
-    // If xx (the tail of x) is non-zero, we add a correction
-    // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
-    // is an approximation to cos(x)*sin(xx) valid because
-    // xx is tiny relative to x.
-
-    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
-    //                      = f(w)
-    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
-    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
-    // because this produces an expansion in even powers of x.
-    // If xx (the tail of x) is non-zero, we subtract a correction
-    // term g(x,xx) = x*xx to the result, where g(x,xx)
-    // is an approximation to sin(x)*sin(xx) valid because
-    // xx is tiny relative to x.
-
-    const double sc1 = -0.166666666666666646259241729;
-    const double sc2 =  0.833333333333095043065222816e-2;
-    const double sc3 = -0.19841269836761125688538679e-3;
-    const double sc4 =  0.275573161037288022676895908448e-5;
-    const double sc5 = -0.25051132068021699772257377197e-7;
-    const double sc6 =  0.159181443044859136852668200e-9;
-
-    const double cc1 =  0.41666666666666665390037e-1;
-    const double cc2 = -0.13888888888887398280412e-2;
-    const double cc3 =  0.248015872987670414957399e-4;
-    const double cc4 = -0.275573172723441909470836e-6;
-    const double cc5 =  0.208761463822329611076335e-8;
-    const double cc6 = -0.113826398067944859590880e-10;
-
-    double x2 = x * x;
-    double x3 = x2 * x;
-    double r = 0.5 * x2;
-    double t = 1.0 - r;
-
-    double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
-
-    double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
-                        x2*x2, fma(x, xx, (1.0 - t) - r));
-
-    double2 ret;
-    ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
-    ret.hi = cp;
-
-    return ret;
+  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+  //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+  //                      = x * f(w)
+  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+  // We use a minimax approximation of (f(w) - 1) / w
+  // because this produces an expansion in even powers of x.
+  // If xx (the tail of x) is non-zero, we add a correction
+  // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
+  // is an approximation to cos(x)*sin(xx) valid because
+  // xx is tiny relative to x.
+
+  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+  //                      = f(w)
+  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+  // because this produces an expansion in even powers of x.
+  // If xx (the tail of x) is non-zero, we subtract a correction
+  // term g(x,xx) = x*xx to the result, where g(x,xx)
+  // is an approximation to sin(x)*sin(xx) valid because
+  // xx is tiny relative to x.
+
+  const double sc1 = -0.166666666666666646259241729;
+  const double sc2 = 0.833333333333095043065222816e-2;
+  const double sc3 = -0.19841269836761125688538679e-3;
+  const double sc4 = 0.275573161037288022676895908448e-5;
+  const double sc5 = -0.25051132068021699772257377197e-7;
+  const double sc6 = 0.159181443044859136852668200e-9;
+
+  const double cc1 = 0.41666666666666665390037e-1;
+  const double cc2 = -0.13888888888887398280412e-2;
+  const double cc3 = 0.248015872987670414957399e-4;
+  const double cc4 = -0.275573172723441909470836e-6;
+  const double cc5 = 0.208761463822329611076335e-8;
+  const double cc6 = -0.113826398067944859590880e-10;
+
+  double x2 = x * x;
+  double x3 = x2 * x;
+  double r = 0.5 * x2;
+  double t = 1.0 - r;
+
+  double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
+
+  double cp =
+      t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2),
+                  x2, cc1),
+              x2 * x2, fma(x, xx, (1.0 - t) - r));
+
+  double2 ret;
+  ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5 * xx), x2, -xx));
+  ret.hi = cp;
+
+  return ret;
 }
 
 #endif
diff --git a/libclc/generic/lib/math/sincospiF_piby4.h b/libclc/generic/lib/math/sincospiF_piby4.h
index 90ecb1d7a6360e..d7c01c1bb13d36 100644
--- a/libclc/generic/lib/math/sincospiF_piby4.h
+++ b/libclc/generic/lib/math/sincospiF_piby4.h
@@ -19,38 +19,41 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
+#include <clc/math/clc_mad.h>
 
 // Evaluate single precisions in and cos of value in interval [-pi/4, pi/4]
-_CLC_INLINE float2
-__libclc__sincosf_piby4(float x)
-{
-    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
-    // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
-    // = x * f(w)
-    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
-    // We use a minimax approximation of (f(w) - 1) / w
-    // because this produces an expansion in even powers of x.
+_CLC_INLINE float2 __libclc__sincosf_piby4(float x) {
+  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+  // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+  // = x * f(w)
+  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+  // We use a minimax approximation of (f(w) - 1) / w
+  // because this produces an expansion in even powers of x.
 
-    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
-    // = f(w)
-    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
-    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
-    // because this produces an expansion in even powers of x.
+  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+  // = f(w)
+  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+  // because this produces an expansion in even powers of x.
 
-    const float sc1 = -0.166666666638608441788607926e0F;
-    const float sc2 =  0.833333187633086262120839299e-2F;
-    const float sc3 = -0.198400874359527693921333720e-3F;
-    const float sc4 =  0.272500015145584081596826911e-5F;
+  const float sc1 = -0.166666666638608441788607926e0F;
+  const float sc2 = 0.833333187633086262120839299e-2F;
+  const float sc3 = -0.198400874359527693921333720e-3F;
+  const float sc4 = 0.272500015145584081596826911e-5F;
 
-    const float cc1 =  0.41666666664325175238031e-1F;
-    const float cc2 = -0.13888887673175665567647e-2F;
-    const float cc3 =  0.24800600878112441958053e-4F;
-    const float cc4 = -0.27301013343179832472841e-6F;
+  const float cc1 = 0.41666666664325175238031e-1F;
+  const float cc2 = -0.13888887673175665567647e-2F;
+  const float cc3 = 0.24800600878112441958053e-4F;
+  const float cc4 = -0.27301013343179832472841e-6F;
 
-    float x2 = x * x;
+  float x2 = x * x;
 
-    float2 ret;
-    ret.x = mad(x*x2, mad(x2, mad(x2, mad(x2, sc4, sc3), sc2), sc1), x);
-    ret.y = mad(x2*x2, mad(x2, mad(x2, mad(x2, cc4, cc3), cc2), cc1), mad(x2, -0.5f, 1.0f));
-    return ret;
+  float2 ret;
+  ret.x = __clc_mad(
+      x * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, sc4, sc3), sc2), sc1),
+      x);
+  ret.y = __clc_mad(
+      x2 * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, cc4, cc3), cc2), cc1),
+      __clc_mad(x2, -0.5f, 1.0f));
+  return ret;
 }