[libclc] libclc: Update pow functions (PR #186890)

Tue Mar 17 00:46:02 PDT 2026

https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/186890

>From ffbdec6cf46144fca7f22370ca0db1124ab2a062 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 12 Mar 2026 13:02:07 +0100
Subject: [PATCH 1/3] libclc: Update pow functions

The 4 flavors of pow were originally ported from rocm
device libs between c45ec604f593fcb03d770f4398142d2446017f68,
cc5c65b2c25e0a82fbad95f0ce3bb5262e29eeee, and
fe8e00bc3c65115b2e3d2a43cf3d0d756a934a52. Update to a newer
version. Additionally expose fast variants for use by the
libcall optimizer (e.g., __pow_fast) for float types.
---
 libclc/clc/include/clc/math/clc_ep_decl.inc   |   5 +
 libclc/clc/include/clc/math/clc_exp2_fast.h   |  19 +
 libclc/clc/include/clc/math/clc_log2_fast.h   |  19 +
 libclc/clc/include/clc/math/clc_pow.h         |   7 +-
 libclc/clc/include/clc/math/clc_pown.h        |   9 +-
 libclc/clc/include/clc/math/clc_powr.h        |   9 +-
 libclc/clc/include/clc/math/clc_rootn.h       |   9 +-
 ...def_with_int_second_arg_scalarize_loop.inc |  37 ++
 libclc/clc/lib/amdgpu/CMakeLists.txt          |   2 +
 libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl   |  33 ++
 libclc/clc/lib/amdgpu/math/clc_log2_fast.cl   |  25 +
 libclc/clc/lib/generic/CMakeLists.txt         |   2 +
 libclc/clc/lib/generic/math/clc_ep.cl         |   3 +
 libclc/clc/lib/generic/math/clc_ep.inc        |  78 +++
 libclc/clc/lib/generic/math/clc_exp2_fast.cl  |  15 +
 libclc/clc/lib/generic/math/clc_log2_fast.cl  |  15 +
 libclc/clc/lib/generic/math/clc_pow.cl        |  43 +-
 libclc/clc/lib/generic/math/clc_pow.inc       | 438 --------------
 libclc/clc/lib/generic/math/clc_pow_base.inc  | 542 ++++++++++++++++++
 libclc/clc/lib/generic/math/clc_pown.cl       |  42 +-
 libclc/clc/lib/generic/math/clc_pown.inc      | 402 -------------
 libclc/clc/lib/generic/math/clc_powr.cl       |  44 +-
 libclc/clc/lib/generic/math/clc_powr.inc      | 414 -------------
 libclc/clc/lib/generic/math/clc_rootn.cl      |  41 +-
 libclc/clc/lib/generic/math/clc_rootn.inc     | 405 -------------
 libclc/opencl/lib/generic/math/pow.cl         |  14 +-
 libclc/opencl/lib/generic/math/pown.cl        |  12 +-
 libclc/opencl/lib/generic/math/powr.cl        |  12 +-
 libclc/opencl/lib/generic/math/rootn.cl       |  12 +-
 29 files changed, 986 insertions(+), 1722 deletions(-)
 create mode 100644 libclc/clc/include/clc/math/clc_exp2_fast.h
 create mode 100644 libclc/clc/include/clc/math/clc_log2_fast.h
 create mode 100644 libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc
 create mode 100644 libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl
 create mode 100644 libclc/clc/lib/amdgpu/math/clc_log2_fast.cl
 create mode 100644 libclc/clc/lib/generic/math/clc_exp2_fast.cl
 create mode 100644 libclc/clc/lib/generic/math/clc_log2_fast.cl
 delete mode 100644 libclc/clc/lib/generic/math/clc_pow.inc
 create mode 100644 libclc/clc/lib/generic/math/clc_pow_base.inc
 delete mode 100644 libclc/clc/lib/generic/math/clc_pown.inc
 delete mode 100644 libclc/clc/lib/generic/math/clc_powr.inc
 delete mode 100644 libclc/clc/lib/generic/math/clc_rootn.inc

diff --git a/libclc/clc/include/clc/math/clc_ep_decl.inc b/libclc/clc/include/clc/math/clc_ep_decl.inc
index d29cfdc6346ba..1739da9f3ae63 100644
--- a/libclc/clc/include/clc/math/clc_ep_decl.inc
+++ b/libclc/clc/include/clc/math/clc_ep_decl.inc
@@ -128,4 +128,9 @@ _CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_sqrt(__CLC_GENTYPE a);
 
 _CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_sqrt(__CLC_EP_PAIR a);
 
+#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64
+_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR a);
+_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(__CLC_GENTYPE a);
+#endif
+
 #endif
diff --git a/libclc/clc/include/clc/math/clc_exp2_fast.h b/libclc/clc/include/clc/math/clc_exp2_fast.h
new file mode 100644
index 0000000000000..a42e6c9b7fd48
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_exp2_fast.h
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_EXP2_FAST_H__
+#define __CLC_MATH_CLC_EXP2_FAST_H__
+
+#define __CLC_FUNCTION __clc_exp2_fast
+#define __CLC_BODY <clc/shared/unary_decl.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_EXP2_FAST_H__
diff --git a/libclc/clc/include/clc/math/clc_log2_fast.h b/libclc/clc/include/clc/math/clc_log2_fast.h
new file mode 100644
index 0000000000000..5160afbedebf7
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_log2_fast.h
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_LOG2_FAST_H__
+#define __CLC_MATH_CLC_LOG2_FAST_H__
+
+#define __CLC_FUNCTION __clc_log2_fast
+#define __CLC_BODY <clc/shared/unary_decl.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_LOG2_FAST_H__
diff --git a/libclc/clc/include/clc/math/clc_pow.h b/libclc/clc/include/clc/math/clc_pow.h
index 5e37e5bf6da65..f7399873a4994 100644
--- a/libclc/clc/include/clc/math/clc_pow.h
+++ b/libclc/clc/include/clc/math/clc_pow.h
@@ -11,9 +11,14 @@
 
 #define __CLC_BODY <clc/shared/binary_decl.inc>
 #define __CLC_FUNCTION __clc_pow
-
 #include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
 
+#define __CLC_FLOAT_ONLY
+#define __CLC_BODY <clc/shared/binary_decl.inc>
+#define __CLC_FUNCTION __clc_pow_fast
+#include <clc/math/gentype.inc>
 #undef __CLC_FUNCTION
+#undef __CLC_FLOAT_ONLY
 
 #endif // __CLC_MATH_CLC_POW_H__
diff --git a/libclc/clc/include/clc/math/clc_pown.h b/libclc/clc/include/clc/math/clc_pown.h
index 30628efb19001..3e2b359468b48 100644
--- a/libclc/clc/include/clc/math/clc_pown.h
+++ b/libclc/clc/include/clc/math/clc_pown.h
@@ -9,11 +9,16 @@
 #ifndef __CLC_MATH_CLC_POWN_H__
 #define __CLC_MATH_CLC_POWN_H__
 
-#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
 #define __CLC_FUNCTION __clc_pown
-
+#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
 #include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
 
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_pown_fast
+#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
+#include <clc/math/gentype.inc>
 #undef __CLC_FUNCTION
+#undef __CLC_FLOAT_ONLY
 
 #endif // __CLC_MATH_CLC_POWN_H__
diff --git a/libclc/clc/include/clc/math/clc_powr.h b/libclc/clc/include/clc/math/clc_powr.h
index baa494cce6989..67c591ca6aa82 100644
--- a/libclc/clc/include/clc/math/clc_powr.h
+++ b/libclc/clc/include/clc/math/clc_powr.h
@@ -9,11 +9,16 @@
 #ifndef __CLC_MATH_CLC_POWR_H__
 #define __CLC_MATH_CLC_POWR_H__
 
-#define __CLC_BODY <clc/shared/binary_decl.inc>
 #define __CLC_FUNCTION __clc_powr
-
+#define __CLC_BODY <clc/shared/binary_decl.inc>
 #include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
 
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_powr_fast
+#define __CLC_BODY <clc/shared/binary_decl.inc>
+#include <clc/math/gentype.inc>
 #undef __CLC_FUNCTION
+#undef __CLC_FLOAT_ONLY
 
 #endif // __CLC_MATH_CLC_POWR_H__
diff --git a/libclc/clc/include/clc/math/clc_rootn.h b/libclc/clc/include/clc/math/clc_rootn.h
index 90a25ad52d867..26d111a8671d4 100644
--- a/libclc/clc/include/clc/math/clc_rootn.h
+++ b/libclc/clc/include/clc/math/clc_rootn.h
@@ -9,11 +9,16 @@
 #ifndef __CLC_MATH_CLC_ROOTN_H__
 #define __CLC_MATH_CLC_ROOTN_H__
 
-#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
 #define __CLC_FUNCTION __clc_rootn
-
+#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
 #include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
 
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_rootn_fast
+#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
+#include <clc/math/gentype.inc>
 #undef __CLC_FUNCTION
+#undef __CLC_FLOAT_ONLY
 
 #endif // __CLC_MATH_CLC_ROOTN_H__
diff --git a/libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc b/libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc
new file mode 100644
index 0000000000000..498df2fc420de
--- /dev/null
+++ b/libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/utils.h"
+
+#if __CLC_VECSIZE_OR_1 >= 2 // Body is emitted only when the vector size is 2 or more.
+
+#ifndef __CLC_IMPL_FUNCTION
+#define __CLC_IMPL_FUNCTION __CLC_FUNCTION // Default per-element callee: __CLC_FUNCTION itself.
+#endif
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
+                                                    __CLC_INTN y) {
+  union { // Lets the gentype vector be read/written as an array of scalars.
+    __CLC_GENTYPE vec;
+    __CLC_SCALAR_GENTYPE arr[__CLC_VECSIZE_OR_1];
+  } u_x, u_result;
+
+  union { // Same array view for the integer second argument.
+    __CLC_INTN vec;
+    int arr[__CLC_VECSIZE_OR_1];
+  } u_y;
+
+  u_x.vec = x;
+  u_y.vec = y;
+  for (int i = 0; i < __CLC_VECSIZE_OR_1; ++i) { // One __CLC_IMPL_FUNCTION call per element.
+    u_result.arr[i] = __CLC_IMPL_FUNCTION(u_x.arr[i], u_y.arr[i]);
+  }
+  return u_result.vec;
+}
+
+#endif // __CLC_VECSIZE_OR_1 >= 2
diff --git a/libclc/clc/lib/amdgpu/CMakeLists.txt b/libclc/clc/lib/amdgpu/CMakeLists.txt
index 9b6c9a231ade0..daccc00b841b3 100644
--- a/libclc/clc/lib/amdgpu/CMakeLists.txt
+++ b/libclc/clc/lib/amdgpu/CMakeLists.txt
@@ -3,6 +3,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES
   address_space/clc_qualifier.cl
   math/clc_exp.cl
   math/clc_exp2.cl
+  math/clc_exp2_fast.cl
   math/clc_exp10.cl
   math/clc_frexp.cl
   math/clc_half_exp.cl
@@ -15,6 +16,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES
   math/clc_half_rsqrt.cl
   math/clc_half_sqrt.cl
   math/clc_ldexp.cl
+  math/clc_log2_fast.cl
   math/clc_native_exp.cl
   math/clc_native_exp2.cl
   math/clc_native_log10.cl
diff --git a/libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl b/libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl
new file mode 100644
index 0000000000000..b73bc9f6e260b
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_MIN_VECSIZE 1
+#define __CLC_FUNCTION __clc_exp2_fast
+#define __CLC_IMPL_FUNCTION(x) __builtin_amdgcn_exp2f(x)
+#define __CLC_BODY <clc/shared/unary_def_scalarize.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_FLOAT_ONLY
+
+#define __CLC_HALF_ONLY
+#define __CLC_IMPL_FUNCTION(x) __clc_exp2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_HALF_ONLY
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_IMPL_FUNCTION(x) __clc_exp2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_DOUBLE_ONLY
diff --git a/libclc/clc/lib/amdgpu/math/clc_log2_fast.cl b/libclc/clc/lib/amdgpu/math/clc_log2_fast.cl
new file mode 100644
index 0000000000000..a47fc84b26a00
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/math/clc_log2_fast.cl
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_MIN_VECSIZE 1
+#define __CLC_FUNCTION __clc_log2_fast // Stays defined for the half and double sections below.
+#define __CLC_IMPL_FUNCTION(x) __builtin_amdgcn_logf(x) // float: AMDGCN builtin. NOTE(review): presumably base-2 - confirm.
+#define __CLC_BODY <clc/shared/unary_def_scalarize.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_FLOAT_ONLY
+
+#define __CLC_HALF_ONLY
+#define __CLC_IMPL_FUNCTION(x) __clc_log2 // half: forward to __clc_log2.
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_HALF_ONLY
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_IMPL_FUNCTION(x) __clc_log2 // double: forward to __clc_log2.
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_DOUBLE_ONLY
diff --git a/libclc/clc/lib/generic/CMakeLists.txt b/libclc/clc/lib/generic/CMakeLists.txt
index 70a0863524b19..bda2ec67a55c3 100644
--- a/libclc/clc/lib/generic/CMakeLists.txt
+++ b/libclc/clc/lib/generic/CMakeLists.txt
@@ -80,6 +80,7 @@ libclc_configure_source_list(CLC_GENERIC_SOURCES
   math/clc_erfc.cl
   math/clc_exp.cl
   math/clc_exp2.cl
+  math/clc_exp2_fast.cl
   math/clc_exp10.cl
   math/clc_exp_helper.cl
   math/clc_expm1.cl
@@ -114,6 +115,7 @@ libclc_configure_source_list(CLC_GENERIC_SOURCES
   math/clc_lgamma_r.cl
   math/clc_log.cl
   math/clc_log2.cl
+  math/clc_log2_fast.cl
   math/clc_log10.cl
   math/clc_log1p.cl
   math/clc_logb.cl
diff --git a/libclc/clc/lib/generic/math/clc_ep.cl b/libclc/clc/lib/generic/math/clc_ep.cl
index f0e3020f65f6f..2e62563e2a107 100644
--- a/libclc/clc/lib/generic/math/clc_ep.cl
+++ b/libclc/clc/lib/generic/math/clc_ep.cl
@@ -9,8 +9,11 @@
 #include "clc/clc_convert.h"
 #include "clc/math/clc_div_fast.h"
 #include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp.h"
 #include "clc/math/clc_fma.h"
+#include "clc/math/clc_frexp.h"
 #include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_mad.h"
 #include "clc/math/clc_recip_fast.h"
 #include "clc/math/clc_sqrt_fast.h"
 #include "clc/relational/clc_isinf.h"
diff --git a/libclc/clc/lib/generic/math/clc_ep.inc b/libclc/clc/lib/generic/math/clc_ep.inc
index 38fa513c46aac..56a7f1a7becb2 100644
--- a/libclc/clc/lib/generic/math/clc_ep.inc
+++ b/libclc/clc/lib/generic/math/clc_ep.inc
@@ -387,5 +387,83 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_sqrt(__CLC_EP_PAIR a) {
                            a.hi == __CLC_FP_LIT(0.0) ? __CLC_FP_LIT(0.0) : slo);
 }
 
+#if __CLC_FPSIZE == 32
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR x) { // exp of the hi/lo pair x (32-bit variant).
+  float d = x.hi == 0x1.62e430p+6f ? 0x1.0p-17f : 0.0f; // Nonzero only for this exact hi (~128*ln(2)). NOTE(review): presumably keeps exp(hi) finite at the overflow edge - confirm.
+  x.hi -= d; // Move d from hi ...
+  x.lo += d; // ... into lo, leaving hi + lo unchanged.
+  float z = __clc_exp(x.hi);
+  float zz = __clc_fma(z, x.lo, z); // z + z*lo, i.e. z * (1 + lo): first-order correction for the low part.
+  return __clc_isinf(z) ? z : zz; // If exp(hi) is infinite, return it instead of the corrected value.
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(float a) { // Natural log of a, returned as a hi/lo pair.
+  int a_exp;
+  float m = __clc_frexp(a, &a_exp); // Mantissa m and exponent a_exp of a (frexp convention assumed: a = m * 2^a_exp).
+  int b = m < (2.0f / 3.0f);
+  m = __clc_ldexp(m, b); // Scale m by 2^b: doubled when m < 2/3, unchanged otherwise.
+  int e = a_exp - b; // Compensate the exponent for that scaling.
+
+  __CLC_EP_PAIR x = __clc_ep_div(m - 1.0f, __clc_ep_fast_add(1.0f, m)); // x = (m - 1) / (m + 1)
+  __CLC_EP_PAIR s = __clc_ep_sqr(x); // s = x^2
+  float t = s.hi; // The polynomial below uses only the high part of s.
+  float p = __clc_mad(t, __clc_mad(t, 0x1.ed89c2p-3f, 0x1.23e988p-2f),
+                      0x1.999bdep-2f); // Quadratic in t, nested (Horner) form.
+
+  // Result: ln(2)*e + 2*x + x^3*(c3 + x^2*p); ln(2) and c3 are hi/lo constant pairs.
+  float2 r = __clc_ep_add(
+      __clc_ep_mul(__clc_ep_make_pair(0x1.62e430p-1f, -0x1.05c610p-29f),
+                   (float)e),
+      __clc_ep_fast_add(
+          __clc_ep_ldexp(x, 1),
+          __clc_ep_mul(__clc_ep_mul(s, x),
+                       __clc_ep_fast_add(
+                           __clc_ep_make_pair(0x1.555554p-1f, 0x1.e72020p-29f),
+                           __clc_ep_mul(s, p)))));
+  return r;
+}
+
+#elif __CLC_FPSIZE == 64
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR x) { // exp of the hi/lo pair x (64-bit variant).
+  __CLC_GENTYPE z = __clc_exp(x.hi);
+  __CLC_GENTYPE zz = __clc_mad(z, x.lo, z); // z + z*lo, i.e. z * (1 + lo): first-order correction for the low part.
+  return __clc_isinf(z) ? z : zz; // If exp(hi) is infinite, return it instead of the corrected value.
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(double a) { // Natural log of a, returned as a hi/lo pair.
+  int a_exp;
+  double m = __clc_frexp(a, &a_exp); // Mantissa m and exponent a_exp of a (frexp convention assumed: a = m * 2^a_exp).
+  int b = m < __CLC_FP_LIT(2.0 / 3.0);
+  m = __clc_ldexp(m, b); // Scale m by 2^b: doubled when m < 2/3, unchanged otherwise.
+  int e = a_exp - b; // Compensate the exponent for that scaling.
+
+  double2 x = __clc_ep_div(m - 1.0, __clc_ep_fast_add(1.0, m)); // x = (m - 1) / (m + 1)
+  double2 s = __clc_ep_sqr(x); // s = x^2
+  double t = s.hi; // The polynomial below uses only the high part of s.
+  double p = __clc_mad(t, __clc_mad(t, __clc_mad(t, __clc_mad(t,
+             __clc_mad(t, __clc_mad(t, __clc_mad(t, __clc_mad(t,
+                 0x1.dee674222de17p-4, 0x1.a6564968915a9p-4), 0x1.e25e43abe935ap-4), 0x1.110ef47e6c9c2p-3),
+                 0x1.3b13bcfa74449p-3), 0x1.745d171bf3c30p-3), 0x1.c71c71c7792cep-3), 0x1.24924924920dap-2),
+                 0x1.999999999999cp-2); // Degree-8 polynomial in t, nested (Horner) form.
+
+  // Result: ln(2)*e + 2*x + x^3*(c3 + x^2*p); ln(2) and c3 are hi/lo constant pairs.
+  double2 r = __clc_ep_add(
+      __clc_ep_mul(
+          __clc_ep_make_pair(0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56),
+          (double)e),
+      __clc_ep_fast_add(
+          __clc_ep_ldexp(x, 1),
+          __clc_ep_mul(
+              __clc_ep_mul(s, x),
+              __clc_ep_fast_add(__clc_ep_make_pair(0x1.5555555555555p-1,
+                                                   0x1.543b0d5df274dp-55),
+                                __clc_ep_mul(s, p)))));
+  return r;
+}
+
+#endif
+
 #undef __CLC_EP_USE_FMA
 #endif
diff --git a/libclc/clc/lib/generic/math/clc_exp2_fast.cl b/libclc/clc/lib/generic/math/clc_exp2_fast.cl
new file mode 100644
index 0000000000000..e09bd65d7e02a
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_exp2_fast.cl
@@ -0,0 +1,15 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+
+#define __CLC_FUNCTION __clc_exp2_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_exp2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_log2_fast.cl b/libclc/clc/lib/generic/math/clc_log2_fast.cl
new file mode 100644
index 0000000000000..2aad63967e888
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_log2_fast.cl
@@ -0,0 +1,15 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+
+#define __CLC_FUNCTION __clc_log2_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_log2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_pow.cl b/libclc/clc/lib/generic/math/clc_pow.cl
index 70d3d614a8d36..14fbfb68359f5 100644
--- a/libclc/clc/lib/generic/math/clc_pow.cl
+++ b/libclc/clc/lib/generic/math/clc_pow.cl
@@ -6,16 +6,35 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/clc_convert.h>
-#include <clc/internal/clc.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
-#include <clc/relational/clc_select.h>
+#include "clc/clc_convert.h"
+#include "clc/float/definitions.h"
+#include "clc/internal/clc.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_log.h"
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_recip_fast.h"
+#include "clc/math/clc_trunc.h"
+#include "clc/math/math.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_isunordered.h"
 
-#define __CLC_BODY <clc_pow.inc>
-#include <clc/math/gentype.inc>
+#define COMPILING_POW
+#define __CLC_BODY "clc_pow_base.inc"
+#include "clc/math/gentype.inc"
+
+#define __CLC_FUNCTION __clc_pow
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_pow_fast
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_pow.inc b/libclc/clc/lib/generic/math/clc_pow.inc
deleted file mode 100644
index 35cbcdae8ffff..0000000000000
--- a/libclc/clc/lib/generic/math/clc_pow.inc
+++ /dev/null
@@ -1,438 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Computes pow using log and exp
-//
-//   x^y = exp(y * log(x))
-//
-// We take care not to lose precision in the intermediate steps
-//
-// When computing log, calculate it in splits:
-//
-//   r = f * (p_invead + p_inv_tail)
-//   r = rh + rt
-//
-// Calculate log polynomial using r, in end addition, do:
-//
-//   poly = poly + ((rh-r) + rt)
-//
-//   lth = -r
-//   ltt = ((xexp * log2_t) - poly) + logT
-//   lt = lth + ltt
-//
-//   lh = (xexp * log2_h) + logH
-//   l = lh + lt
-//
-// Calculate final log answer as gh and gt:
-//
-//   gh = l & higher-half bits
-//   gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
-//
-//   yh = y & higher-half bits
-//   yt = y - yh
-//
-// Before entering computation of exp:
-//
-//   vs = ((yt*gt + yt*gh) + yh*gt)
-//   v = vs + yh*gh
-//   vt = ((yh*gh - v) + vs)
-//
-// In calculation of exp, add vt to r that is used for poly.
-//
-// At the end of exp, do
-//
-//   ((((expT * poly) + expT) + expH*poly) + expH)
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
-                                               __CLC_GENTYPE y) {
-  __CLC_GENTYPE absx = __clc_fabs(x);
-  __CLC_INTN ix = __CLC_AS_INTN(x);
-  __CLC_INTN ax = __CLC_AS_INTN(absx);
-  __CLC_INTN xpos = ix == ax;
-
-  __CLC_INTN iy = __CLC_AS_INTN(y);
-  __CLC_INTN ay = __CLC_AS_INTN(__clc_fabs(y));
-  __CLC_INTN ypos = iy == ay;
-
-  /* Extra precise log calculation
-   *  First handle case that x is close to 1
-   */
-  __CLC_GENTYPE r = 1.0f - absx;
-  __CLC_INTN near1 = __clc_fabs(r) < 0x1.0p-4f;
-  __CLC_GENTYPE r2 = r * r;
-
-  /* Coefficients are just 1/3, 1/4, 1/5 and 1/6 */
-  __CLC_GENTYPE poly = __clc_mad(
-      r,
-      __clc_mad(r,
-                __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
-                          0x1.99999ap-3f),
-                0x1.000000p-2f),
-      0x1.555556p-2f);
-
-  poly *= r2 * r;
-
-  __CLC_GENTYPE lth_near1 = -r2 * 0.5f;
-  __CLC_GENTYPE ltt_near1 = -poly;
-  __CLC_GENTYPE lt_near1 = lth_near1 + ltt_near1;
-  __CLC_GENTYPE lh_near1 = -r;
-  __CLC_GENTYPE l_near1 = lh_near1 + lt_near1;
-
-  /* Computations for x not near 1 */
-  __CLC_INTN m = __CLC_CONVERT_INTN(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
-  __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m);
-  __CLC_INTN ixs = __CLC_AS_INTN(__CLC_AS_GENTYPE(ax | 0x3f800000) - 1.0f);
-  __CLC_GENTYPE mfs = __CLC_CONVERT_GENTYPE((ixs >> EXPSHIFTBITS_SP32) - 253);
-  __CLC_INTN c = m == -127;
-  __CLC_INTN ixn = c ? ixs : ax;
-  __CLC_GENTYPE mfn = c ? mfs : mf;
-
-  __CLC_INTN indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
-  /* F - Y */
-  __CLC_GENTYPE f = __CLC_AS_GENTYPE(0x3f000000 | indx) -
-                    __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32));
-
-  indx = indx >> 16;
-  __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx);
-  __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx);
-  r = rh + rt;
-
-  poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
-         (r * r);
-  poly += (rh - r) + rt;
-
-  const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f;  /* 0.693115234 */
-  const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; /* 0.0000319461833 */
-  __CLC_GENTYPE logel = __CLC_USE_TABLE(loge_tbl_lo, indx);
-  __CLC_GENTYPE logeh = __CLC_USE_TABLE(loge_tbl_hi, indx);
-  __CLC_GENTYPE lth = -r;
-  __CLC_GENTYPE ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + logeh;
-  __CLC_GENTYPE lt = lth + ltt;
-  __CLC_GENTYPE lh = __clc_mad(mfn, LOG2_HEAD, logel);
-  __CLC_GENTYPE l = lh + lt;
-
-  /* Select near 1 or not */
-  lth = near1 ? lth_near1 : lth;
-  ltt = near1 ? ltt_near1 : ltt;
-  lt = near1 ? lt_near1 : lt;
-  lh = near1 ? lh_near1 : lh;
-  l = near1 ? l_near1 : l;
-
-  __CLC_GENTYPE gh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(l) & 0xfffff000);
-  __CLC_GENTYPE gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
-  __CLC_GENTYPE yh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(iy) & 0xfffff000);
-
-  __CLC_GENTYPE yt = y - yh;
-
-  __CLC_GENTYPE ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
-  __CLC_GENTYPE ylogx = __clc_mad(yh, gh, ylogx_s);
-  __CLC_GENTYPE ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
-
-  /* Extra precise exp of ylogx */
-  /* 64/log2 : 92.332482616893657 */
-  const __CLC_GENTYPE R_64_BY_LOG2 = 0x1.715476p+6f;
-  __CLC_INTN n = __CLC_CONVERT_INTN(ylogx * R_64_BY_LOG2);
-  __CLC_GENTYPE nf = __CLC_CONVERT_GENTYPE(n);
-
-  __CLC_INTN j = n & 0x3f;
-  m = n >> 6;
-  __CLC_INTN m2 = m << EXPSHIFTBITS_SP32;
-
-  /* log2/64 lead: 0.0108032227 */
-  const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f;
-  /* log2/64 tail: 0.0000272020388 */
-  const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
-  r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
-      ylogx_t;
-
-  /* Truncated Taylor series for e^r */
-  poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
-                             0x1.000000p-1f),
-                   r * r, r);
-
-  __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j);
-  __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j);
-
-  __CLC_GENTYPE expylogx =
-      __clc_mad(exp_head, poly, __clc_mad(exp_tail, poly, exp_tail)) + exp_head;
-  __CLC_GENTYPE sexpylogx =
-      expylogx * __CLC_AS_GENTYPE((__CLC_UINTN)0x1 << (m + 149));
-  __CLC_GENTYPE texpylogx = __CLC_AS_GENTYPE(__CLC_AS_INTN(expylogx) + m2);
-  expylogx = m < -125 ? sexpylogx : texpylogx;
-
-  /* Result is +-Inf if (ylogx + ylogx_t) > 128*log2 */
-  expylogx =
-      __clc_select(expylogx, __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32),
-                   ylogx > 0x1.62e430p+6f ||
-                       (ylogx == 0x1.62e430p+6f && ylogx_t > -0x1.05c610p-22f));
-
-  /* Result is 0 if ylogx < -149*log2 */
-  expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
-  /* Classify y:
-   *   inty = 0 means not an integer.
-   *   inty = 1 means odd integer.
-   *   inty = 2 means even integer.
-   */
-
-  __CLC_INTN yexp =
-      __CLC_CONVERT_INTN(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
-  __CLC_INTN mask = ((__CLC_INTN)1 << (24 - yexp)) - 1;
-  __CLC_INTN yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
-  __CLC_INTN inty = yodd ? 1 : 2;
-  inty = (iy & mask) != 0 ? 0 : inty;
-  inty = yexp < 1 ? 0 : inty;
-  inty = yexp > 24 ? 2 : inty;
-
-  __CLC_GENTYPE signval =
-      __CLC_AS_GENTYPE((__CLC_AS_UINTN(expylogx) ^ SIGNBIT_SP32));
-  expylogx = ((inty == 1) && !xpos) ? signval : expylogx;
-  __CLC_INTN ret = __CLC_AS_INTN(expylogx);
-
-  /* Corner case handling */
-  ret = (!xpos && (inty == 0)) ? QNANBITPATT_SP32 : ret;
-  ret = ax < 0x3f800000 && iy == (__CLC_INTN)NINFBITPATT_SP32 ? PINFBITPATT_SP32
-                                                              : ret;
-  ret = ax > 0x3f800000 && iy == (__CLC_INTN)NINFBITPATT_SP32 ? 0 : ret;
-  ret = ax < 0x3f800000 && iy == (__CLC_INTN)PINFBITPATT_SP32 ? 0 : ret;
-  ret = ax > 0x3f800000 && iy == (__CLC_INTN)PINFBITPATT_SP32 ? PINFBITPATT_SP32
-                                                              : ret;
-  __CLC_BIT_INTN x_is_ninf = ix == (__CLC_INTN)NINFBITPATT_SP32;
-  __CLC_BIT_INTN x_is_pinf = ix == (__CLC_INTN)PINFBITPATT_SP32;
-  __CLC_INTN xinf =
-      xpos ? (__CLC_INTN)PINFBITPATT_SP32 : (__CLC_INTN)NINFBITPATT_SP32;
-
-  ret = ((ax == 0) && !ypos && (inty == 1)) ? xinf : ret;
-  ret = ((ax == 0) && !ypos && (inty != 1)) ? PINFBITPATT_SP32 : ret;
-  __CLC_INTN xzero = xpos ? (__CLC_INTN)0 : (__CLC_INTN)0x80000000;
-  ret = ((ax == 0) && ypos && (inty == 1)) ? xzero : ret;
-  ret = ((ax == 0) && ypos && (inty != 1)) ? 0 : ret;
-  ret = ((ax == 0) && (iy == (__CLC_INTN)NINFBITPATT_SP32)) ? PINFBITPATT_SP32
-                                                            : ret;
-  ret = (ix == (__CLC_INTN)0xbf800000 && ay == PINFBITPATT_SP32) ? 0x3f800000
-                                                                 : ret;
-  ret = (x_is_ninf && !ypos && (inty == 1)) ? (__CLC_INTN)0x80000000 : ret;
-  ret = (x_is_ninf && !ypos && (inty != 1)) ? 0 : ret;
-  ret = (x_is_ninf && ypos && (inty == 1)) ? (__CLC_INTN)NINFBITPATT_SP32 : ret;
-  ret = (x_is_ninf && ypos && (inty != 1)) ? (__CLC_INTN)PINFBITPATT_SP32 : ret;
-  ret = (x_is_pinf && !ypos) ? 0 : ret;
-  ret = (x_is_pinf && ypos) ? PINFBITPATT_SP32 : ret;
-  ret = (ax > PINFBITPATT_SP32) ? ix : ret;
-  ret = (ay > PINFBITPATT_SP32) ? iy : ret;
-  ret = ay == 0 ? 0x3f800000 : ret;
-  ret = ix == 0x3f800000 ? 0x3f800000 : ret;
-
-  return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
-                                               __CLC_GENTYPE y) {
-  const __CLC_GENTYPE real_log2_tail = 5.76999904754328540596e-08;
-  const __CLC_GENTYPE real_log2_lead = 6.93147122859954833984e-01;
-
-  __CLC_LONGN ux = __CLC_AS_LONGN(x);
-  __CLC_LONGN ax = __CLC_AS_LONGN(__clc_fabs(x));
-  __CLC_BIT_INTN xpos = ax == ux;
-
-  __CLC_LONGN uy = __CLC_AS_LONGN(y);
-  __CLC_LONGN ay = __CLC_AS_LONGN(__clc_fabs(y));
-  __CLC_BIT_INTN ypos = ay == uy;
-
-  // Extended precision log
-  __CLC_GENTYPE v, vt;
-  {
-    __CLC_INTN exp = __CLC_CONVERT_INTN(ax >> 52) - 1023;
-    __CLC_INTN mask_exp_1023 = exp == (__CLC_INTN)-1023;
-    __CLC_GENTYPE xexp = __CLC_CONVERT_GENTYPE(exp);
-    __CLC_LONGN mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
-    __CLC_LONGN temp_ux =
-        __CLC_AS_LONGN(__CLC_AS_GENTYPE(0x3ff0000000000000L | mantissa) - 1.0);
-    exp = __CLC_CONVERT_INTN((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
-    __CLC_GENTYPE xexp1 = __CLC_CONVERT_GENTYPE(exp);
-    __CLC_LONGN mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
-    xexp = __CLC_CONVERT_LONGN(mask_exp_1023) ? xexp1 : xexp;
-    mantissa = __CLC_CONVERT_LONGN(mask_exp_1023) ? mantissa1 : mantissa;
-
-    __CLC_LONGN rax = (mantissa & 0x000ff00000000000) +
-                      ((mantissa & 0x0000080000000000) << 1);
-    __CLC_INTN index = __CLC_CONVERT_INTN(rax >> 44);
-
-    __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L);
-    __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L);
-    __CLC_GENTYPE f = F - Y;
-    __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index);
-    __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index);
-    __CLC_GENTYPE f_inv = (log_h + log_t) * f;
-    __CLC_GENTYPE r1 =
-        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L);
-    __CLC_GENTYPE r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
-    __CLC_GENTYPE r = r1 + r2;
-
-    __CLC_GENTYPE poly = __clc_fma(
-        r,
-        __clc_fma(r,
-                  __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
-                  1.0 / 4.0),
-        1.0 / 3.0);
-    poly = poly * r * r * r;
-
-    __CLC_GENTYPE hr1r1 = 0.5 * r1 * r1;
-    __CLC_GENTYPE poly0h = r1 + hr1r1;
-    __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1;
-    poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
-
-    log_h = __CLC_USE_TABLE(powlog_tbl_head, index);
-    log_t = __CLC_USE_TABLE(powlog_tbl_tail, index);
-
-    __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
-    __CLC_GENTYPE resT = resT_t - poly0h;
-    __CLC_GENTYPE resH = __clc_fma(xexp, real_log2_lead, log_h);
-    __CLC_GENTYPE resT_h = poly0h;
-
-    __CLC_GENTYPE H = resT + resH;
-    __CLC_GENTYPE H_h =
-        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(H) & 0xfffffffff8000000L);
-    __CLC_GENTYPE T =
-        (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
-    H = H_h;
-
-    __CLC_GENTYPE y_head =
-        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(uy) & 0xfffffffff8000000L);
-    __CLC_GENTYPE y_tail = y - y_head;
-
-    __CLC_GENTYPE temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
-    v = __clc_fma(y_head, H, temp);
-    vt = __clc_fma(y_head, H, -v) + temp;
-  }
-
-  // Now calculate exp of (v,vt)
-
-  __CLC_GENTYPE expv;
-  {
-    const __CLC_GENTYPE max_exp_arg = 709.782712893384;
-    const __CLC_GENTYPE min_exp_arg = -745.1332191019411;
-    const __CLC_GENTYPE sixtyfour_by_lnof2 = 92.33248261689366;
-    const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081;
-    const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10;
-
-    // If v is so large that we need to return INFINITY, or so small that we
-    // need to return 0, set v to known values that will produce that result. Do
-    // not try to continue the computation with the original v and patch it up
-    // afterwards because v may be so large that temp is out of range of int, in
-    // which case that conversion, and a value based on that conversion being
-    // passed to __clc_ldexp, results in undefined behavior.
-    v = v > max_exp_arg ? 1000.0 : v;
-    v = v < min_exp_arg ? -1000.0 : v;
-
-    __CLC_GENTYPE temp = v * sixtyfour_by_lnof2;
-    __CLC_INTN n = __CLC_CONVERT_INTN(temp);
-    __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n);
-    __CLC_INTN j = n & 0x0000003f;
-    __CLC_INTN m = n >> 6;
-
-    __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j);
-    __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j);
-    __CLC_GENTYPE f = f1 + f2;
-
-    __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v);
-    __CLC_GENTYPE r2 = dn * lnof2_by_64_tail;
-    __CLC_GENTYPE r = (r1 + r2) + vt;
-
-    __CLC_GENTYPE q =
-        __clc_fma(r,
-                  __clc_fma(r,
-                            __clc_fma(r,
-                                      __clc_fma(r, 1.38889490863777199667e-03,
-                                                8.33336798434219616221e-03),
-                                      4.16666666662260795726e-02),
-                            1.66666666665260878863e-01),
-                  5.00000000000000008883e-01);
-    q = __clc_fma(r * r, q, r);
-
-    expv = __clc_fma(f, q, f2) + f1;
-    expv = __clc_ldexp(expv, m);
-  }
-
-  // See whether y is an integer.
-  // inty = 0 means not an integer.
-  // inty = 1 means odd integer.
-  // inty = 2 means even integer.
-
-  __CLC_LONGN inty;
-  {
-    __CLC_INTN yexp =
-        __CLC_CONVERT_INTN(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
-    inty = __CLC_CONVERT_LONGN(yexp < 1 ? 0 : 2);
-    inty = __CLC_CONVERT_LONGN(yexp > 53) ? 2 : inty;
-    __CLC_LONGN mask = ((__CLC_LONGN)1L << (53 - yexp)) - 1L;
-    __CLC_LONGN inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1L : 2L;
-    inty1 = (ay & mask) != 0 ? 0 : inty1;
-    inty = __CLC_CONVERT_LONGN(!(yexp < 1) && !(yexp > 53)) ? inty1 : inty;
-  }
-
-  expv *= (inty == 1) && !xpos ? -1.0 : 1.0;
-
-  __CLC_LONGN ret = __CLC_AS_LONGN(expv);
-
-  // Now all the edge cases
-  __CLC_BIT_INTN x_is_ninf = ux == (__CLC_LONGN)NINFBITPATT_DP64;
-  __CLC_BIT_INTN x_is_pinf = ux == (__CLC_LONGN)PINFBITPATT_DP64;
-  __CLC_BIT_INTN y_is_ninf = uy == (__CLC_LONGN)NINFBITPATT_DP64;
-  __CLC_BIT_INTN y_is_pinf = uy == (__CLC_LONGN)PINFBITPATT_DP64;
-  ret = !xpos && (inty == 0) ? QNANBITPATT_DP64 : ret;
-  ret = ax < 0x3ff0000000000000L && y_is_ninf ? PINFBITPATT_DP64 : ret;
-  ret = ax > 0x3ff0000000000000L && y_is_ninf ? 0L : ret;
-  ret = ax < 0x3ff0000000000000L && y_is_pinf ? 0L : ret;
-  ret = ax > 0x3ff0000000000000L && y_is_pinf ? PINFBITPATT_DP64 : ret;
-  __CLC_LONGN xinf =
-      xpos ? (__CLC_LONGN)PINFBITPATT_DP64 : (__CLC_LONGN)NINFBITPATT_DP64;
-  ret = ((ax == 0L) && !ypos && (inty == 1)) ? xinf : ret;
-  ret = ((ax == 0L) && !ypos && (inty != 1)) ? PINFBITPATT_DP64 : ret;
-  __CLC_LONGN xzero = xpos ? (__CLC_LONGN)0L : (__CLC_LONGN)0x8000000000000000L;
-  ret = ((ax == 0L) && ypos && (inty == 1)) ? xzero : ret;
-  ret = ((ax == 0L) && ypos && (inty != 1)) ? 0L : ret;
-  ret = ((ax == 0L) && y_is_ninf) ? PINFBITPATT_DP64 : ret;
-  ret = ((ux == (__CLC_LONGN)0xbff0000000000000L) && (ay == PINFBITPATT_DP64))
-            ? 0x3ff0000000000000L
-            : ret;
-  ret = (x_is_ninf && !ypos && (inty == 1)) ? (__CLC_LONGN)0x8000000000000000L
-                                            : ret;
-  ret = (x_is_ninf && !ypos && (inty != 1)) ? 0L : ret;
-  ret =
-      (x_is_ninf && ypos && (inty == 1)) ? (__CLC_LONGN)NINFBITPATT_DP64 : ret;
-  ret =
-      (x_is_ninf && ypos && (inty != 1)) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
-  ret = x_is_pinf && !ypos ? 0L : ret;
-  ret = x_is_pinf && ypos ? PINFBITPATT_DP64 : ret;
-  ret = ax > PINFBITPATT_DP64 ? ux : ret;
-  ret = ay > PINFBITPATT_DP64 ? uy : ret;
-  ret = ay == 0L ? 0x3ff0000000000000L : ret;
-  ret = ux == 0x3ff0000000000000L ? 0x3ff0000000000000L : ret;
-
-  return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
-                                               __CLC_GENTYPE y) {
-  return __CLC_CONVERT_GENTYPE(
-      __clc_pow(__CLC_CONVERT_FLOATN(x), __CLC_CONVERT_FLOATN(y)));
-}
-
-#endif
diff --git a/libclc/clc/lib/generic/math/clc_pow_base.inc b/libclc/clc/lib/generic/math/clc_pow_base.inc
new file mode 100644
index 0000000000000..56fdb177a844b
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_pow_base.inc
@@ -0,0 +1,542 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Computes pow using log and exp
+//
+//   x^y = exp(y * log(x))
+//
+// We take care not to lose precision in the intermediate steps.
+//
+// When computing log, calculate it in splits:
+//
+//   r = f * (p_inv_head + p_inv_tail)
+//   r = rh + rt
+//
+// Calculate the log polynomial using r; in the final addition, do:
+//
+//   poly = poly + ((rh-r) + rt)
+//
+//   lth = -r
+//   ltt = ((xexp * log2_t) - poly) + logT
+//   lt = lth + ltt
+//
+//   lh = (xexp * log2_h) + logH
+//   l = lh + lt
+//
+// Calculate final log answer as gh and gt:
+//
+//   gh = l & higher-half bits
+//   gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
+//
+//   yh = y & higher-half bits
+//   yt = y - yh
+//
+// Before entering computation of exp:
+//
+//   vs = ((yt*gt + yt*gh) + yh*gt)
+//   v = vs + yh*gh
+//   vt = ((yh*gh - v) + vs)
+//
+// In calculation of exp, add vt to r that is used for poly.
+//
+// At the end of exp, do:
+//
+//   ((((expT * poly) + expT) + expH*poly) + expH)
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+#ifdef COMPILING_POW
+// True when __clc_trunc(ay) compares equal to ay (no fractional part).
+_CLC_OVERLOAD _CLC_CONST static bool is_integer(__CLC_GENTYPE ay) {
+  return __clc_trunc(ay) == ay;
+}
+// Reports whether 0.5 * ay is an integer; is_odd_integer pairs it with is_integer.
+_CLC_OVERLOAD _CLC_CONST static bool is_even_integer(__CLC_GENTYPE ay) {
+  // Even integers are still integers after division by 2.
+  return is_integer(__CLC_FP_LIT(0.5) * ay);
+}
+// True when ay passes is_integer and does not pass is_even_integer.
+_CLC_OVERLOAD _CLC_CONST static bool is_odd_integer(__CLC_GENTYPE ay) {
+  return is_integer(ay) && !is_even_integer(ay);
+}
+#endif
+
+#if __CLC_FPSIZE == 32
+// Computes |x|^y directly as exp2(y * log2(|x|)); backs the *_fast variants.
+_CLC_CONST
+static __CLC_GENTYPE fast_expylnx(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+  __CLC_GENTYPE ax = __clc_fabs(x);
+  return __clc_exp2(y * __clc_log2(ax));
+}
+
+#if defined(COMPILING_POW) || defined(COMPILING_POWR)
+// Computes |x|^y as exp(y * ln(|x|)) through the __clc_ep_* helper routines.
+_CLC_CONST
+static __CLC_GENTYPE compute_expylnx_float(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+  __CLC_GENTYPE ax = __clc_fabs(x);
+  return __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+}
+#endif
+
+#if defined(COMPILING_POW)
+// Applies pow's result sign and special cases to expylnx, the magnitude |x|^y.
+_CLC_CONST
+static __CLC_GENTYPE pow_fixup(__CLC_GENTYPE x, __CLC_GENTYPE y,
+                               __CLC_GENTYPE expylnx) {
+  __CLC_GENTYPE ax = __clc_fabs(x);
+  bool is_odd_y = is_odd_integer(y);
+  // Only an odd integer exponent lets the sign of x reach the result.
+  __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0f);
+
+  // Edge cases; each later check overrides the ones before it.
+  if (x < 0.0f && !is_integer(y))
+    ret = __CLC_GENTYPE_NAN;
+
+  __CLC_GENTYPE ay = __clc_fabs(y);
+  if (__clc_isinf(ay)) {
+    // FIXME: Missing backend optimization to save on
+    // materialization cost of mixed sign constant infinities.
+    bool y_is_neg_inf = y != ay;
+    ret = ax == 1.0f ? ax : ((ax < 1.0f) ^ y_is_neg_inf ? 0.0f : ay);
+  }
+
+  if (__clc_isinf(ax) || x == 0.0f)
+    ret = __clc_copysign((x == 0.0f) ^ (y < 0.0f) ? 0.0f : __CLC_GENTYPE_INF,
+                         is_odd_y ? x : 0.0f);
+
+  if (__clc_isunordered(x, y))
+    ret = __CLC_GENTYPE_NAN;
+
+  return ret;
+}
+// pow(x, y), precise path: forces y = 1 when x == 1 and x = 1 when y == 0.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
+                                                          __CLC_GENTYPE y) {
+  if (x == 1.0f)
+    y = 1.0f;
+  if (y == 0.0f)
+    x = 1.0f;
+
+  __CLC_GENTYPE expylnx = compute_expylnx_float(x, y);
+  return pow_fixup(x, y, expylnx);
+}
+// pow(x, y), fast path: same steps as __clc_pow but built on fast_expylnx.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_pow_fast(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+  if (x == 1.0f)
+    y = 1.0f;
+  if (y == 0.0f)
+    x = 1.0f;
+
+  __CLC_GENTYPE expylnx = fast_expylnx(x, y);
+  return pow_fixup(x, y, expylnx);
+}
+
+#elif defined(COMPILING_POWR)
+// Replaces expylnx with powr's results for zero, infinite and unordered inputs.
+_CLC_CONST
+static __CLC_GENTYPE powr_fixup(__CLC_GENTYPE x, __CLC_GENTYPE y,
+                                __CLC_GENTYPE expylnx) {
+  __CLC_GENTYPE ret = expylnx;
+
+  // Edge cases; iz is (y < 0 ? inf : 0) and zi is the opposite selection.
+  __CLC_GENTYPE iz = y < 0.0f ? __CLC_GENTYPE_INF : 0.0f;
+  __CLC_GENTYPE zi = y < 0.0f ? 0.0f : __CLC_GENTYPE_INF;
+
+  if (x == 0.0f)
+    ret = y == 0.0f ? __CLC_GENTYPE_NAN : iz;
+
+  if (x == __CLC_GENTYPE_INF && y != 0.0f)
+    ret = zi;
+
+  if (__clc_isinf(y) && x != 1.0f)
+    ret = x < 1.0f ? iz : zi;
+
+  if (__clc_isunordered(x, y))
+    ret = __CLC_GENTYPE_NAN;
+
+  return ret;
+}
+// powr(x, y), precise path: a negative x is replaced by NaN before evaluation.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
+                                                           __CLC_GENTYPE y) {
+  if (x < 0.0f)
+    x = __CLC_GENTYPE_NAN;
+
+  __CLC_GENTYPE expylnx = compute_expylnx_float(x, y);
+  return powr_fixup(x, y, expylnx);
+}
+// powr(x, y), fast path: same steps as __clc_powr but built on fast_expylnx.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_powr_fast(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+  if (x < 0.0f)
+    x = __CLC_GENTYPE_NAN;
+
+  __CLC_GENTYPE expylnx = fast_expylnx(x, y);
+  return powr_fixup(x, y, expylnx);
+}
+
+#elif defined(COMPILING_POWN)
+// Computes |x|^ny; ny is split at bit 16 and the two parts are summed as a pair.
+_CLC_CONST
+static __CLC_GENTYPE compute_expylnx_int(__CLC_GENTYPE x, __CLC_INTN ny) {
+  __CLC_GENTYPE ax = __clc_fabs(x);
+  __CLC_INTN nyh = ny & 0xffff0000; // upper 16 bits of ny; ny - nyh is the rest
+  __CLC_EP_PAIR y = __clc_ep_fast_add(__CLC_CONVERT_GENTYPE(nyh),
+                                      __CLC_CONVERT_GENTYPE(ny - nyh));
+  return __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+}
+// Applies pown's result sign and its zero/infinity special cases to expylnx.
+_CLC_CONST
+static __CLC_GENTYPE pown_fixup(__CLC_GENTYPE x, __CLC_INTN ny,
+                                __CLC_GENTYPE expylnx) {
+  bool is_odd_y = ny & 1;
+
+  __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0f);
+
+  // Zero or infinite x: result is 0 or inf, signed like x only for odd ny.
+  if (__clc_isinf(x) || x == 0.0f)
+    ret = __clc_copysign((x == 0.0f) ^ (ny < 0) ? 0.0f : __CLC_GENTYPE_INF,
+                         is_odd_y ? x : 0.0f);
+  return ret;
+}
+// pown(x, ny), precise path: x is forced to 1 when ny == 0.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
+                                                           __CLC_INTN ny) {
+  if (ny == 0)
+    x = 1.0f;
+
+  __CLC_GENTYPE expylnx = compute_expylnx_int(x, ny);
+  return pown_fixup(x, ny, expylnx);
+}
+// pown(x, ny), fast path: converts ny to the float type and uses fast_expylnx.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown_fast(__CLC_GENTYPE x,
+                                                                __CLC_INTN ny) {
+  if (ny == 0)
+    x = 1.0f;
+
+  __CLC_GENTYPE expylnx = fast_expylnx(x, __CLC_CONVERT_GENTYPE(ny));
+  return pown_fixup(x, ny, expylnx);
+}
+
+#elif defined(COMPILING_ROOTN)
+
+// Root counterpart of compute_expylnx_int: |x|^(1/ny) using __clc_ep_recip.
+_CLC_CONST
+static __CLC_GENTYPE compute_exp_inverse_y_lnx_int(__CLC_GENTYPE x,
+                                                   __CLC_INTN ny) {
+  __CLC_GENTYPE ax = __clc_fabs(x);
+  __CLC_INTN nyh = ny & 0xffff0000; // upper 16 bits of ny; ny - nyh is the rest
+  __CLC_EP_PAIR y = __clc_ep_fast_add(__CLC_CONVERT_GENTYPE(nyh),
+                                      __CLC_CONVERT_GENTYPE(ny - nyh));
+  y = __clc_ep_recip(y);
+  return __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+}
+// Applies rootn's result sign and its zero/infinity/NaN special cases.
+_CLC_CONST
+static __CLC_GENTYPE rootn_fixup(__CLC_GENTYPE x, __CLC_INTN ny,
+                                 __CLC_GENTYPE expylnx) {
+  bool is_odd_y = ny & 1;
+
+  __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0f);
+
+  // Zero or infinite x: result is 0 or inf, signed like x only for odd ny.
+  if (__clc_isinf(x) || x == 0.0f)
+    ret = __clc_copysign((x == 0.0f) ^ (ny < 0) ? 0.0f : __CLC_GENTYPE_INF,
+                         is_odd_y ? x : 0.0f);
+
+  if ((x < 0.0f && !is_odd_y) || ny == 0)
+    ret = __CLC_GENTYPE_NAN;
+
+  return ret;
+}
+
+// rootn(x, ny), precise path: |x|^(1/ny) magnitude, then rootn_fixup.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
+                                                            __CLC_INTN ny) {
+  __CLC_GENTYPE expylnx = compute_exp_inverse_y_lnx_int(x, ny);
+  return rootn_fixup(x, ny, expylnx);
+}
+// rootn(x, ny), fast path: uses __clc_recip_fast(ny) as the exponent.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_rootn_fast(__CLC_GENTYPE x, __CLC_INTN ny) {
+  __CLC_GENTYPE y = __clc_recip_fast(__CLC_CONVERT_GENTYPE(ny));
+  __CLC_GENTYPE expylnx = fast_expylnx(x, y);
+  return rootn_fixup(x, ny, expylnx);
+}
+
+#else
+#error missing function macro
+#endif
+
+#elif __CLC_FPSIZE == 64
+
+#if defined(COMPILING_POW)
+// pow(x, y) for double: __clc_ep_* magnitude followed by inline edge cases.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
+                                                          __CLC_GENTYPE y) {
+  if (x == 1.0)
+    y = 1.0;
+  if (y == 0.0)
+    x = 1.0;
+
+  __CLC_GENTYPE ax = __clc_fabs(x);
+  __CLC_GENTYPE expylnx =
+      __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+
+  bool is_odd_y = is_odd_integer(y);
+  // Only an odd integer exponent lets the sign of x reach the result.
+  __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0);
+
+  // Edge cases; each later check overrides the ones before it.
+  if (x < 0.0 && !is_integer(y))
+    ret = __CLC_GENTYPE_NAN;
+
+  __CLC_GENTYPE ay = __clc_fabs(y);
+  if (__clc_isinf(ay)) {
+    // FIXME: Missing backend optimization to save on
+    // materialization cost of mixed sign constant infinities.
+    bool y_is_neg_inf = y != ay;
+    ret = ax == 1.0 ? ax : ((ax < 1.0) ^ y_is_neg_inf ? 0.0 : ay);
+  }
+
+  if (__clc_isinf(ax) || x == 0.0)
+    ret = __clc_copysign((x == 0.0) ^ (y < 0.0) ? 0.0 : __CLC_GENTYPE_INF,
+                         is_odd_y ? x : 0.0);
+
+  if (__clc_isunordered(x, y))
+    ret = __CLC_GENTYPE_NAN;
+
+  return ret;
+}
+
+#elif defined(COMPILING_POWR)
+// powr(x, y) for double: negative x becomes NaN, then inline edge cases.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
+                                                           __CLC_GENTYPE y) {
+  if (x < 0.0)
+    x = __CLC_GENTYPE_NAN;
+
+  __CLC_GENTYPE ret = __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(x)));
+
+  // Edge cases; iz is (y < 0 ? inf : 0) and zi is the opposite selection.
+  __CLC_GENTYPE iz = y < 0.0 ? __CLC_GENTYPE_INF : 0.0;
+  __CLC_GENTYPE zi = y < 0.0 ? 0.0 : __CLC_GENTYPE_INF;
+
+  if (x == 0.0)
+    ret = y == 0.0 ? __CLC_GENTYPE_NAN : iz;
+
+  if (x == __CLC_GENTYPE_INF && y != 0.0)
+    ret = zi;
+
+  if (__clc_isinf(y) && x != 1.0)
+    ret = x < 1.0 ? iz : zi;
+
+  if (y == 0.0)
+    ret = x == 0.0 || __clc_isinf(x) ? __CLC_GENTYPE_NAN : 1.0;
+
+  if (__clc_isunordered(x, y))
+    ret = __CLC_GENTYPE_NAN;
+
+  return ret;
+}
+
+#elif defined(COMPILING_POWN)
+// pown(x, ny) for double: ny is converted to double; x is forced to 1 if ny == 0.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
+                                                           __CLC_INTN ny) {
+  if (ny == 0)
+    x = 1.0;
+
+  __CLC_GENTYPE y = __CLC_CONVERT_GENTYPE(ny);
+
+  __CLC_GENTYPE ax = __clc_fabs(x);
+  __CLC_GENTYPE expylnx =
+      __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+
+  bool is_odd_y = ny & 1;
+
+  __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0);
+
+  // Zero or infinite x: result is 0 or inf, signed like x only for odd ny.
+  if (__clc_isinf(ax) || x == 0.0)
+    ret = __clc_copysign((x == 0.0) ^ (ny < 0) ? 0.0 : __CLC_GENTYPE_INF,
+                         is_odd_y ? x : 0.0);
+
+  return ret;
+}
+
+#elif defined(COMPILING_ROOTN)
+// rootn(x, ny) for double: exponent is __clc_ep_recip of ny converted to double.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
+                                                            __CLC_INTN ny) {
+  __CLC_EP_PAIR y = __clc_ep_recip(__CLC_CONVERT_GENTYPE(ny));
+
+  __CLC_GENTYPE ax = __clc_fabs(x);
+  __CLC_GENTYPE expylnx =
+      __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+
+  bool is_odd_y = ny & 1;
+
+  __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0);
+
+  // Zero or infinite x: result is 0 or inf, signed like x only for odd ny.
+  if (__clc_isinf(ax) || x == 0.0)
+    ret = __clc_copysign((x == 0.0) ^ (ny < 0) ? 0.0 : __CLC_GENTYPE_INF,
+                         is_odd_y ? x : 0.0);
+
+  if ((x < 0.0 && !is_odd_y) || ny == 0)
+    ret = __CLC_GENTYPE_NAN;
+
+  return ret;
+}
+
+#else
+#error missing function macro
+#endif
+
+#elif __CLC_FPSIZE == 16
+
+#if defined(COMPILING_POW) || defined(COMPILING_POWR)
+// Computes ax^y for half by evaluating exp2(y * log2(ax)) in float.
+_CLC_CONST
+static __CLC_GENTYPE compute_expylnx_f16(__CLC_GENTYPE ax, __CLC_GENTYPE y) {
+  __CLC_FLOATN x_float = __CLC_CONVERT_FLOATN(ax);
+  __CLC_FLOATN y_float = __CLC_CONVERT_FLOATN(y);
+  __CLC_FLOATN result = __clc_exp2_fast(y_float * __clc_log2_fast(x_float));
+  return __CLC_CONVERT_GENTYPE(result);
+}
+
+#endif // defined(COMPILING_POW) || defined(COMPILING_POWR)
+
+#if defined(COMPILING_POW)
+// pow(x, y) for half: float-based magnitude followed by inline edge cases.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
+                                                          __CLC_GENTYPE y) {
+  if (x == 1.0h)
+    y = 1.0h;
+  if (y == 0.0h)
+    x = 1.0h;
+
+  __CLC_GENTYPE ax = __clc_fabs(x);
+  __CLC_GENTYPE p = compute_expylnx_f16(ax, y);
+
+  bool is_odd_y = is_odd_integer(y);
+  __CLC_GENTYPE ret = __clc_copysign(p, is_odd_y ? x : 1.0h);
+
+  // Edge cases; each later check overrides the ones before it.
+  if (x < 0.0h && !is_integer(y))
+    ret = __CLC_GENTYPE_NAN;
+
+  __CLC_GENTYPE ay = __clc_fabs(y);
+  if (__clc_isinf(ay)) {
+    // FIXME: Missing backend optimization to save on
+    // materialization cost of mixed sign constant infinities.
+    bool y_is_neg_inf = y != ay;
+    ret = ax == 1.0h ? ax : ((ax < 1.0h) ^ y_is_neg_inf ? 0.0h : ay);
+  }
+
+  if (__clc_isinf(ax) || x == 0.0h) {
+    ret = __clc_copysign((x == 0.0h) ^ (y < 0.0h) ? 0.0h : __CLC_GENTYPE_INF,
+                         is_odd_y ? x : 0.0h);
+  }
+
+  if (__clc_isunordered(x, y))
+    ret = __CLC_GENTYPE_NAN;
+
+  return ret;
+}
+
+#elif defined(COMPILING_POWR)
+// powr(x, y) for half: negative x becomes NaN, then inline edge cases.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
+                                                           __CLC_GENTYPE y) {
+  if (x < 0.0h)
+    x = __CLC_GENTYPE_NAN;
+
+  __CLC_GENTYPE ret = compute_expylnx_f16(x, y);
+
+  // Edge cases; iz is (y < 0 ? inf : 0) and zi is the opposite selection.
+  __CLC_GENTYPE iz = y < 0.0h ? __CLC_GENTYPE_INF : 0.0h;
+  __CLC_GENTYPE zi = y < 0.0h ? 0.0h : __CLC_GENTYPE_INF;
+
+  if (x == 0.0h)
+    ret = y == 0.0h ? __CLC_GENTYPE_NAN : iz;
+
+  if (x == __CLC_GENTYPE_INF && y != 0.0h)
+    ret = zi;
+
+  if (__clc_isinf(y) && x != 1.0h)
+    ret = x < 1.0h ? iz : zi;
+
+  if (__clc_isunordered(x, y))
+    ret = __CLC_GENTYPE_NAN;
+
+  return ret;
+}
+
+#elif defined(COMPILING_POWN)
+// pown(x, ny) for half: magnitude is exp2(ny * log2(|x|)) evaluated in float.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
+                                                           __CLC_INTN ny) {
+  if (ny == 0)
+    x = 1.0h;
+
+  __CLC_GENTYPE ax = __clc_fabs(x);
+  __CLC_FLOATN fy = __CLC_CONVERT_FLOATN(ny);
+  __CLC_FLOATN p =
+      __clc_exp2_fast(fy * __clc_log2_fast(__CLC_CONVERT_FLOATN(ax)));
+
+  bool is_odd_y = ny & 1;
+
+  __CLC_GENTYPE ret =
+      __clc_copysign(__CLC_CONVERT_GENTYPE(p), is_odd_y ? x : 1.0h);
+
+  // Zero or infinite x: result is 0 or inf, signed like x only for odd ny.
+  if (__clc_isinf(ax) || x == 0.0h)
+    ret = __clc_copysign((x == 0.0h) ^ (ny < 0) ? 0.0h : __CLC_GENTYPE_INF,
+                         is_odd_y ? x : 0.0h);
+
+  return ret;
+}
+
+#elif defined(COMPILING_ROOTN)
+// rootn(x, ny) for half: exp2(log2(|x|) * __clc_recip_fast(ny)) in float.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
+                                                            __CLC_INTN ny) {
+  __CLC_GENTYPE ax = __clc_fabs(x);
+
+  __CLC_FLOATN fy = __clc_recip_fast(__CLC_CONVERT_FLOATN(ny));
+
+  __CLC_FLOATN p =
+      __clc_exp2_fast(fy * __clc_log2_fast(__CLC_CONVERT_FLOATN(ax)));
+
+  bool is_odd_y = ny & 1;
+
+  __CLC_GENTYPE ret =
+      __clc_copysign(__CLC_CONVERT_GENTYPE(p), is_odd_y ? x : 1.0h);
+
+  // Zero or infinite x: result is 0 or inf, signed like x only for odd ny.
+  if (__clc_isinf(ax) || x == 0.0h)
+    ret = __clc_copysign((x == 0.0h) ^ (ny < 0) ? 0.0h : __CLC_GENTYPE_INF,
+                         is_odd_y ? x : 0.0h);
+
+  if ((x < 0.0h && !is_odd_y) || ny == 0)
+    ret = __CLC_GENTYPE_NAN;
+
+  return ret;
+}
+
+#else
+#error missing function macro
+#endif
+
+#endif
+#endif // __CLC_SCALAR
diff --git a/libclc/clc/lib/generic/math/clc_pown.cl b/libclc/clc/lib/generic/math/clc_pown.cl
index 5aa9560174b99..c21738de8cf2f 100644
--- a/libclc/clc/lib/generic/math/clc_pown.cl
+++ b/libclc/clc/lib/generic/math/clc_pown.cl
@@ -6,16 +6,34 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/clc_convert.h>
-#include <clc/internal/clc.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
-#include <clc/relational/clc_select.h>
+#include "clc/clc_convert.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_pown.h"
+#include "clc/math/clc_trunc.h"
+#include "clc/relational/clc_isinf.h"
 
-#define __CLC_BODY <clc_pown.inc>
-#include <clc/math/gentype.inc>
+#define COMPILING_POWN // selects the COMPILING_POWN branches of clc_pow_base.inc
+#define __CLC_BODY "clc_pow_base.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+// Second gentype pass for __clc_pown (presumably the vector overloads; confirm).
+#define __CLC_FUNCTION __clc_pown
+#define __CLC_BODY                                                             \
+  "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+// Same pass for __clc_pown_fast; __CLC_FLOAT_ONLY presumably limits it to float.
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_pown_fast
+#define __CLC_BODY                                                             \
+  "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
diff --git a/libclc/clc/lib/generic/math/clc_pown.inc b/libclc/clc/lib/generic/math/clc_pown.inc
deleted file mode 100644
index 1a681b5e4b397..0000000000000
--- a/libclc/clc/lib/generic/math/clc_pown.inc
+++ /dev/null
@@ -1,402 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Computes pow using log and exp
-//
-//   x^y = exp(y * log(x))
-//
-// We take care not to lose precision in the intermediate steps.
-//
-// When computing log, calculate it in splits:
-//
-//   r = f * (p_invead + p_inv_tail)
-//   r = rh + rt
-//
-// Calculate log polynomial using r, in end addition, do:
-//
-//   poly = poly + ((rh-r) + rt)
-//
-//   lth = -r
-//   ltt = ((xexp * log2_t) - poly) + logT
-//   lt = lth + ltt
-//
-//   lh = (xexp * log2_h) + logH
-//   l = lh + lt
-//
-// Calculate final log answer as gh and gt:
-//
-//   gh = l & higher-half bits
-//   gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
-//
-//   yh = y & higher-half bits
-//   yt = y - yh
-//
-// Before entering computation of exp:
-//
-//   vs = ((yt*gt + yt*gh) + yh*gt)
-//   v = vs + yh*gh
-//   vt = ((yh*gh - v) + vs)
-//
-// In calculation of exp, add vt to r that is used for poly.
-//
-// At the end of exp, do:
-//
-//   ((((expT * poly) + expT) + expH*poly) + expH)
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
-                                                __CLC_INTN ny) {
-  __CLC_GENTYPE y = __CLC_CONVERT_GENTYPE(ny);
-
-  __CLC_GENTYPE absx = __clc_fabs(x);
-  __CLC_INTN ix = __CLC_AS_INTN(x);
-  __CLC_INTN ax = __CLC_AS_INTN(absx);
-  __CLC_INTN xpos = ix == ax;
-
-  __CLC_INTN iy = __CLC_AS_INTN(y);
-  __CLC_INTN ay = __CLC_AS_INTN(__clc_fabs(y));
-  __CLC_INTN ypos = iy == ay;
-
-  // Extra precise log calculation
-  // First handle case that x is close to 1
-  __CLC_GENTYPE r = 1.0f - absx;
-  __CLC_INTN near1 = __clc_fabs(r) < 0x1.0p-4f;
-  __CLC_GENTYPE r2 = r * r;
-
-  // Coefficients are just 1/3, 1/4, 1/5 and 1/6
-  __CLC_GENTYPE poly = __clc_mad(
-      r,
-      __clc_mad(r,
-                __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
-                          0x1.99999ap-3f),
-                0x1.000000p-2f),
-      0x1.555556p-2f);
-
-  poly *= r2 * r;
-
-  __CLC_GENTYPE lth_near1 = -r2 * 0.5f;
-  __CLC_GENTYPE ltt_near1 = -poly;
-  __CLC_GENTYPE lt_near1 = lth_near1 + ltt_near1;
-  __CLC_GENTYPE lh_near1 = -r;
-  __CLC_GENTYPE l_near1 = lh_near1 + lt_near1;
-
-  // Computations for x not near 1
-  __CLC_INTN m = __CLC_CONVERT_INTN(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
-  __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m);
-  __CLC_INTN ixs = __CLC_AS_INTN(__CLC_AS_GENTYPE(ax | 0x3f800000) - 1.0f);
-  __CLC_GENTYPE mfs = __CLC_CONVERT_GENTYPE((ixs >> EXPSHIFTBITS_SP32) - 253);
-  __CLC_INTN c = m == -127;
-  __CLC_INTN ixn = c ? ixs : ax;
-  __CLC_GENTYPE mfn = c ? mfs : mf;
-
-  __CLC_INTN indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
-  // F - Y
-  __CLC_GENTYPE f = __CLC_AS_GENTYPE(0x3f000000 | indx) -
-                    __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32));
-
-  indx = indx >> 16;
-  __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx);
-  __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx);
-  r = rh + rt;
-
-  poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
-         (r * r);
-  poly += (rh - r) + rt;
-
-  const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f;  // 0.693115234
-  const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
-  __CLC_GENTYPE logel = __CLC_USE_TABLE(loge_tbl_lo, indx);
-  __CLC_GENTYPE logeh = __CLC_USE_TABLE(loge_tbl_hi, indx);
-  __CLC_GENTYPE lth = -r;
-  __CLC_GENTYPE ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + logeh;
-  __CLC_GENTYPE lt = lth + ltt;
-  __CLC_GENTYPE lh = __clc_mad(mfn, LOG2_HEAD, logel);
-  __CLC_GENTYPE l = lh + lt;
-
-  // Select near 1 or not
-  lth = near1 ? lth_near1 : lth;
-  ltt = near1 ? ltt_near1 : ltt;
-  lt = near1 ? lt_near1 : lt;
-  lh = near1 ? lh_near1 : lh;
-  l = near1 ? l_near1 : l;
-
-  __CLC_GENTYPE gh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(l) & 0xfffff000);
-  __CLC_GENTYPE gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
-  __CLC_GENTYPE yh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(iy) & 0xfffff000);
-
-  __CLC_GENTYPE yt = __CLC_CONVERT_GENTYPE(ny - __CLC_CONVERT_INTN(yh));
-
-  __CLC_GENTYPE ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
-  __CLC_GENTYPE ylogx = __clc_mad(yh, gh, ylogx_s);
-  __CLC_GENTYPE ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
-
-  // Extra precise exp of ylogx
-  // 64/log2 : 92.332482616893657
-  const __CLC_GENTYPE R_64_BY_LOG2 = 0x1.715476p+6f;
-  __CLC_INTN n = __CLC_CONVERT_INTN(ylogx * R_64_BY_LOG2);
-  __CLC_GENTYPE nf = __CLC_CONVERT_GENTYPE(n);
-
-  __CLC_INTN j = n & 0x3f;
-  m = n >> 6;
-  __CLC_INTN m2 = m << EXPSHIFTBITS_SP32;
-
-  // log2/64 lead: 0.0108032227
-  const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f;
-  // log2/64 tail: 0.0000272020388
-  const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
-  r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
-      ylogx_t;
-
-  // Truncated Taylor series for e^r
-  poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
-                             0x1.000000p-1f),
-                   r * r, r);
-
-  __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j);
-  __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j);
-
-  __CLC_GENTYPE expylogx =
-      __clc_mad(exp_head, poly, __clc_mad(exp_tail, poly, exp_tail)) + exp_head;
-  __CLC_GENTYPE sexpylogx =
-      expylogx * __CLC_AS_GENTYPE((__CLC_INTN)0x1 << (m + 149));
-  __CLC_GENTYPE texpylogx = __CLC_AS_GENTYPE(__CLC_AS_INTN(expylogx) + m2);
-  expylogx = m < -125 ? sexpylogx : texpylogx;
-
-  // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
-  expylogx =
-      __clc_select(expylogx, __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32),
-                   ylogx > 0x1.62e430p+6f ||
-                       (ylogx == 0x1.62e430p+6f && ylogx_t > -0x1.05c610p-22f));
-
-  // Result is 0 if ylogx < -149*log2
-  expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
-  // Classify y:
-  //   inty = 0 means not an integer.
-  //   inty = 1 means odd integer.
-  //   inty = 2 means even integer.
-
-  __CLC_INTN inty = 2 - (ny & 1);
-
-  __CLC_GENTYPE signval =
-      __CLC_AS_GENTYPE((__CLC_AS_UINTN(expylogx) ^ SIGNBIT_SP32));
-  expylogx = ((inty == 1) && !xpos) ? signval : expylogx;
-  __CLC_INTN ret = __CLC_AS_INTN(expylogx);
-
-  // Corner case handling
-  __CLC_BIT_INTN x_is_ninf = ix == (__CLC_INTN)NINFBITPATT_SP32;
-
-  __CLC_INTN xinf =
-      xpos ? (__CLC_INTN)PINFBITPATT_SP32 : (__CLC_INTN)NINFBITPATT_SP32;
-  ret = ((ax == 0) && !ypos && (inty == 1)) ? xinf : ret;
-  ret = ((ax == 0) && !ypos && (inty == 2)) ? PINFBITPATT_SP32 : ret;
-  ret = ((ax == 0) && ypos && (inty == 2)) ? 0 : ret;
-  __CLC_INTN xzero = !xpos ? (__CLC_INTN)0x80000000 : (__CLC_INTN)0;
-  ret = ((ax == 0) && ypos && (inty == 1)) ? xzero : ret;
-  ret = (x_is_ninf && !ypos && (inty == 1)) ? (__CLC_INTN)0x80000000 : ret;
-  ret = (x_is_ninf && !ypos && (inty != 1)) ? 0 : ret;
-  ret = (x_is_ninf && ypos && (inty == 1)) ? (__CLC_INTN)NINFBITPATT_SP32 : ret;
-  ret = (x_is_ninf && ypos && (inty != 1)) ? (__CLC_INTN)PINFBITPATT_SP32 : ret;
-  ret = ((ix == PINFBITPATT_SP32) && !ypos) ? 0 : ret;
-  ret = ((ix == PINFBITPATT_SP32) && ypos) ? (__CLC_INTN)PINFBITPATT_SP32 : ret;
-  ret = ax > PINFBITPATT_SP32 ? ix : ret;
-  ret = ny == 0 ? 0x3f800000 : ret;
-
-  return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
-                                                __CLC_INTN ny) {
-  const __CLC_GENTYPE real_log2_tail = 5.76999904754328540596e-08;
-  const __CLC_GENTYPE real_log2_lead = 6.93147122859954833984e-01;
-
-  __CLC_GENTYPE y = __CLC_CONVERT_GENTYPE(ny);
-
-  __CLC_LONGN ux = __CLC_AS_LONGN(x);
-  __CLC_LONGN ax = __CLC_AS_LONGN(__clc_fabs(x));
-  __CLC_BIT_INTN xpos = ax == ux;
-
-  __CLC_LONGN uy = __CLC_AS_LONGN(y);
-  __CLC_LONGN ay = __CLC_AS_LONGN(__clc_fabs(y));
-  __CLC_BIT_INTN ypos = ay == uy;
-
-  // Extended precision log
-  __CLC_GENTYPE v, vt;
-  {
-    __CLC_INTN exp = __CLC_CONVERT_INTN(ax >> 52) - 1023;
-    __CLC_INTN mask_exp_1023 = exp == -1023;
-    __CLC_GENTYPE xexp = __CLC_CONVERT_GENTYPE(exp);
-    __CLC_LONGN mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
-    __CLC_LONGN temp_ux =
-        __CLC_AS_LONGN(__CLC_AS_GENTYPE(0x3ff0000000000000L | mantissa) - 1.0);
-    exp = __CLC_CONVERT_INTN((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
-    __CLC_GENTYPE xexp1 = __CLC_CONVERT_GENTYPE(exp);
-    __CLC_LONGN mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
-    xexp = __CLC_CONVERT_LONGN(mask_exp_1023) ? xexp1 : xexp;
-    mantissa = __CLC_CONVERT_LONGN(mask_exp_1023) ? mantissa1 : mantissa;
-
-    __CLC_LONGN rax = (mantissa & 0x000ff00000000000) +
-                      ((mantissa & 0x0000080000000000) << 1);
-    __CLC_INTN index = __CLC_CONVERT_INTN(rax >> 44);
-
-    __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L);
-    __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L);
-    __CLC_GENTYPE f = F - Y;
-    __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index);
-    __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index);
-    __CLC_GENTYPE f_inv = (log_h + log_t) * f;
-    __CLC_GENTYPE r1 =
-        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L);
-    __CLC_GENTYPE r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
-    __CLC_GENTYPE r = r1 + r2;
-
-    __CLC_GENTYPE poly = __clc_fma(
-        r,
-        __clc_fma(r,
-                  __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
-                  1.0 / 4.0),
-        1.0 / 3.0);
-    poly = poly * r * r * r;
-
-    __CLC_GENTYPE hr1r1 = 0.5 * r1 * r1;
-    __CLC_GENTYPE poly0h = r1 + hr1r1;
-    __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1;
-    poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
-
-    log_h = __CLC_USE_TABLE(powlog_tbl_head, index);
-    log_t = __CLC_USE_TABLE(powlog_tbl_tail, index);
-
-    __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
-    __CLC_GENTYPE resT = resT_t - poly0h;
-    __CLC_GENTYPE resH = __clc_fma(xexp, real_log2_lead, log_h);
-    __CLC_GENTYPE resT_h = poly0h;
-
-    __CLC_GENTYPE H = resT + resH;
-    __CLC_GENTYPE H_h =
-        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(H) & 0xfffffffff8000000L);
-    __CLC_GENTYPE T =
-        (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
-    H = H_h;
-
-    __CLC_GENTYPE y_head =
-        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(uy) & 0xfffffffff8000000L);
-    __CLC_GENTYPE y_tail = y - y_head;
-
-    __CLC_BIT_INTN mask_2_24 = ay > 0x4170000000000000; // 2^24
-    __CLC_INTN nyh = __CLC_CONVERT_INTN(y_head);
-    __CLC_INTN nyt = ny - nyh;
-    __CLC_GENTYPE y_tail1 = __CLC_CONVERT_GENTYPE(nyt);
-    y_tail = mask_2_24 ? y_tail1 : y_tail;
-
-    __CLC_GENTYPE temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
-    v = __clc_fma(y_head, H, temp);
-    vt = __clc_fma(y_head, H, -v) + temp;
-  }
-
-  // Now calculate exp of (v,vt)
-
-  __CLC_GENTYPE expv;
-  {
-    const __CLC_GENTYPE max_exp_arg = 709.782712893384;
-    const __CLC_GENTYPE min_exp_arg = -745.1332191019411;
-    const __CLC_GENTYPE sixtyfour_by_lnof2 = 92.33248261689366;
-    const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081;
-    const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10;
-
-    // If v is so large that we need to return INFINITY, or so small that we
-    // need to return 0, set v to known values that will produce that result. Do
-    // not try to continue the computation with the original v and patch it up
-    // afterwards because v may be so large that temp is out of range of int, in
-    // which case that conversion, and a value based on that conversion being
-    // passed to __clc_ldexp, results in undefined behavior.
-    v = v > max_exp_arg ? 1000.0 : v;
-    v = v < min_exp_arg ? -1000.0 : v;
-
-    __CLC_GENTYPE temp = v * sixtyfour_by_lnof2;
-    __CLC_INTN n = __CLC_CONVERT_INTN(temp);
-    __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n);
-    __CLC_INTN j = n & 0x0000003f;
-    __CLC_INTN m = n >> 6;
-
-    __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j);
-    __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j);
-    __CLC_GENTYPE f = f1 + f2;
-
-    __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v);
-    __CLC_GENTYPE r2 = dn * lnof2_by_64_tail;
-    __CLC_GENTYPE r = (r1 + r2) + vt;
-
-    __CLC_GENTYPE q =
-        __clc_fma(r,
-                  __clc_fma(r,
-                            __clc_fma(r,
-                                      __clc_fma(r, 1.38889490863777199667e-03,
-                                                8.33336798434219616221e-03),
-                                      4.16666666662260795726e-02),
-                            1.66666666665260878863e-01),
-                  5.00000000000000008883e-01);
-    q = __clc_fma(r * r, q, r);
-
-    expv = __clc_fma(f, q, f2) + f1;
-    expv = __clc_ldexp(expv, m);
-  }
-
-  // See whether y is an integer.
-  // inty = 0 means not an integer.
-  // inty = 1 means odd integer.
-  // inty = 2 means even integer.
-
-  __CLC_LONGN inty = __CLC_CONVERT_LONGN(2 - (ny & 1));
-
-  expv *= ((inty == 1) && !xpos) ? -1.0 : 1.0;
-
-  __CLC_LONGN ret = __CLC_AS_LONGN(expv);
-
-  // Now all the edge cases
-  __CLC_BIT_INTN x_is_ninf = ux == (__CLC_LONGN)NINFBITPATT_DP64;
-  __CLC_BIT_INTN x_is_pinf = ux == (__CLC_LONGN)PINFBITPATT_DP64;
-  __CLC_LONGN xinf =
-      xpos ? (__CLC_LONGN)PINFBITPATT_DP64 : (__CLC_LONGN)NINFBITPATT_DP64;
-
-  ret = ((ax == 0L) && !ypos && (inty == 1)) ? xinf : ret;
-  ret = ((ax == 0L) && !ypos && (inty == 2)) ? (__CLC_LONGN)PINFBITPATT_DP64
-                                             : ret;
-  ret = ((ax == 0L) && ypos && (inty == 2)) ? 0L : ret;
-  __CLC_LONGN xzero = !xpos ? (__CLC_LONGN)0x8000000000000000L : 0L;
-  ret = ((ax == 0L) && ypos && (inty == 1)) ? xzero : ret;
-  ret = (x_is_ninf && !ypos && (inty == 1)) ? (__CLC_LONGN)0x8000000000000000L
-                                            : ret;
-  ret = (x_is_ninf && !ypos && (inty != 1)) ? 0L : ret;
-  ret =
-      (x_is_ninf && ypos && (inty == 1)) ? (__CLC_LONGN)NINFBITPATT_DP64 : ret;
-  ret =
-      (x_is_ninf && ypos && (inty != 1)) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
-  ret = (x_is_pinf && !ypos) ? 0L : ret;
-  ret = (x_is_pinf && ypos) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
-  ret = ax > (__CLC_LONGN)PINFBITPATT_DP64 ? ux : ret;
-  ret = __CLC_CONVERT_LONGN(ny == 0) ? (__CLC_LONGN)0x3ff0000000000000L : ret;
-
-  return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x, __CLC_INTN y) {
-  return __CLC_CONVERT_GENTYPE(__clc_pown(__CLC_CONVERT_FLOATN(x), y));
-}
-
-#endif
diff --git a/libclc/clc/lib/generic/math/clc_powr.cl b/libclc/clc/lib/generic/math/clc_powr.cl
index 0556ec97d6f3c..287e1a31b5b2a 100644
--- a/libclc/clc/lib/generic/math/clc_powr.cl
+++ b/libclc/clc/lib/generic/math/clc_powr.cl
@@ -6,16 +6,36 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/clc_convert.h>
-#include <clc/internal/clc.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
-#include <clc/relational/clc_select.h>
+#include "clc/clc_convert.h"
+#include "clc/float/definitions.h"
+#include "clc/internal/clc.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_log.h"
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_recip_fast.h"
+#include "clc/math/clc_trunc.h"
+#include "clc/math/math.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_isunordered.h"
 
-#define __CLC_BODY <clc_powr.inc>
-#include <clc/math/gentype.inc>
+#define COMPILING_POWR
+#define __CLC_BODY <clc_pow_base.inc>
+#include "clc/math/gentype.inc"
+
+#define __CLC_FUNCTION __clc_powr
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_powr_fast
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
diff --git a/libclc/clc/lib/generic/math/clc_powr.inc b/libclc/clc/lib/generic/math/clc_powr.inc
deleted file mode 100644
index b94dbfdcbdeb7..0000000000000
--- a/libclc/clc/lib/generic/math/clc_powr.inc
+++ /dev/null
@@ -1,414 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Computes pow using log and exp
-//
-//   x^y = exp(y * log(x))
-//
-// We take care not to lose precision in the intermediate steps
-//
-// When computing log, calculate it in splits:
-//
-//   r = f * (p_invead + p_inv_tail)
-//   r = rh + rt
-//
-// Calculate log polynomial using r, in end addition, do:
-//
-//   poly = poly + ((rh-r) + rt)
-//
-//   lth = -r
-//   ltt = ((xexp * log2_t) - poly) + logT
-//   lt = lth + ltt
-//
-//   lh = (xexp * log2_h) + logH
-//   l = lh + lt
-//
-// Calculate final log answer as gh and gt:
-//
-//   gh = l & higher-half bits
-//   gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
-//
-//   yh = y & higher-half bits
-//   yt = y - yh
-//
-// Before entering computation of exp:
-//
-//   vs = ((yt*gt + yt*gh) + yh*gt)
-//   v = vs + yh*gh
-//   vt = ((yh*gh - v) + vs)
-//
-// In calculation of exp, add vt to r that is used for poly.
-//
-// At the end of exp, do
-//
-//   ((((expT * poly) + expT) + expH*poly) + expH)
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
-                                                __CLC_GENTYPE y) {
-  __CLC_GENTYPE absx = __clc_fabs(x);
-  __CLC_INTN ix = __CLC_AS_INTN(x);
-  __CLC_INTN ax = __CLC_AS_INTN(absx);
-  __CLC_INTN xpos = ix == ax;
-
-  __CLC_INTN iy = __CLC_AS_INTN(y);
-  __CLC_INTN ay = __CLC_AS_INTN(__clc_fabs(y));
-  __CLC_INTN ypos = iy == ay;
-
-  // Extra precise log calculation
-  // First handle case that x is close to 1
-  __CLC_GENTYPE r = 1.0f - absx;
-  __CLC_INTN near1 = __clc_fabs(r) < 0x1.0p-4f;
-  __CLC_GENTYPE r2 = r * r;
-
-  // Coefficients are just 1/3, 1/4, 1/5 and 1/6
-  __CLC_GENTYPE poly = __clc_mad(
-      r,
-      __clc_mad(r,
-                __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
-                          0x1.99999ap-3f),
-                0x1.000000p-2f),
-      0x1.555556p-2f);
-
-  poly *= r2 * r;
-
-  __CLC_GENTYPE lth_near1 = -r2 * 0.5f;
-  __CLC_GENTYPE ltt_near1 = -poly;
-  __CLC_GENTYPE lt_near1 = lth_near1 + ltt_near1;
-  __CLC_GENTYPE lh_near1 = -r;
-  __CLC_GENTYPE l_near1 = lh_near1 + lt_near1;
-
-  // Computations for x not near 1
-  __CLC_INTN m = __CLC_CONVERT_INTN(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
-  __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m);
-  __CLC_INTN ixs = __CLC_AS_INTN(__CLC_AS_GENTYPE(ax | 0x3f800000) - 1.0f);
-  __CLC_GENTYPE mfs = __CLC_CONVERT_GENTYPE((ixs >> EXPSHIFTBITS_SP32) - 253);
-  __CLC_INTN c = m == -127;
-  __CLC_INTN ixn = c ? ixs : ax;
-  __CLC_GENTYPE mfn = c ? mfs : mf;
-
-  __CLC_INTN indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
-  // F - Y
-  __CLC_GENTYPE f = __CLC_AS_GENTYPE(0x3f000000 | indx) -
-                    __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32));
-
-  indx = indx >> 16;
-  __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx);
-  __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx);
-  r = rh + rt;
-
-  poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
-         (r * r);
-  poly += (rh - r) + rt;
-
-  const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f;  // 0.693115234
-  const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
-  __CLC_GENTYPE logel = __CLC_USE_TABLE(loge_tbl_lo, indx);
-  __CLC_GENTYPE logeh = __CLC_USE_TABLE(loge_tbl_hi, indx);
-  __CLC_GENTYPE lth = -r;
-  __CLC_GENTYPE ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + logeh;
-  __CLC_GENTYPE lt = lth + ltt;
-  __CLC_GENTYPE lh = __clc_mad(mfn, LOG2_HEAD, logel);
-  __CLC_GENTYPE l = lh + lt;
-
-  // Select near 1 or not
-  lth = near1 ? lth_near1 : lth;
-  ltt = near1 ? ltt_near1 : ltt;
-  lt = near1 ? lt_near1 : lt;
-  lh = near1 ? lh_near1 : lh;
-  l = near1 ? l_near1 : l;
-
-  __CLC_GENTYPE gh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(l) & 0xfffff000);
-  __CLC_GENTYPE gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
-  __CLC_GENTYPE yh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(iy) & 0xfffff000);
-
-  __CLC_GENTYPE yt = y - yh;
-
-  __CLC_GENTYPE ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
-  __CLC_GENTYPE ylogx = __clc_mad(yh, gh, ylogx_s);
-  __CLC_GENTYPE ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
-
-  // Extra precise exp of ylogx
-  // 64/log2 : 92.332482616893657
-  const __CLC_GENTYPE R_64_BY_LOG2 = 0x1.715476p+6f;
-  __CLC_INTN n = __CLC_CONVERT_INTN(ylogx * R_64_BY_LOG2);
-  __CLC_GENTYPE nf = __CLC_CONVERT_GENTYPE(n);
-
-  __CLC_INTN j = n & 0x3f;
-  m = n >> 6;
-  __CLC_INTN m2 = m << EXPSHIFTBITS_SP32;
-  // log2/64 lead: 0.0108032227
-  const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f;
-  // log2/64 tail: 0.0000272020388
-  const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
-  r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
-      ylogx_t;
-
-  // Truncated Taylor series for e^r
-  poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
-                             0x1.000000p-1f),
-                   r * r, r);
-
-  __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j);
-  __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j);
-
-  __CLC_GENTYPE expylogx =
-      __clc_mad(exp_head, poly, __clc_mad(exp_tail, poly, exp_tail)) + exp_head;
-  __CLC_GENTYPE sexpylogx =
-      expylogx * __CLC_AS_GENTYPE((__CLC_INTN)0x1 << (m + 149));
-  __CLC_GENTYPE texpylogx = __CLC_AS_GENTYPE(__CLC_AS_INTN(expylogx) + m2);
-  expylogx = m < -125 ? sexpylogx : texpylogx;
-
-  // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
-  expylogx =
-      __clc_select(expylogx, __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32),
-                   (ylogx > 0x1.62e430p+6f) ||
-                       (ylogx == 0x1.62e430p+6f && ylogx_t > -0x1.05c610p-22f));
-
-  // Result is 0 if ylogx < -149*log2
-  expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
-  // Classify y:
-  //   inty = 0 means not an integer.
-  //   inty = 1 means odd integer.
-  //   inty = 2 means even integer.
-
-  __CLC_INTN yexp = (__CLC_INTN)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
-  __CLC_INTN mask = ((__CLC_INTN)1 << (24 - yexp)) - 1;
-  __CLC_INTN yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
-  __CLC_INTN inty = yodd ? 1 : 2;
-  inty = (iy & mask) != 0 ? 0 : inty;
-  inty = yexp < 1 ? 0 : inty;
-  inty = yexp > 24 ? 2 : inty;
-
-  __CLC_GENTYPE signval =
-      __CLC_AS_GENTYPE((__CLC_AS_UINTN(expylogx) ^ SIGNBIT_SP32));
-  expylogx = ((inty == 1) && !xpos) ? signval : expylogx;
-  __CLC_INTN ret = __CLC_AS_INTN(expylogx);
-
-  // Corner case handling
-  __CLC_BIT_INTN y_is_ninf = iy == (__CLC_INTN)NINFBITPATT_SP32;
-  __CLC_BIT_INTN y_is_pinf = iy == (__CLC_INTN)PINFBITPATT_SP32;
-  __CLC_BIT_INTN x_is_inf = ax == (__CLC_INTN)PINFBITPATT_SP32;
-
-  ret = ax < 0x3f800000 && y_is_ninf ? PINFBITPATT_SP32 : ret;
-  ret = ax < 0x3f800000 && y_is_pinf ? 0 : ret;
-  ret = ax == 0x3f800000 && ay < PINFBITPATT_SP32 ? 0x3f800000 : ret;
-  ret = ax == 0x3f800000 && ay == PINFBITPATT_SP32 ? QNANBITPATT_SP32 : ret;
-  ret = ax > 0x3f800000 && y_is_ninf ? 0 : ret;
-  ret = ax > 0x3f800000 && y_is_pinf ? PINFBITPATT_SP32 : ret;
-  ret = ((ix < PINFBITPATT_SP32) && (ay == 0)) ? 0x3f800000 : ret;
-  ret = (x_is_inf && !ypos) ? 0 : ret;
-  ret = (x_is_inf && ypos) ? PINFBITPATT_SP32 : ret;
-  ret = (x_is_inf && y_is_pinf) ? PINFBITPATT_SP32 : ret;
-  ret = (x_is_inf && (ay == 0)) ? QNANBITPATT_SP32 : ret;
-  ret = ((ax == 0) && !ypos) ? PINFBITPATT_SP32 : ret;
-  ret = ((ax == 0) && ypos) ? 0 : ret;
-  ret = ((ax == 0) && (ay == 0)) ? QNANBITPATT_SP32 : ret;
-  ret = ((ax != 0) && !xpos) ? QNANBITPATT_SP32 : ret;
-  ret = ax > PINFBITPATT_SP32 ? ix : ret;
-  ret = ay > PINFBITPATT_SP32 ? iy : ret;
-
-  return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
-                                                __CLC_GENTYPE y) {
-  const __CLC_GENTYPE real_log2_tail = 5.76999904754328540596e-08;
-  const __CLC_GENTYPE real_log2_lead = 6.93147122859954833984e-01;
-
-  __CLC_LONGN ux = __CLC_AS_LONGN(x);
-  __CLC_LONGN ax = __CLC_AS_LONGN(__clc_fabs(x));
-  __CLC_BIT_INTN xpos = ax == ux;
-
-  __CLC_LONGN uy = __CLC_AS_LONGN(y);
-  __CLC_LONGN ay = __CLC_AS_LONGN(__clc_fabs(y));
-  __CLC_BIT_INTN ypos = ay == uy;
-
-  // Extended precision log
-  __CLC_GENTYPE v, vt;
-  {
-    __CLC_INTN exp = __CLC_CONVERT_INTN(ax >> 52) - 1023;
-    __CLC_INTN mask_exp_1023 = exp == (__CLC_INTN)-1023;
-    __CLC_GENTYPE xexp = __CLC_CONVERT_GENTYPE(exp);
-    __CLC_LONGN mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
-    __CLC_LONGN temp_ux =
-        __CLC_AS_LONGN(__CLC_AS_GENTYPE(0x3ff0000000000000L | mantissa) - 1.0);
-    exp = __CLC_CONVERT_INTN((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
-    __CLC_GENTYPE xexp1 = __CLC_CONVERT_GENTYPE(exp);
-    __CLC_LONGN mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
-    xexp = __CLC_CONVERT_LONGN(mask_exp_1023) ? xexp1 : xexp;
-    mantissa = __CLC_CONVERT_LONGN(mask_exp_1023) ? mantissa1 : mantissa;
-
-    __CLC_LONGN rax = (mantissa & 0x000ff00000000000) +
-                      ((mantissa & 0x0000080000000000) << 1);
-    __CLC_INTN index = __CLC_CONVERT_INTN(rax >> 44);
-
-    __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L);
-    __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L);
-    __CLC_GENTYPE f = F - Y;
-    __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index);
-    __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index);
-    __CLC_GENTYPE f_inv = (log_h + log_t) * f;
-    __CLC_GENTYPE r1 =
-        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L);
-    __CLC_GENTYPE r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
-    __CLC_GENTYPE r = r1 + r2;
-
-    __CLC_GENTYPE poly = __clc_fma(
-        r,
-        __clc_fma(r,
-                  __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
-                  1.0 / 4.0),
-        1.0 / 3.0);
-    poly = poly * r * r * r;
-
-    __CLC_GENTYPE hr1r1 = 0.5 * r1 * r1;
-    __CLC_GENTYPE poly0h = r1 + hr1r1;
-    __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1;
-    poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
-
-    log_h = __CLC_USE_TABLE(powlog_tbl_head, index);
-    log_t = __CLC_USE_TABLE(powlog_tbl_tail, index);
-
-    __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
-    __CLC_GENTYPE resT = resT_t - poly0h;
-    __CLC_GENTYPE resH = __clc_fma(xexp, real_log2_lead, log_h);
-    __CLC_GENTYPE resT_h = poly0h;
-
-    __CLC_GENTYPE H = resT + resH;
-    __CLC_GENTYPE H_h =
-        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(H) & 0xfffffffff8000000L);
-    __CLC_GENTYPE T =
-        (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
-    H = H_h;
-
-    __CLC_GENTYPE y_head =
-        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(uy) & 0xfffffffff8000000L);
-    __CLC_GENTYPE y_tail = y - y_head;
-
-    __CLC_GENTYPE temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
-    v = __clc_fma(y_head, H, temp);
-    vt = __clc_fma(y_head, H, -v) + temp;
-  }
-
-  // Now calculate exp of (v,vt)
-
-  __CLC_GENTYPE expv;
-  {
-    const __CLC_GENTYPE max_exp_arg = 709.782712893384;
-    const __CLC_GENTYPE min_exp_arg = -745.1332191019411;
-    const __CLC_GENTYPE sixtyfour_by_lnof2 = 92.33248261689366;
-    const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081;
-    const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10;
-
-    // If v is so large that we need to return INFINITY, or so small that we
-    // need to return 0, set v to known values that will produce that result. Do
-    // not try to continue the computation with the original v and patch it up
-    // afterwards because v may be so large that temp is out of range of int, in
-    // which case that conversion, and a value based on that conversion being
-    // passed to __clc_ldexp, results in undefined behavior.
-    v = v > max_exp_arg ? 1000.0 : v;
-    v = v < min_exp_arg ? -1000.0 : v;
-
-    __CLC_GENTYPE temp = v * sixtyfour_by_lnof2;
-    __CLC_INTN n = __CLC_CONVERT_INTN(temp);
-    __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n);
-    __CLC_INTN j = n & 0x0000003f;
-    __CLC_INTN m = n >> 6;
-
-    __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j);
-    __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j);
-    __CLC_GENTYPE f = f1 + f2;
-
-    __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v);
-    __CLC_GENTYPE r2 = dn * lnof2_by_64_tail;
-    __CLC_GENTYPE r = (r1 + r2) + vt;
-
-    __CLC_GENTYPE q =
-        __clc_fma(r,
-                  __clc_fma(r,
-                            __clc_fma(r,
-                                      __clc_fma(r, 1.38889490863777199667e-03,
-                                                8.33336798434219616221e-03),
-                                      4.16666666662260795726e-02),
-                            1.66666666665260878863e-01),
-                  5.00000000000000008883e-01);
-    q = __clc_fma(r * r, q, r);
-
-    expv = __clc_fma(f, q, f2) + f1;
-    expv = __clc_ldexp(expv, m);
-  }
-
-  // See whether y is an integer.
-  // inty = 0 means not an integer.
-  // inty = 1 means odd integer.
-  // inty = 2 means even integer.
-
-  __CLC_LONGN inty;
-  {
-    __CLC_INTN yexp =
-        __CLC_CONVERT_INTN(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
-    inty = __CLC_CONVERT_LONGN(yexp < 1 ? 0 : 2);
-    inty = __CLC_CONVERT_LONGN(yexp > 53) ? 2 : inty;
-    __CLC_LONGN mask = ((__CLC_LONGN)1L << (53 - yexp)) - 1L;
-    __CLC_LONGN inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1L : 2L;
-    inty1 = (ay & mask) != 0 ? 0 : inty1;
-    inty = __CLC_CONVERT_LONGN(!(yexp < 1) && !(yexp > 53)) ? inty1 : inty;
-  }
-
-  expv *= ((inty == 1) && !xpos) ? -1.0 : 1.0;
-
-  __CLC_LONGN ret = __CLC_AS_LONGN(expv);
-
-  // Now all the edge cases
-  __CLC_BIT_INTN y_is_ninf = uy == (__CLC_LONGN)NINFBITPATT_DP64;
-  __CLC_BIT_INTN y_is_pinf = uy == (__CLC_LONGN)PINFBITPATT_DP64;
-  __CLC_BIT_INTN x_is_inf = ax == (__CLC_LONGN)PINFBITPATT_DP64;
-
-  ret = ax < 0x3ff0000000000000L && y_is_ninf ? PINFBITPATT_DP64 : ret;
-  ret = ax < 0x3ff0000000000000L && y_is_pinf ? 0L : ret;
-  ret = ax == 0x3ff0000000000000L && ay < PINFBITPATT_DP64 ? 0x3ff0000000000000L
-                                                           : ret;
-  ret = ax == 0x3ff0000000000000L && ay == PINFBITPATT_DP64 ? QNANBITPATT_DP64
-                                                            : ret;
-  ret = ax > 0x3ff0000000000000L && y_is_ninf ? 0L : ret;
-  ret = ax > 0x3ff0000000000000L && y_is_pinf ? PINFBITPATT_DP64 : ret;
-  ret = ux < PINFBITPATT_DP64 && ay == 0L ? 0x3ff0000000000000L : ret;
-  ret = (x_is_inf && !ypos) ? 0L : ret;
-  ret = (x_is_inf && ypos) ? PINFBITPATT_DP64 : ret;
-  ret = (x_is_inf && y_is_pinf) ? PINFBITPATT_DP64 : ret;
-  ret = ((ax == PINFBITPATT_DP64) && (ay == 0L)) ? QNANBITPATT_DP64 : ret;
-  ret = ((ax == 0L) && !ypos) ? PINFBITPATT_DP64 : ret;
-  ret = ((ax == 0L) && ypos) ? 0L : ret;
-  ret = ((ax == 0L) && (ay == 0L)) ? QNANBITPATT_DP64 : ret;
-  ret = ((ax != 0L) && !xpos) ? QNANBITPATT_DP64 : ret;
-  ret = ax > PINFBITPATT_DP64 ? ux : ret;
-  ret = ay > PINFBITPATT_DP64 ? uy : ret;
-
-  return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
-                                                __CLC_GENTYPE y) {
-  return __CLC_CONVERT_GENTYPE(
-      __clc_powr(__CLC_CONVERT_FLOATN(x), __CLC_CONVERT_FLOATN(y)));
-}
-
-#endif
diff --git a/libclc/clc/lib/generic/math/clc_rootn.cl b/libclc/clc/lib/generic/math/clc_rootn.cl
index da397cf66da62..29a51baac87c3 100644
--- a/libclc/clc/lib/generic/math/clc_rootn.cl
+++ b/libclc/clc/lib/generic/math/clc_rootn.cl
@@ -6,16 +6,35 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/clc_convert.h>
-#include <clc/float/definitions.h>
-#include <clc/internal/clc.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
+#include "clc/clc_convert.h"
+#include "clc/float/definitions.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_recip_fast.h"
+#include "clc/math/clc_rootn.h"
+#include "clc/math/clc_trunc.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_isunordered.h"
 
-#define __CLC_BODY <clc_rootn.inc>
+#define COMPILING_ROOTN
+#define __CLC_BODY <clc_pow_base.inc>
 #include <clc/math/gentype.inc>
+
+#define __CLC_FUNCTION __clc_rootn
+#define __CLC_BODY                                                             \
+  "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_rootn_fast
+#define __CLC_BODY                                                             \
+  "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_rootn.inc b/libclc/clc/lib/generic/math/clc_rootn.inc
deleted file mode 100644
index fd3d0becb0dff..0000000000000
--- a/libclc/clc/lib/generic/math/clc_rootn.inc
+++ /dev/null
@@ -1,405 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Computes pow using log and exp
-//
-//   x^y = exp(y * log(x))
-//
-// We take care not to lose precision in the intermediate steps.
-//
-// When computing log, calculate it in splits:
-//
-//   r = f * (p_invead + p_inv_tail)
-//   r = rh + rt
-//
-// Calculate log polynomial using r, in end addition, do:
-//
-//   poly = poly + ((rh-r) + rt)
-//
-//   lth = -r
-//   ltt = ((xexp * log2_t) - poly) + logT
-//   lt = lth + ltt
-//
-//   lh = (xexp * log2_h) + logH
-//   l = lh + lt
-//
-// Calculate final log answer as gh and gt:
-//
-//   gh = l & higher-half bits
-//   gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
-//
-//   yh = y & higher-half bits
-//   yt = y - yh
-//
-// Before entering computation of exp:
-//
-//   vs = ((yt*gt + yt*gh) + yh*gt)
-//   v = vs + yh*gh
-//   vt = ((yh*gh - v) + vs)
-//
-// In calculation of exp, add vt to r that is used for poly.
-//
-// At the end of exp, do:
-//
-//   ((((expT * poly) + expT) + expH*poly) + expH)
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
-                                                 __CLC_INTN ny) {
-  __CLC_GENTYPE y = MATH_RECIP(__CLC_CONVERT_GENTYPE(ny));
-
-  __CLC_INTN ix = __CLC_AS_INTN(x);
-  __CLC_INTN ax = ix & EXSIGNBIT_SP32;
-  __CLC_INTN xpos = ix == ax;
-
-  __CLC_INTN iy = __CLC_AS_INTN(y);
-  __CLC_INTN ay = iy & EXSIGNBIT_SP32;
-  __CLC_INTN ypos = iy == ay;
-
-  // Extra precise log calculation
-  // First handle case that x is close to 1
-  __CLC_GENTYPE r = 1.0f - __CLC_AS_GENTYPE(ax);
-  __CLC_INTN near1 = __clc_fabs(r) < 0x1.0p-4f;
-  __CLC_GENTYPE r2 = r * r;
-
-  // Coefficients are just 1/3, 1/4, 1/5 and 1/6
-  __CLC_GENTYPE poly = __clc_mad(
-      r,
-      __clc_mad(r,
-                __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
-                          0x1.99999ap-3f),
-                0x1.000000p-2f),
-      0x1.555556p-2f);
-
-  poly *= r2 * r;
-
-  __CLC_GENTYPE lth_near1 = -r2 * 0.5f;
-  __CLC_GENTYPE ltt_near1 = -poly;
-  __CLC_GENTYPE lt_near1 = lth_near1 + ltt_near1;
-  __CLC_GENTYPE lh_near1 = -r;
-  __CLC_GENTYPE l_near1 = lh_near1 + lt_near1;
-
-  // Computations for x not near 1
-  __CLC_INTN m = __CLC_CONVERT_INTN(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
-  __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m);
-  __CLC_INTN ixs = __CLC_AS_INTN(__CLC_AS_GENTYPE(ax | 0x3f800000) - 1.0f);
-  __CLC_GENTYPE mfs = __CLC_CONVERT_GENTYPE((ixs >> EXPSHIFTBITS_SP32) - 253);
-  __CLC_INTN c = m == -127;
-  __CLC_INTN ixn = c ? ixs : ax;
-  __CLC_GENTYPE mfn = c ? mfs : mf;
-
-  __CLC_INTN indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
-  // F - Y
-  __CLC_GENTYPE f = __CLC_AS_GENTYPE(0x3f000000 | indx) -
-                    __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32));
-
-  indx = indx >> 16;
-  __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx);
-  __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx);
-  ;
-  r = rh + rt;
-
-  poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
-         (r * r);
-  poly += (rh - r) + rt;
-
-  const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f;  // 0.693115234
-  const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
-  __CLC_GENTYPE lth = -r;
-  __CLC_GENTYPE ltt =
-      __clc_mad(mfn, LOG2_TAIL, -poly) + __CLC_USE_TABLE(loge_tbl_hi, indx);
-  __CLC_GENTYPE lt = lth + ltt;
-  __CLC_GENTYPE lh =
-      __clc_mad(mfn, LOG2_HEAD, __CLC_USE_TABLE(loge_tbl_lo, indx));
-  __CLC_GENTYPE l = lh + lt;
-
-  // Select near 1 or not
-  lth = near1 ? lth_near1 : lth;
-  ltt = near1 ? ltt_near1 : ltt;
-  lt = near1 ? lt_near1 : lt;
-  lh = near1 ? lh_near1 : lh;
-  l = near1 ? l_near1 : l;
-
-  __CLC_GENTYPE gh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(l) & 0xfffff000);
-  __CLC_GENTYPE gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
-  __CLC_GENTYPE yh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(iy) & 0xfffff000);
-
-  __CLC_GENTYPE fny = __CLC_CONVERT_GENTYPE(ny);
-  __CLC_GENTYPE fnyh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(fny) & 0xfffff000);
-  __CLC_GENTYPE fnyt = __CLC_CONVERT_GENTYPE(ny - __CLC_CONVERT_INTN(fnyh));
-  __CLC_GENTYPE yt =
-      MATH_DIVIDE(__clc_mad(-fnyt, yh, __clc_mad(-fnyh, yh, 1.0f)), fny);
-
-  __CLC_GENTYPE ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
-  __CLC_GENTYPE ylogx = __clc_mad(yh, gh, ylogx_s);
-  __CLC_GENTYPE ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
-
-  // Extra precise exp of ylogx
-  const __CLC_GENTYPE R_64_BY_LOG2 =
-      0x1.715476p+6f; // 64/log2 : 92.332482616893657
-  __CLC_INTN n = __CLC_CONVERT_INTN(ylogx * R_64_BY_LOG2);
-  __CLC_GENTYPE nf = __CLC_CONVERT_GENTYPE(n);
-
-  __CLC_INTN j = n & 0x3f;
-  m = n >> 6;
-  __CLC_INTN m2 = m << EXPSHIFTBITS_SP32;
-
-  // log2/64 lead: 0.0108032227
-  const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f;
-  // log2/64 tail: 0.0000272020388
-  const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
-  r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
-      ylogx_t;
-
-  // Truncated Taylor series for e^r
-  poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
-                             0x1.000000p-1f),
-                   r * r, r);
-
-  __CLC_GENTYPE exph = __CLC_USE_TABLE(exp_tbl_ep_head, j);
-  __CLC_GENTYPE expt = __CLC_USE_TABLE(exp_tbl_ep_tail, j);
-
-  __CLC_GENTYPE expylogx =
-      __clc_mad(exph, poly, __clc_mad(expt, poly, expt)) + exph;
-  __CLC_GENTYPE sexpylogx =
-      __clc_fp32_subnormals_supported()
-          ? expylogx * __CLC_AS_GENTYPE((__CLC_INTN)0x1 << (m + 149))
-          : 0.0f;
-
-  __CLC_GENTYPE texpylogx = __CLC_AS_GENTYPE(__CLC_AS_INTN(expylogx) + m2);
-  expylogx = m < -125 ? sexpylogx : texpylogx;
-
-  // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
-  expylogx = ((ylogx > 0x1.62e430p+6f) |
-              (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f))
-                 ? __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32)
-                 : expylogx;
-
-  // Result is 0 if ylogx < -149*log2
-  expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
-  // Classify y:
-  //   inty = 0 means not an integer.
-  //   inty = 1 means odd integer.
-  //   inty = 2 means even integer.
-
-  __CLC_INTN inty = 2 - (ny & 1);
-
-  __CLC_GENTYPE signval =
-      __CLC_AS_GENTYPE((__CLC_AS_UINTN(expylogx) ^ SIGNBIT_SP32));
-  expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
-  __CLC_INTN ret = __CLC_AS_INTN(expylogx);
-
-  // Corner case handling
-  __CLC_BIT_INTN x_is_ninf = ix == (__CLC_INTN)NINFBITPATT_SP32;
-  __CLC_BIT_INTN x_is_pinf = ix == (__CLC_INTN)PINFBITPATT_SP32;
-
-  ret = (!xpos & (inty == 2)) ? __CLC_AS_INTN(__CLC_GENTYPE_NAN) : ret;
-  __CLC_INTN xinf =
-      xpos ? (__CLC_INTN)PINFBITPATT_SP32 : (__CLC_INTN)NINFBITPATT_SP32;
-  ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
-  ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret;
-  ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret;
-  __CLC_INTN xzero = xpos ? 0 : (__CLC_INTN)0x80000000;
-  ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
-  ret = (x_is_ninf & ypos & (inty == 1)) ? (__CLC_INTN)NINFBITPATT_SP32 : ret;
-  ret = (x_is_ninf & !ypos & (inty == 1)) ? (__CLC_INTN)0x80000000 : ret;
-  ret = (x_is_pinf & !ypos) ? 0 : ret;
-  ret = (x_is_pinf & ypos) ? PINFBITPATT_SP32 : ret;
-  ret = ax > PINFBITPATT_SP32 ? ix : ret;
-  ret = ny == 0 ? __CLC_AS_INTN(__CLC_GENTYPE_NAN) : ret;
-
-  return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
-                                                 __CLC_INTN ny) {
-  const __CLC_GENTYPE real_log2_tail = 5.76999904754328540596e-08;
-  const __CLC_GENTYPE real_log2_lead = 6.93147122859954833984e-01;
-
-  __CLC_GENTYPE dny = __CLC_CONVERT_GENTYPE(ny);
-  __CLC_GENTYPE y = 1.0 / dny;
-
-  __CLC_LONGN ux = __CLC_AS_LONGN(x);
-  __CLC_LONGN ax = __CLC_AS_LONGN(__clc_fabs(x));
-  __CLC_BIT_INTN xpos = ax == ux;
-
-  __CLC_LONGN uy = __CLC_AS_LONGN(y);
-  __CLC_LONGN ay = __CLC_AS_LONGN(__clc_fabs(y));
-  __CLC_BIT_INTN ypos = ay == uy;
-
-  // Extended precision log
-  __CLC_GENTYPE v, vt;
-  {
-    __CLC_INTN exp = __CLC_CONVERT_INTN(ax >> 52) - 1023;
-    __CLC_INTN mask_exp_1023 = exp == -1023;
-    __CLC_GENTYPE xexp = __CLC_CONVERT_GENTYPE(exp);
-    __CLC_LONGN mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
-    __CLC_LONGN temp_ux =
-        __CLC_AS_LONGN(__CLC_AS_GENTYPE(0x3ff0000000000000L | mantissa) - 1.0);
-    exp = __CLC_CONVERT_INTN((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
-    __CLC_GENTYPE xexp1 = __CLC_CONVERT_GENTYPE(exp);
-    __CLC_LONGN mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
-    xexp = __CLC_CONVERT_LONGN(mask_exp_1023) ? xexp1 : xexp;
-    mantissa = __CLC_CONVERT_LONGN(mask_exp_1023) ? mantissa1 : mantissa;
-
-    __CLC_LONGN rax = (mantissa & 0x000ff00000000000) +
-                      ((mantissa & 0x0000080000000000) << 1);
-    __CLC_INTN index = __CLC_CONVERT_INTN(rax >> 44);
-
-    __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L);
-    __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L);
-    __CLC_GENTYPE f = F - Y;
-    __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index);
-    __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index);
-    __CLC_GENTYPE f_inv = (log_h + log_t) * f;
-    __CLC_GENTYPE r1 =
-        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L);
-    __CLC_GENTYPE r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
-    __CLC_GENTYPE r = r1 + r2;
-
-    __CLC_GENTYPE poly = __clc_fma(
-        r,
-        __clc_fma(r,
-                  __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
-                  1.0 / 4.0),
-        1.0 / 3.0);
-    poly = poly * r * r * r;
-
-    __CLC_GENTYPE hr1r1 = 0.5 * r1 * r1;
-    __CLC_GENTYPE poly0h = r1 + hr1r1;
-    __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1;
-    poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
-
-    log_h = __CLC_USE_TABLE(powlog_tbl_head, index);
-    log_t = __CLC_USE_TABLE(powlog_tbl_tail, index);
-
-    __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
-    __CLC_GENTYPE resT = resT_t - poly0h;
-    __CLC_GENTYPE resH = __clc_fma(xexp, real_log2_lead, log_h);
-    __CLC_GENTYPE resT_h = poly0h;
-
-    __CLC_GENTYPE H = resT + resH;
-    __CLC_GENTYPE H_h =
-        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(H) & 0xfffffffff8000000L);
-    __CLC_GENTYPE T =
-        (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
-    H = H_h;
-
-    __CLC_GENTYPE y_head =
-        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(uy) & 0xfffffffff8000000L);
-    __CLC_GENTYPE y_tail = y - y_head;
-
-    __CLC_GENTYPE fnyh =
-        __CLC_AS_GENTYPE(__CLC_AS_ULONGN(dny) & 0xfffffffffff00000);
-    __CLC_GENTYPE fnyt = __CLC_CONVERT_GENTYPE(ny - __CLC_CONVERT_INTN(fnyh));
-    y_tail = __clc_fma(-fnyt, y_head, __clc_fma(-fnyh, y_head, 1.0)) / dny;
-
-    __CLC_GENTYPE temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
-    v = __clc_fma(y_head, H, temp);
-    vt = __clc_fma(y_head, H, -v) + temp;
-  }
-
-  // Now calculate exp of (v,vt)
-
-  __CLC_GENTYPE expv;
-  {
-    const __CLC_GENTYPE max_exp_arg = 709.782712893384;
-    const __CLC_GENTYPE min_exp_arg = -745.1332191019411;
-    const __CLC_GENTYPE sixtyfour_by_lnof2 = 92.33248261689366;
-    const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081;
-    const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10;
-
-    // If v is so large that we need to return INFINITY, or so small that we
-    // need to return 0, set v to known values that will produce that result. Do
-    // not try to continue the computation with the original v and patch it up
-    // afterwards because v may be so large that temp is out of range of int, in
-    // which case that conversion, and a value based on that conversion being
-    // passed to __clc_ldexp, results in undefined behavior.
-    v = v > max_exp_arg ? 1000.0 : v;
-    v = v < min_exp_arg ? -1000.0 : v;
-
-    __CLC_GENTYPE temp = v * sixtyfour_by_lnof2;
-    __CLC_INTN n = __CLC_CONVERT_INTN(temp);
-    __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n);
-    __CLC_INTN j = n & 0x0000003f;
-    __CLC_INTN m = n >> 6;
-
-    __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j);
-    __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j);
-    __CLC_GENTYPE f = f1 + f2;
-
-    __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v);
-    __CLC_GENTYPE r2 = dn * lnof2_by_64_tail;
-    __CLC_GENTYPE r = (r1 + r2) + vt;
-
-    __CLC_GENTYPE q =
-        __clc_fma(r,
-                  __clc_fma(r,
-                            __clc_fma(r,
-                                      __clc_fma(r, 1.38889490863777199667e-03,
-                                                8.33336798434219616221e-03),
-                                      4.16666666662260795726e-02),
-                            1.66666666665260878863e-01),
-                  5.00000000000000008883e-01);
-    q = __clc_fma(r * r, q, r);
-
-    expv = __clc_fma(f, q, f2) + f1;
-    expv = __clc_ldexp(expv, m);
-  }
-
-  // See whether y is an integer.
-  // inty = 0 means not an integer.
-  // inty = 1 means odd integer.
-  // inty = 2 means even integer.
-
-  __CLC_LONGN inty = __CLC_CONVERT_LONGN(2 - (ny & 1));
-
-  expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
-
-  __CLC_LONGN ret = __CLC_AS_LONGN(expv);
-
-  // Now all the edge cases
-  __CLC_BIT_INTN x_is_ninf = ux == (__CLC_LONGN)NINFBITPATT_DP64;
-  __CLC_BIT_INTN x_is_pinf = ux == (__CLC_LONGN)PINFBITPATT_DP64;
-  ret = (!xpos & (inty == 2)) ? __CLC_AS_LONGN(__CLC_GENTYPE_NAN) : ret;
-  __CLC_LONGN xinf =
-      xpos ? (__CLC_LONGN)PINFBITPATT_DP64 : (__CLC_LONGN)NINFBITPATT_DP64;
-  ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
-  ret =
-      ((ax == 0L) & !ypos & (inty == 2)) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
-  ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret;
-  __CLC_LONGN xzero = xpos ? 0L : (__CLC_LONGN)0x8000000000000000L;
-  ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
-  ret = (x_is_ninf & ypos & (inty == 1)) ? (__CLC_LONGN)NINFBITPATT_DP64 : ret;
-  ret = (x_is_ninf & !ypos & (inty == 1)) ? (__CLC_LONGN)0x8000000000000000L
-                                          : ret;
-  ret = (x_is_pinf & !ypos) ? 0L : ret;
-  ret = (x_is_pinf & ypos) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
-  ret = ax > (__CLC_LONGN)PINFBITPATT_DP64 ? ux : ret;
-  ret = __CLC_CONVERT_LONGN(ny == 0) ? __CLC_AS_LONGN(__CLC_GENTYPE_NAN) : ret;
-  return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
-                                                 __CLC_INTN y) {
-  return __CLC_CONVERT_GENTYPE(__clc_rootn(__CLC_CONVERT_FLOATN(x), y));
-}
-
-#endif
diff --git a/libclc/opencl/lib/generic/math/pow.cl b/libclc/opencl/lib/generic/math/pow.cl
index 2fd3a50733ed5..07aa50abe48d6 100644
--- a/libclc/opencl/lib/generic/math/pow.cl
+++ b/libclc/opencl/lib/generic/math/pow.cl
@@ -6,8 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/math/clc_pow.h>
+#include "clc/math/clc_pow.h"
 
 #define __CLC_FUNCTION pow
-#define __CLC_BODY <clc/shared/binary_def.inc>
-#include <clc/math/gentype.inc>
+#define __CLC_BODY "clc/shared/binary_def.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+#undef __CLC_IMPL_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __pow_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_pow_fast
+#define __CLC_BODY "clc/shared/binary_def.inc"
+#include "clc/math/gentype.inc"
diff --git a/libclc/opencl/lib/generic/math/pown.cl b/libclc/opencl/lib/generic/math/pown.cl
index e48bc10a636ab..7f51666530b1a 100644
--- a/libclc/opencl/lib/generic/math/pown.cl
+++ b/libclc/opencl/lib/generic/math/pown.cl
@@ -6,8 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/math/clc_pown.h>
+#include "clc/math/clc_pown.h"
 
 #define __CLC_FUNCTION pown
 #define __CLC_BODY <clc/shared/binary_def_with_int_second_arg.inc>
-#include <clc/math/gentype.inc>
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+#undef __CLC_IMPL_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __pown_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_pown_fast
+#define __CLC_BODY <clc/shared/binary_def_with_int_second_arg.inc>
+#include "clc/math/gentype.inc"
diff --git a/libclc/opencl/lib/generic/math/powr.cl b/libclc/opencl/lib/generic/math/powr.cl
index 168e30aa57b08..e74297589a2b9 100644
--- a/libclc/opencl/lib/generic/math/powr.cl
+++ b/libclc/opencl/lib/generic/math/powr.cl
@@ -6,8 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/math/clc_powr.h>
+#include "clc/math/clc_powr.h"
 
 #define __CLC_FUNCTION powr
 #define __CLC_BODY <clc/shared/binary_def.inc>
-#include <clc/math/gentype.inc>
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+#undef __CLC_IMPL_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __powr_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_powr_fast
+#define __CLC_BODY <clc/shared/binary_def.inc>
+#include "clc/math/gentype.inc"
diff --git a/libclc/opencl/lib/generic/math/rootn.cl b/libclc/opencl/lib/generic/math/rootn.cl
index 8c9c7f4cc72f5..1c329e49dedaf 100644
--- a/libclc/opencl/lib/generic/math/rootn.cl
+++ b/libclc/opencl/lib/generic/math/rootn.cl
@@ -6,8 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/math/clc_rootn.h>
+#include "clc/math/clc_rootn.h"
 
 #define __CLC_FUNCTION rootn
 #define __CLC_BODY <clc/shared/binary_def_with_int_second_arg.inc>
-#include <clc/math/gentype.inc>
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+#undef __CLC_IMPL_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __rootn_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_rootn_fast
+#define __CLC_BODY <clc/shared/binary_def_with_int_second_arg.inc>
+#include "clc/math/gentype.inc"

>From 04dc71ffc20fed0ade86c930da1014c838715693 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 17 Mar 2026 08:12:32 +0100
Subject: [PATCH 2/3] Rename defines

---
 libclc/clc/lib/generic/math/clc_pow.cl       |  2 +-
 libclc/clc/lib/generic/math/clc_pow_base.inc | 32 ++++++++++----------
 libclc/clc/lib/generic/math/clc_pown.cl      |  2 +-
 libclc/clc/lib/generic/math/clc_powr.cl      |  2 +-
 libclc/clc/lib/generic/math/clc_rootn.cl     |  2 +-
 5 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/libclc/clc/lib/generic/math/clc_pow.cl b/libclc/clc/lib/generic/math/clc_pow.cl
index 14fbfb68359f5..eba5daa1a1e41 100644
--- a/libclc/clc/lib/generic/math/clc_pow.cl
+++ b/libclc/clc/lib/generic/math/clc_pow.cl
@@ -25,7 +25,7 @@
 #include "clc/relational/clc_isinf.h"
 #include "clc/relational/clc_isunordered.h"
 
-#define COMPILING_POW
+#define __CLC_COMPILING_POW
 #define __CLC_BODY "clc_pow_base.inc"
 #include "clc/math/gentype.inc"
 
diff --git a/libclc/clc/lib/generic/math/clc_pow_base.inc b/libclc/clc/lib/generic/math/clc_pow_base.inc
index 56fdb177a844b..016a506c41487 100644
--- a/libclc/clc/lib/generic/math/clc_pow_base.inc
+++ b/libclc/clc/lib/generic/math/clc_pow_base.inc
@@ -52,7 +52,7 @@
 
 #ifdef __CLC_SCALAR
 
-#ifdef COMPILING_POW
+#ifdef __CLC_COMPILING_POW
 
 _CLC_OVERLOAD _CLC_CONST static bool is_integer(__CLC_GENTYPE ay) {
   return __clc_trunc(ay) == ay;
@@ -76,7 +76,7 @@ static __CLC_GENTYPE fast_expylnx(__CLC_GENTYPE x, __CLC_GENTYPE y) {
   return __clc_exp2(y * __clc_log2(ax));
 }
 
-#if defined(COMPILING_POW) || defined(COMPILING_POWR)
+#if defined(__CLC_COMPILING_POW) || defined(__CLC_COMPILING_POWR)
 
 _CLC_CONST
 static __CLC_GENTYPE compute_expylnx_float(__CLC_GENTYPE x, __CLC_GENTYPE y) {
@@ -85,7 +85,7 @@ static __CLC_GENTYPE compute_expylnx_float(__CLC_GENTYPE x, __CLC_GENTYPE y) {
 }
 #endif
 
-#if defined(COMPILING_POW)
+#if defined(__CLC_COMPILING_POW)
 
 _CLC_CONST
 static __CLC_GENTYPE pow_fixup(__CLC_GENTYPE x, __CLC_GENTYPE y,
@@ -139,7 +139,7 @@ __clc_pow_fast(__CLC_GENTYPE x, __CLC_GENTYPE y) {
   return pow_fixup(x, y, expylnx);
 }
 
-#elif defined(COMPILING_POWR)
+#elif defined(__CLC_COMPILING_POWR)
 
 _CLC_CONST
 static __CLC_GENTYPE powr_fixup(__CLC_GENTYPE x, __CLC_GENTYPE y,
@@ -183,7 +183,7 @@ __clc_powr_fast(__CLC_GENTYPE x, __CLC_GENTYPE y) {
   return powr_fixup(x, y, expylnx);
 }
 
-#elif defined(COMPILING_POWN)
+#elif defined(__CLC_COMPILING_POWN)
 
 _CLC_CONST
 static __CLC_GENTYPE compute_expylnx_int(__CLC_GENTYPE x, __CLC_INTN ny) {
@@ -226,7 +226,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown_fast(__CLC_GENTYPE x,
   return pown_fixup(x, ny, expylnx);
 }
 
-#elif defined(COMPILING_ROOTN)
+#elif defined(__CLC_COMPILING_ROOTN)
 
 // root version of compute_expylnx_int
 _CLC_CONST
@@ -278,7 +278,7 @@ __clc_rootn_fast(__CLC_GENTYPE x, __CLC_INTN ny) {
 
 #elif __CLC_FPSIZE == 64
 
-#if defined(COMPILING_POW)
+#if defined(__CLC_COMPILING_POW)
 
 _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
                                                           __CLC_GENTYPE y) {
@@ -317,7 +317,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
   return ret;
 }
 
-#elif defined(COMPILING_POWR)
+#elif defined(__CLC_COMPILING_POWR)
 
 _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
                                                            __CLC_GENTYPE y) {
@@ -348,7 +348,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
   return ret;
 }
 
-#elif defined(COMPILING_POWN)
+#elif defined(__CLC_COMPILING_POWN)
 
 _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
                                                            __CLC_INTN ny) {
@@ -373,7 +373,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
   return ret;
 }
 
-#elif defined(COMPILING_ROOTN)
+#elif defined(__CLC_COMPILING_ROOTN)
 
 _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
                                                             __CLC_INTN ny) {
@@ -404,7 +404,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
 
 #elif __CLC_FPSIZE == 16
 
-#if defined(COMPILING_POW) || defined(COMPILING_POWR)
+#if defined(__CLC_COMPILING_POW) || defined(__CLC_COMPILING_POWR)
 
 _CLC_CONST
 static __CLC_GENTYPE compute_expylnx_f16(__CLC_GENTYPE ax, __CLC_GENTYPE y) {
@@ -414,9 +414,9 @@ static __CLC_GENTYPE compute_expylnx_f16(__CLC_GENTYPE ax, __CLC_GENTYPE y) {
   return __CLC_CONVERT_GENTYPE(result);
 }
 
-#endif // defined(COMPILING_POW) || defined(COMPILING_POWR)
+#endif // defined(__CLC_COMPILING_POW) || defined(__CLC_COMPILING_POWR)
 
-#if defined(COMPILING_POW)
+#if defined(__CLC_COMPILING_POW)
 
 _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
                                                           __CLC_GENTYPE y) {
@@ -454,7 +454,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
   return ret;
 }
 
-#elif defined(COMPILING_POWR)
+#elif defined(__CLC_COMPILING_POWR)
 
 _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
                                                            __CLC_GENTYPE y) {
@@ -482,7 +482,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
   return ret;
 }
 
-#elif defined(COMPILING_POWN)
+#elif defined(__CLC_COMPILING_POWN)
 
 _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
                                                            __CLC_INTN ny) {
@@ -507,7 +507,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
   return ret;
 }
 
-#elif defined(COMPILING_ROOTN)
+#elif defined(__CLC_COMPILING_ROOTN)
 
 _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
                                                             __CLC_INTN ny) {
diff --git a/libclc/clc/lib/generic/math/clc_pown.cl b/libclc/clc/lib/generic/math/clc_pown.cl
index c21738de8cf2f..6ddf5fd59b754 100644
--- a/libclc/clc/lib/generic/math/clc_pown.cl
+++ b/libclc/clc/lib/generic/math/clc_pown.cl
@@ -20,7 +20,7 @@
 #include "clc/math/clc_trunc.h"
 #include "clc/relational/clc_isinf.h"
 
-#define COMPILING_POWN
+#define __CLC_COMPILING_POWN
 #define __CLC_BODY "clc_pow_base.inc"
 #include "clc/math/gentype.inc"
 #undef __CLC_FUNCTION
diff --git a/libclc/clc/lib/generic/math/clc_powr.cl b/libclc/clc/lib/generic/math/clc_powr.cl
index 287e1a31b5b2a..fc153dfebf9a0 100644
--- a/libclc/clc/lib/generic/math/clc_powr.cl
+++ b/libclc/clc/lib/generic/math/clc_powr.cl
@@ -25,7 +25,7 @@
 #include "clc/relational/clc_isinf.h"
 #include "clc/relational/clc_isunordered.h"
 
-#define COMPILING_POWR
+#define __CLC_COMPILING_POWR
 #define __CLC_BODY <clc_pow_base.inc>
 #include "clc/math/gentype.inc"
 
diff --git a/libclc/clc/lib/generic/math/clc_rootn.cl b/libclc/clc/lib/generic/math/clc_rootn.cl
index 29a51baac87c3..9c50531542cbb 100644
--- a/libclc/clc/lib/generic/math/clc_rootn.cl
+++ b/libclc/clc/lib/generic/math/clc_rootn.cl
@@ -23,7 +23,7 @@
 #include "clc/relational/clc_isinf.h"
 #include "clc/relational/clc_isunordered.h"
 
-#define COMPILING_ROOTN
+#define __CLC_COMPILING_ROOTN
 #define __CLC_BODY <clc_pow_base.inc>
 #include <clc/math/gentype.inc>
 

>From 49ec3365f786d8541c795bce893dc3259d6263bf Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 17 Mar 2026 08:32:45 +0100
Subject: [PATCH 3/3] New scalarize file

---
 .../clc/shared/binary_def_scalarize_loop.inc  | 69 +++++++++++++++++++
 ...def_with_int_second_arg_scalarize_loop.inc | 37 ----------
 libclc/clc/lib/generic/math/clc_pown.cl       | 10 +--
 libclc/clc/lib/generic/math/clc_rootn.cl      | 10 +--
 4 files changed, 81 insertions(+), 45 deletions(-)
 create mode 100644 libclc/clc/include/clc/shared/binary_def_scalarize_loop.inc
 delete mode 100644 libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc

diff --git a/libclc/clc/include/clc/shared/binary_def_scalarize_loop.inc b/libclc/clc/include/clc/shared/binary_def_scalarize_loop.inc
new file mode 100644
index 0000000000000..cd79e0f6aafed
--- /dev/null
+++ b/libclc/clc/include/clc/shared/binary_def_scalarize_loop.inc
@@ -0,0 +1,69 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/utils.h"
+
+#ifndef __CLC_IMPL_FUNCTION
+#define __CLC_IMPL_FUNCTION __CLC_FUNCTION
+#endif
+
+#ifndef __CLC_MIN_VECSIZE
+#define __CLC_MIN_VECSIZE 2
+#endif
+
+#ifndef __CLC_RET_TYPE
+#define __CLC_RET_TYPE __CLC_GENTYPE
+#define __CLC_RET_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_ARG1_TYPE
+#define __CLC_ARG1_TYPE __CLC_GENTYPE
+#define __CLC_ARG1_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_ARG2_TYPE
+#define __CLC_ARG2_TYPE __CLC_GENTYPE
+#define __CLC_ARG2_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#if __CLC_MIN_VECSIZE == 1
+// Passthrough: hand both arguments, unsplit, to __CLC_IMPL_FUNCTION.
+_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE __CLC_FUNCTION(__CLC_ARG1_TYPE x,
+                                                           __CLC_ARG2_TYPE y) {
+  return __CLC_IMPL_FUNCTION(x, y);
+}
+// NOTE(review): with the default __CLC_IMPL_FUNCTION this calls itself.
+#elif __CLC_VECSIZE_OR_1 >= 2
+// Scalarized path: run __CLC_IMPL_FUNCTION once per vector element.
+_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE __CLC_FUNCTION(__CLC_ARG1_TYPE x,
+                                                     __CLC_ARG2_TYPE y) {
+  union {
+    __CLC_ARG1_TYPE vec;
+    __CLC_ARG1_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_x; // first operand, viewable as a vector or as an element array
+
+  union {
+    __CLC_ARG2_TYPE vec;
+    __CLC_ARG2_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_y; // second operand, same two views
+
+  union {
+    __CLC_RET_TYPE vec;
+    __CLC_RET_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_result; // filled element by element, returned as a vector
+  // Store the vectors, then index their elements through the array views.
+  u_x.vec = x;
+  u_y.vec = y;
+  for (int i = 0; i < __CLC_VECSIZE_OR_1; ++i) {
+    u_result.arr[i] = __CLC_IMPL_FUNCTION(u_x.arr[i], u_y.arr[i]);
+  }
+
+  return u_result.vec;
+}
+
+#endif // __CLC_MIN_VECSIZE == 1 || __CLC_VECSIZE_OR_1 >= 2
diff --git a/libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc b/libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc
deleted file mode 100644
index 498df2fc420de..0000000000000
--- a/libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc
+++ /dev/null
@@ -1,37 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clc/utils.h"
-
-#if __CLC_VECSIZE_OR_1 >= 2
-
-#ifndef __CLC_IMPL_FUNCTION
-#define __CLC_IMPL_FUNCTION __CLC_FUNCTION
-#endif
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
-                                                    __CLC_INTN y) {
-  union {
-    __CLC_GENTYPE vec;
-    __CLC_SCALAR_GENTYPE arr[__CLC_VECSIZE_OR_1];
-  } u_x, u_result;
-
-  union {
-    __CLC_INTN vec;
-    int arr[__CLC_VECSIZE_OR_1];
-  } u_y;
-
-  u_x.vec = x;
-  u_y.vec = y;
-  for (int i = 0; i < __CLC_VECSIZE_OR_1; ++i) {
-    u_result.arr[i] = __CLC_IMPL_FUNCTION(u_x.arr[i], u_y.arr[i]);
-  }
-  return u_result.vec;
-}
-
-#endif // __CLC_VECSIZE_OR_1 >= 2
diff --git a/libclc/clc/lib/generic/math/clc_pown.cl b/libclc/clc/lib/generic/math/clc_pown.cl
index 6ddf5fd59b754..b3d5389b66896 100644
--- a/libclc/clc/lib/generic/math/clc_pown.cl
+++ b/libclc/clc/lib/generic/math/clc_pown.cl
@@ -25,15 +25,17 @@
 #include "clc/math/gentype.inc"
 #undef __CLC_FUNCTION
 
+#define __CLC_ARG2_TYPE __CLC_INTN
+#define __CLC_ARG2_SCALAR_TYPE int
 #define __CLC_FUNCTION __clc_pown
-#define __CLC_BODY                                                             \
-  "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
 #include "clc/math/gentype.inc"
 #undef __CLC_FUNCTION
 
 #define __CLC_FLOAT_ONLY
+#define __CLC_ARG2_TYPE __CLC_INTN
+#define __CLC_ARG2_SCALAR_TYPE int
 #define __CLC_FUNCTION __clc_pown_fast
-#define __CLC_BODY                                                             \
-  "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
 #include "clc/math/gentype.inc"
 #undef __CLC_FUNCTION
diff --git a/libclc/clc/lib/generic/math/clc_rootn.cl b/libclc/clc/lib/generic/math/clc_rootn.cl
index 9c50531542cbb..00c5a27fc44d5 100644
--- a/libclc/clc/lib/generic/math/clc_rootn.cl
+++ b/libclc/clc/lib/generic/math/clc_rootn.cl
@@ -28,13 +28,15 @@
 #include <clc/math/gentype.inc>
 
 #define __CLC_FUNCTION __clc_rootn
-#define __CLC_BODY                                                             \
-  "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#define __CLC_ARG2_TYPE __CLC_INTN
+#define __CLC_ARG2_SCALAR_TYPE int
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
 #include "clc/math/gentype.inc"
 #undef __CLC_FUNCTION
 
 #define __CLC_FLOAT_ONLY
+#define __CLC_ARG2_TYPE __CLC_INTN
+#define __CLC_ARG2_SCALAR_TYPE int
 #define __CLC_FUNCTION __clc_rootn_fast
-#define __CLC_BODY                                                             \
-  "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
 #include "clc/math/gentype.inc"