[libclc] libclc: Update pow functions (PR #186890)
Matt Arsenault via cfe-commits
cfe-commits at lists.llvm.org
Tue Mar 17 00:46:02 PDT 2026
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/186890
>From ffbdec6cf46144fca7f22370ca0db1124ab2a062 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 12 Mar 2026 13:02:07 +0100
Subject: [PATCH 1/3] libclc: Update pow functions
The 4 flavors of pow were originally ported from ROCm
device libs between c45ec604f593fcb03d770f4398142d2446017f68,
cc5c65b2c25e0a82fbad95f0ce3bb5262e29eeee, and
fe8e00bc3c65115b2e3d2a43cf3d0d756a934a52. Update to a newer
version. Additionally expose fast variants for use by the
libcall optimizer (e.g., __pow_fast) for float types.
---
libclc/clc/include/clc/math/clc_ep_decl.inc | 5 +
libclc/clc/include/clc/math/clc_exp2_fast.h | 19 +
libclc/clc/include/clc/math/clc_log2_fast.h | 19 +
libclc/clc/include/clc/math/clc_pow.h | 7 +-
libclc/clc/include/clc/math/clc_pown.h | 9 +-
libclc/clc/include/clc/math/clc_powr.h | 9 +-
libclc/clc/include/clc/math/clc_rootn.h | 9 +-
...def_with_int_second_arg_scalarize_loop.inc | 37 ++
libclc/clc/lib/amdgpu/CMakeLists.txt | 2 +
libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl | 33 ++
libclc/clc/lib/amdgpu/math/clc_log2_fast.cl | 25 +
libclc/clc/lib/generic/CMakeLists.txt | 2 +
libclc/clc/lib/generic/math/clc_ep.cl | 3 +
libclc/clc/lib/generic/math/clc_ep.inc | 78 +++
libclc/clc/lib/generic/math/clc_exp2_fast.cl | 15 +
libclc/clc/lib/generic/math/clc_log2_fast.cl | 15 +
libclc/clc/lib/generic/math/clc_pow.cl | 43 +-
libclc/clc/lib/generic/math/clc_pow.inc | 438 --------------
libclc/clc/lib/generic/math/clc_pow_base.inc | 542 ++++++++++++++++++
libclc/clc/lib/generic/math/clc_pown.cl | 42 +-
libclc/clc/lib/generic/math/clc_pown.inc | 402 -------------
libclc/clc/lib/generic/math/clc_powr.cl | 44 +-
libclc/clc/lib/generic/math/clc_powr.inc | 414 -------------
libclc/clc/lib/generic/math/clc_rootn.cl | 41 +-
libclc/clc/lib/generic/math/clc_rootn.inc | 405 -------------
libclc/opencl/lib/generic/math/pow.cl | 14 +-
libclc/opencl/lib/generic/math/pown.cl | 12 +-
libclc/opencl/lib/generic/math/powr.cl | 12 +-
libclc/opencl/lib/generic/math/rootn.cl | 12 +-
29 files changed, 986 insertions(+), 1722 deletions(-)
create mode 100644 libclc/clc/include/clc/math/clc_exp2_fast.h
create mode 100644 libclc/clc/include/clc/math/clc_log2_fast.h
create mode 100644 libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc
create mode 100644 libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl
create mode 100644 libclc/clc/lib/amdgpu/math/clc_log2_fast.cl
create mode 100644 libclc/clc/lib/generic/math/clc_exp2_fast.cl
create mode 100644 libclc/clc/lib/generic/math/clc_log2_fast.cl
delete mode 100644 libclc/clc/lib/generic/math/clc_pow.inc
create mode 100644 libclc/clc/lib/generic/math/clc_pow_base.inc
delete mode 100644 libclc/clc/lib/generic/math/clc_pown.inc
delete mode 100644 libclc/clc/lib/generic/math/clc_powr.inc
delete mode 100644 libclc/clc/lib/generic/math/clc_rootn.inc
diff --git a/libclc/clc/include/clc/math/clc_ep_decl.inc b/libclc/clc/include/clc/math/clc_ep_decl.inc
index d29cfdc6346ba..1739da9f3ae63 100644
--- a/libclc/clc/include/clc/math/clc_ep_decl.inc
+++ b/libclc/clc/include/clc/math/clc_ep_decl.inc
@@ -128,4 +128,9 @@ _CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_sqrt(__CLC_GENTYPE a);
_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_sqrt(__CLC_EP_PAIR a);
+#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64
+_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR a);
+_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(__CLC_GENTYPE a);
+#endif
+
#endif
diff --git a/libclc/clc/include/clc/math/clc_exp2_fast.h b/libclc/clc/include/clc/math/clc_exp2_fast.h
new file mode 100644
index 0000000000000..a42e6c9b7fd48
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_exp2_fast.h
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_EXP2_FAST_H__
+#define __CLC_MATH_CLC_EXP2_FAST_H__
+
+#define __CLC_FUNCTION __clc_exp2_fast
+#define __CLC_BODY <clc/shared/unary_decl.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_EXP2_FAST_H__
diff --git a/libclc/clc/include/clc/math/clc_log2_fast.h b/libclc/clc/include/clc/math/clc_log2_fast.h
new file mode 100644
index 0000000000000..5160afbedebf7
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_log2_fast.h
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_LOG2_FAST_H__
+#define __CLC_MATH_CLC_LOG2_FAST_H__
+
+#define __CLC_FUNCTION __clc_log2_fast
+#define __CLC_BODY <clc/shared/unary_decl.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_LOG2_FAST_H__
diff --git a/libclc/clc/include/clc/math/clc_pow.h b/libclc/clc/include/clc/math/clc_pow.h
index 5e37e5bf6da65..f7399873a4994 100644
--- a/libclc/clc/include/clc/math/clc_pow.h
+++ b/libclc/clc/include/clc/math/clc_pow.h
@@ -11,9 +11,14 @@
#define __CLC_BODY <clc/shared/binary_decl.inc>
#define __CLC_FUNCTION __clc_pow
-
#include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
+#define __CLC_FLOAT_ONLY
+#define __CLC_BODY <clc/shared/binary_decl.inc>
+#define __CLC_FUNCTION __clc_pow_fast
+#include <clc/math/gentype.inc>
#undef __CLC_FUNCTION
+#undef __CLC_FLOAT_ONLY
#endif // __CLC_MATH_CLC_POW_H__
diff --git a/libclc/clc/include/clc/math/clc_pown.h b/libclc/clc/include/clc/math/clc_pown.h
index 30628efb19001..3e2b359468b48 100644
--- a/libclc/clc/include/clc/math/clc_pown.h
+++ b/libclc/clc/include/clc/math/clc_pown.h
@@ -9,11 +9,16 @@
#ifndef __CLC_MATH_CLC_POWN_H__
#define __CLC_MATH_CLC_POWN_H__
-#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
#define __CLC_FUNCTION __clc_pown
-
+#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
#include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_pown_fast
+#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
+#include <clc/math/gentype.inc>
#undef __CLC_FUNCTION
+#undef __CLC_FLOAT_ONLY
#endif // __CLC_MATH_CLC_POWN_H__
diff --git a/libclc/clc/include/clc/math/clc_powr.h b/libclc/clc/include/clc/math/clc_powr.h
index baa494cce6989..67c591ca6aa82 100644
--- a/libclc/clc/include/clc/math/clc_powr.h
+++ b/libclc/clc/include/clc/math/clc_powr.h
@@ -9,11 +9,16 @@
#ifndef __CLC_MATH_CLC_POWR_H__
#define __CLC_MATH_CLC_POWR_H__
-#define __CLC_BODY <clc/shared/binary_decl.inc>
#define __CLC_FUNCTION __clc_powr
-
+#define __CLC_BODY <clc/shared/binary_decl.inc>
#include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_powr_fast
+#define __CLC_BODY <clc/shared/binary_decl.inc>
+#include <clc/math/gentype.inc>
#undef __CLC_FUNCTION
+#undef __CLC_FLOAT_ONLY
#endif // __CLC_MATH_CLC_POWR_H__
diff --git a/libclc/clc/include/clc/math/clc_rootn.h b/libclc/clc/include/clc/math/clc_rootn.h
index 90a25ad52d867..26d111a8671d4 100644
--- a/libclc/clc/include/clc/math/clc_rootn.h
+++ b/libclc/clc/include/clc/math/clc_rootn.h
@@ -9,11 +9,16 @@
#ifndef __CLC_MATH_CLC_ROOTN_H__
#define __CLC_MATH_CLC_ROOTN_H__
-#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
#define __CLC_FUNCTION __clc_rootn
-
+#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
#include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_rootn_fast
+#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
+#include <clc/math/gentype.inc>
#undef __CLC_FUNCTION
+#undef __CLC_FLOAT_ONLY
#endif // __CLC_MATH_CLC_ROOTN_H__
diff --git a/libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc b/libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc
new file mode 100644
index 0000000000000..498df2fc420de
--- /dev/null
+++ b/libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/utils.h"
+
+#if __CLC_VECSIZE_OR_1 >= 2
+
+#ifndef __CLC_IMPL_FUNCTION
+#define __CLC_IMPL_FUNCTION __CLC_FUNCTION
+#endif
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, // Vector overload: calls __CLC_IMPL_FUNCTION once per element pair.
+ __CLC_INTN y) {
+ union { // Element-wise view of the floating-point operand and of the result.
+ __CLC_GENTYPE vec;
+ __CLC_SCALAR_GENTYPE arr[__CLC_VECSIZE_OR_1];
+ } u_x, u_result;
+
+ union { // Element-wise view of the integer operand.
+ __CLC_INTN vec;
+ int arr[__CLC_VECSIZE_OR_1];
+ } u_y;
+
+ u_x.vec = x;
+ u_y.vec = y;
+ for (int i = 0; i < __CLC_VECSIZE_OR_1; ++i) { // One scalar call per element.
+ u_result.arr[i] = __CLC_IMPL_FUNCTION(u_x.arr[i], u_y.arr[i]);
+ }
+ return u_result.vec; // Reassemble the per-element results as a vector.
+}
+
+#endif // __CLC_VECSIZE_OR_1 >= 2
diff --git a/libclc/clc/lib/amdgpu/CMakeLists.txt b/libclc/clc/lib/amdgpu/CMakeLists.txt
index 9b6c9a231ade0..daccc00b841b3 100644
--- a/libclc/clc/lib/amdgpu/CMakeLists.txt
+++ b/libclc/clc/lib/amdgpu/CMakeLists.txt
@@ -3,6 +3,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES
address_space/clc_qualifier.cl
math/clc_exp.cl
math/clc_exp2.cl
+ math/clc_exp2_fast.cl
math/clc_exp10.cl
math/clc_frexp.cl
math/clc_half_exp.cl
@@ -15,6 +16,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES
math/clc_half_rsqrt.cl
math/clc_half_sqrt.cl
math/clc_ldexp.cl
+ math/clc_log2_fast.cl
math/clc_native_exp.cl
math/clc_native_exp2.cl
math/clc_native_log10.cl
diff --git a/libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl b/libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl
new file mode 100644
index 0000000000000..b73bc9f6e260b
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_MIN_VECSIZE 1
+#define __CLC_FUNCTION __clc_exp2_fast
+#define __CLC_IMPL_FUNCTION(x) __builtin_amdgcn_exp2f(x)
+#define __CLC_BODY <clc/shared/unary_def_scalarize.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_FLOAT_ONLY
+
+#define __CLC_HALF_ONLY
+#define __CLC_IMPL_FUNCTION(x) __clc_exp2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_HALF_ONLY
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_IMPL_FUNCTION(x) __clc_exp2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_DOUBLE_ONLY
diff --git a/libclc/clc/lib/amdgpu/math/clc_log2_fast.cl b/libclc/clc/lib/amdgpu/math/clc_log2_fast.cl
new file mode 100644
index 0000000000000..a47fc84b26a00
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/math/clc_log2_fast.cl
@@ -0,0 +1,25 @@
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_MIN_VECSIZE 1
+#define __CLC_FUNCTION __clc_log2_fast
+#define __CLC_IMPL_FUNCTION(x) __builtin_amdgcn_logf(x)
+#define __CLC_BODY <clc/shared/unary_def_scalarize.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_FLOAT_ONLY
+
+#define __CLC_HALF_ONLY
+#define __CLC_IMPL_FUNCTION(x) __clc_log2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_HALF_ONLY
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_IMPL_FUNCTION(x) __clc_log2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_DOUBLE_ONLY
diff --git a/libclc/clc/lib/generic/CMakeLists.txt b/libclc/clc/lib/generic/CMakeLists.txt
index 70a0863524b19..bda2ec67a55c3 100644
--- a/libclc/clc/lib/generic/CMakeLists.txt
+++ b/libclc/clc/lib/generic/CMakeLists.txt
@@ -80,6 +80,7 @@ libclc_configure_source_list(CLC_GENERIC_SOURCES
math/clc_erfc.cl
math/clc_exp.cl
math/clc_exp2.cl
+ math/clc_exp2_fast.cl
math/clc_exp10.cl
math/clc_exp_helper.cl
math/clc_expm1.cl
@@ -114,6 +115,7 @@ libclc_configure_source_list(CLC_GENERIC_SOURCES
math/clc_lgamma_r.cl
math/clc_log.cl
math/clc_log2.cl
+ math/clc_log2_fast.cl
math/clc_log10.cl
math/clc_log1p.cl
math/clc_logb.cl
diff --git a/libclc/clc/lib/generic/math/clc_ep.cl b/libclc/clc/lib/generic/math/clc_ep.cl
index f0e3020f65f6f..2e62563e2a107 100644
--- a/libclc/clc/lib/generic/math/clc_ep.cl
+++ b/libclc/clc/lib/generic/math/clc_ep.cl
@@ -9,8 +9,11 @@
#include "clc/clc_convert.h"
#include "clc/math/clc_div_fast.h"
#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp.h"
#include "clc/math/clc_fma.h"
+#include "clc/math/clc_frexp.h"
#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_mad.h"
#include "clc/math/clc_recip_fast.h"
#include "clc/math/clc_sqrt_fast.h"
#include "clc/relational/clc_isinf.h"
diff --git a/libclc/clc/lib/generic/math/clc_ep.inc b/libclc/clc/lib/generic/math/clc_ep.inc
index 38fa513c46aac..56a7f1a7becb2 100644
--- a/libclc/clc/lib/generic/math/clc_ep.inc
+++ b/libclc/clc/lib/generic/math/clc_ep.inc
@@ -387,5 +387,83 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_sqrt(__CLC_EP_PAIR a) {
a.hi == __CLC_FP_LIT(0.0) ? __CLC_FP_LIT(0.0) : slo);
}
+#if __CLC_FPSIZE == 32
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR x) { // exp of the hi/lo pair x, evaluated as exp(hi) * (1 + lo).
+ float d = x.hi == 0x1.62e430p+6f ? 0x1.0p-17f : 0.0f; // Only for this exact hi value: move 2^-17 from hi to lo. NOTE(review): presumably the overflow boundary of exp - confirm.
+ x.hi -= d;
+ x.lo += d;
+ float z = __clc_exp(x.hi);
+ float zz = __clc_fma(z, x.lo, z); // z * lo + z, i.e. z * (1 + lo).
+ return __clc_isinf(z) ? z : zz; // An infinite exp(hi) is returned as-is, without the lo correction.
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(float a) { // Natural log of a, returned as a hi/lo pair.
+ int a_exp;
+ float m = __clc_frexp(a, &a_exp); // Split a into fraction m and exponent a_exp.
+ int b = m < (2.0f / 3.0f); // 1 when m is below 2/3, otherwise 0.
+ m = __clc_ldexp(m, b); // Doubles m when b is 1.
+ int e = a_exp - b; // Exponent adjusted to compensate for the rescaled m.
+
+ __CLC_EP_PAIR x = __clc_ep_div(m - 1.0f, __clc_ep_fast_add(1.0f, m)); // x = (m - 1) / (1 + m).
+ __CLC_EP_PAIR s = __clc_ep_sqr(x); // s = x^2.
+ float t = s.hi;
+ float p = __clc_mad(t, __clc_mad(t, 0x1.ed89c2p-3f, 0x1.23e988p-2f), // Quadratic in t built from nested mads.
+ 0x1.999bdep-2f);
+
+ // ln(2)*e + 2*x + x^3(c3 + x^2*p)
+ float2 r = __clc_ep_add(
+ __clc_ep_mul(__clc_ep_make_pair(0x1.62e430p-1f, -0x1.05c610p-29f), // ln(2) as a hi/lo pair.
+ (float)e),
+ __clc_ep_fast_add(
+ __clc_ep_ldexp(x, 1), // The 2*x term.
+ __clc_ep_mul(__clc_ep_mul(s, x), // The x^3 factor.
+ __clc_ep_fast_add(
+ __clc_ep_make_pair(0x1.555554p-1f, 0x1.e72020p-29f), // c3 as a hi/lo pair.
+ __clc_ep_mul(s, p)))));
+ return r;
+}
+
+#elif __CLC_FPSIZE == 64
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR x) { // exp of the hi/lo pair x, evaluated as exp(hi) * (1 + lo).
+ __CLC_GENTYPE z = __clc_exp(x.hi);
+ __CLC_GENTYPE zz = __clc_mad(z, x.lo, z); // z * lo + z, i.e. z * (1 + lo).
+ return __clc_isinf(z) ? z : zz; // An infinite exp(hi) is returned as-is, without the lo correction.
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(double a) { // Natural log of a, returned as a hi/lo pair.
+ int a_exp;
+ double m = __clc_frexp(a, &a_exp); // Split a into fraction m and exponent a_exp.
+ int b = m < __CLC_FP_LIT(2.0 / 3.0); // 1 when m is below 2/3, otherwise 0.
+ m = __clc_ldexp(m, b); // Doubles m when b is 1.
+ int e = a_exp - b; // Exponent adjusted to compensate for the rescaled m.
+
+ double2 x = __clc_ep_div(m - 1.0, __clc_ep_fast_add(1.0, m)); // x = (m - 1) / (1 + m).
+ double2 s = __clc_ep_sqr(x); // s = x^2.
+ double t = s.hi;
+ double p = __clc_mad(t, __clc_mad(t, __clc_mad(t, __clc_mad(t, // Degree-8 polynomial in t built from nested mads.
+ __clc_mad(t, __clc_mad(t, __clc_mad(t, __clc_mad(t,
+ 0x1.dee674222de17p-4, 0x1.a6564968915a9p-4), 0x1.e25e43abe935ap-4), 0x1.110ef47e6c9c2p-3),
+ 0x1.3b13bcfa74449p-3), 0x1.745d171bf3c30p-3), 0x1.c71c71c7792cep-3), 0x1.24924924920dap-2),
+ 0x1.999999999999cp-2);
+
+ // ln(2)*e + 2*x + x^3(c3 + x^2*p)
+ double2 r = __clc_ep_add(
+ __clc_ep_mul(
+ __clc_ep_make_pair(0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56), // ln(2) as a hi/lo pair.
+ (double)e),
+ __clc_ep_fast_add(
+ __clc_ep_ldexp(x, 1), // The 2*x term.
+ __clc_ep_mul(
+ __clc_ep_mul(s, x), // The x^3 factor.
+ __clc_ep_fast_add(__clc_ep_make_pair(0x1.5555555555555p-1, // c3 as a hi/lo pair.
+ 0x1.543b0d5df274dp-55),
+ __clc_ep_mul(s, p)))));
+ return r;
+}
+
+#endif
+
#undef __CLC_EP_USE_FMA
#endif
diff --git a/libclc/clc/lib/generic/math/clc_exp2_fast.cl b/libclc/clc/lib/generic/math/clc_exp2_fast.cl
new file mode 100644
index 0000000000000..e09bd65d7e02a
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_exp2_fast.cl
@@ -0,0 +1,15 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+
+#define __CLC_FUNCTION __clc_exp2_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_exp2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_log2_fast.cl b/libclc/clc/lib/generic/math/clc_log2_fast.cl
new file mode 100644
index 0000000000000..2aad63967e888
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_log2_fast.cl
@@ -0,0 +1,15 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+
+#define __CLC_FUNCTION __clc_log2_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_log2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_pow.cl b/libclc/clc/lib/generic/math/clc_pow.cl
index 70d3d614a8d36..14fbfb68359f5 100644
--- a/libclc/clc/lib/generic/math/clc_pow.cl
+++ b/libclc/clc/lib/generic/math/clc_pow.cl
@@ -6,16 +6,35 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/clc_convert.h>
-#include <clc/internal/clc.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
-#include <clc/relational/clc_select.h>
+#include "clc/clc_convert.h"
+#include "clc/float/definitions.h"
+#include "clc/internal/clc.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_log.h"
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_recip_fast.h"
+#include "clc/math/clc_trunc.h"
+#include "clc/math/math.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_isunordered.h"
-#define __CLC_BODY <clc_pow.inc>
-#include <clc/math/gentype.inc>
+#define COMPILING_POW
+#define __CLC_BODY "clc_pow_base.inc"
+#include "clc/math/gentype.inc"
+
+#define __CLC_FUNCTION __clc_pow
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_pow_fast
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_pow.inc b/libclc/clc/lib/generic/math/clc_pow.inc
deleted file mode 100644
index 35cbcdae8ffff..0000000000000
--- a/libclc/clc/lib/generic/math/clc_pow.inc
+++ /dev/null
@@ -1,438 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Computes pow using log and exp
-//
-// x^y = exp(y * log(x))
-//
-// We take care not to lose precision in the intermediate steps
-//
-// When computing log, calculate it in splits:
-//
-// r = f * (p_invead + p_inv_tail)
-// r = rh + rt
-//
-// Calculate log polynomial using r, in end addition, do:
-//
-// poly = poly + ((rh-r) + rt)
-//
-// lth = -r
-// ltt = ((xexp * log2_t) - poly) + logT
-// lt = lth + ltt
-//
-// lh = (xexp * log2_h) + logH
-// l = lh + lt
-//
-// Calculate final log answer as gh and gt:
-//
-// gh = l & higher-half bits
-// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
-//
-// yh = y & higher-half bits
-// yt = y - yh
-//
-// Before entering computation of exp:
-//
-// vs = ((yt*gt + yt*gh) + yh*gt)
-// v = vs + yh*gh
-// vt = ((yh*gh - v) + vs)
-//
-// In calculation of exp, add vt to r that is used for poly.
-//
-// At the end of exp, do
-//
-// ((((expT * poly) + expT) + expH*poly) + expH)
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
- __CLC_GENTYPE y) {
- __CLC_GENTYPE absx = __clc_fabs(x);
- __CLC_INTN ix = __CLC_AS_INTN(x);
- __CLC_INTN ax = __CLC_AS_INTN(absx);
- __CLC_INTN xpos = ix == ax;
-
- __CLC_INTN iy = __CLC_AS_INTN(y);
- __CLC_INTN ay = __CLC_AS_INTN(__clc_fabs(y));
- __CLC_INTN ypos = iy == ay;
-
- /* Extra precise log calculation
- * First handle case that x is close to 1
- */
- __CLC_GENTYPE r = 1.0f - absx;
- __CLC_INTN near1 = __clc_fabs(r) < 0x1.0p-4f;
- __CLC_GENTYPE r2 = r * r;
-
- /* Coefficients are just 1/3, 1/4, 1/5 and 1/6 */
- __CLC_GENTYPE poly = __clc_mad(
- r,
- __clc_mad(r,
- __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
- 0x1.99999ap-3f),
- 0x1.000000p-2f),
- 0x1.555556p-2f);
-
- poly *= r2 * r;
-
- __CLC_GENTYPE lth_near1 = -r2 * 0.5f;
- __CLC_GENTYPE ltt_near1 = -poly;
- __CLC_GENTYPE lt_near1 = lth_near1 + ltt_near1;
- __CLC_GENTYPE lh_near1 = -r;
- __CLC_GENTYPE l_near1 = lh_near1 + lt_near1;
-
- /* Computations for x not near 1 */
- __CLC_INTN m = __CLC_CONVERT_INTN(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
- __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m);
- __CLC_INTN ixs = __CLC_AS_INTN(__CLC_AS_GENTYPE(ax | 0x3f800000) - 1.0f);
- __CLC_GENTYPE mfs = __CLC_CONVERT_GENTYPE((ixs >> EXPSHIFTBITS_SP32) - 253);
- __CLC_INTN c = m == -127;
- __CLC_INTN ixn = c ? ixs : ax;
- __CLC_GENTYPE mfn = c ? mfs : mf;
-
- __CLC_INTN indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
- /* F - Y */
- __CLC_GENTYPE f = __CLC_AS_GENTYPE(0x3f000000 | indx) -
- __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32));
-
- indx = indx >> 16;
- __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx);
- __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx);
- r = rh + rt;
-
- poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
- (r * r);
- poly += (rh - r) + rt;
-
- const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; /* 0.693115234 */
- const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; /* 0.0000319461833 */
- __CLC_GENTYPE logel = __CLC_USE_TABLE(loge_tbl_lo, indx);
- __CLC_GENTYPE logeh = __CLC_USE_TABLE(loge_tbl_hi, indx);
- __CLC_GENTYPE lth = -r;
- __CLC_GENTYPE ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + logeh;
- __CLC_GENTYPE lt = lth + ltt;
- __CLC_GENTYPE lh = __clc_mad(mfn, LOG2_HEAD, logel);
- __CLC_GENTYPE l = lh + lt;
-
- /* Select near 1 or not */
- lth = near1 ? lth_near1 : lth;
- ltt = near1 ? ltt_near1 : ltt;
- lt = near1 ? lt_near1 : lt;
- lh = near1 ? lh_near1 : lh;
- l = near1 ? l_near1 : l;
-
- __CLC_GENTYPE gh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(l) & 0xfffff000);
- __CLC_GENTYPE gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
- __CLC_GENTYPE yh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(iy) & 0xfffff000);
-
- __CLC_GENTYPE yt = y - yh;
-
- __CLC_GENTYPE ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
- __CLC_GENTYPE ylogx = __clc_mad(yh, gh, ylogx_s);
- __CLC_GENTYPE ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
-
- /* Extra precise exp of ylogx */
- /* 64/log2 : 92.332482616893657 */
- const __CLC_GENTYPE R_64_BY_LOG2 = 0x1.715476p+6f;
- __CLC_INTN n = __CLC_CONVERT_INTN(ylogx * R_64_BY_LOG2);
- __CLC_GENTYPE nf = __CLC_CONVERT_GENTYPE(n);
-
- __CLC_INTN j = n & 0x3f;
- m = n >> 6;
- __CLC_INTN m2 = m << EXPSHIFTBITS_SP32;
-
- /* log2/64 lead: 0.0108032227 */
- const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f;
- /* log2/64 tail: 0.0000272020388 */
- const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
- r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
- ylogx_t;
-
- /* Truncated Taylor series for e^r */
- poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
- 0x1.000000p-1f),
- r * r, r);
-
- __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j);
- __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j);
-
- __CLC_GENTYPE expylogx =
- __clc_mad(exp_head, poly, __clc_mad(exp_tail, poly, exp_tail)) + exp_head;
- __CLC_GENTYPE sexpylogx =
- expylogx * __CLC_AS_GENTYPE((__CLC_UINTN)0x1 << (m + 149));
- __CLC_GENTYPE texpylogx = __CLC_AS_GENTYPE(__CLC_AS_INTN(expylogx) + m2);
- expylogx = m < -125 ? sexpylogx : texpylogx;
-
- /* Result is +-Inf if (ylogx + ylogx_t) > 128*log2 */
- expylogx =
- __clc_select(expylogx, __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32),
- ylogx > 0x1.62e430p+6f ||
- (ylogx == 0x1.62e430p+6f && ylogx_t > -0x1.05c610p-22f));
-
- /* Result is 0 if ylogx < -149*log2 */
- expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
- /* Classify y:
- * inty = 0 means not an integer.
- * inty = 1 means odd integer.
- * inty = 2 means even integer.
- */
-
- __CLC_INTN yexp =
- __CLC_CONVERT_INTN(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
- __CLC_INTN mask = ((__CLC_INTN)1 << (24 - yexp)) - 1;
- __CLC_INTN yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
- __CLC_INTN inty = yodd ? 1 : 2;
- inty = (iy & mask) != 0 ? 0 : inty;
- inty = yexp < 1 ? 0 : inty;
- inty = yexp > 24 ? 2 : inty;
-
- __CLC_GENTYPE signval =
- __CLC_AS_GENTYPE((__CLC_AS_UINTN(expylogx) ^ SIGNBIT_SP32));
- expylogx = ((inty == 1) && !xpos) ? signval : expylogx;
- __CLC_INTN ret = __CLC_AS_INTN(expylogx);
-
- /* Corner case handling */
- ret = (!xpos && (inty == 0)) ? QNANBITPATT_SP32 : ret;
- ret = ax < 0x3f800000 && iy == (__CLC_INTN)NINFBITPATT_SP32 ? PINFBITPATT_SP32
- : ret;
- ret = ax > 0x3f800000 && iy == (__CLC_INTN)NINFBITPATT_SP32 ? 0 : ret;
- ret = ax < 0x3f800000 && iy == (__CLC_INTN)PINFBITPATT_SP32 ? 0 : ret;
- ret = ax > 0x3f800000 && iy == (__CLC_INTN)PINFBITPATT_SP32 ? PINFBITPATT_SP32
- : ret;
- __CLC_BIT_INTN x_is_ninf = ix == (__CLC_INTN)NINFBITPATT_SP32;
- __CLC_BIT_INTN x_is_pinf = ix == (__CLC_INTN)PINFBITPATT_SP32;
- __CLC_INTN xinf =
- xpos ? (__CLC_INTN)PINFBITPATT_SP32 : (__CLC_INTN)NINFBITPATT_SP32;
-
- ret = ((ax == 0) && !ypos && (inty == 1)) ? xinf : ret;
- ret = ((ax == 0) && !ypos && (inty != 1)) ? PINFBITPATT_SP32 : ret;
- __CLC_INTN xzero = xpos ? (__CLC_INTN)0 : (__CLC_INTN)0x80000000;
- ret = ((ax == 0) && ypos && (inty == 1)) ? xzero : ret;
- ret = ((ax == 0) && ypos && (inty != 1)) ? 0 : ret;
- ret = ((ax == 0) && (iy == (__CLC_INTN)NINFBITPATT_SP32)) ? PINFBITPATT_SP32
- : ret;
- ret = (ix == (__CLC_INTN)0xbf800000 && ay == PINFBITPATT_SP32) ? 0x3f800000
- : ret;
- ret = (x_is_ninf && !ypos && (inty == 1)) ? (__CLC_INTN)0x80000000 : ret;
- ret = (x_is_ninf && !ypos && (inty != 1)) ? 0 : ret;
- ret = (x_is_ninf && ypos && (inty == 1)) ? (__CLC_INTN)NINFBITPATT_SP32 : ret;
- ret = (x_is_ninf && ypos && (inty != 1)) ? (__CLC_INTN)PINFBITPATT_SP32 : ret;
- ret = (x_is_pinf && !ypos) ? 0 : ret;
- ret = (x_is_pinf && ypos) ? PINFBITPATT_SP32 : ret;
- ret = (ax > PINFBITPATT_SP32) ? ix : ret;
- ret = (ay > PINFBITPATT_SP32) ? iy : ret;
- ret = ay == 0 ? 0x3f800000 : ret;
- ret = ix == 0x3f800000 ? 0x3f800000 : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
- __CLC_GENTYPE y) {
- const __CLC_GENTYPE real_log2_tail = 5.76999904754328540596e-08;
- const __CLC_GENTYPE real_log2_lead = 6.93147122859954833984e-01;
-
- __CLC_LONGN ux = __CLC_AS_LONGN(x);
- __CLC_LONGN ax = __CLC_AS_LONGN(__clc_fabs(x));
- __CLC_BIT_INTN xpos = ax == ux;
-
- __CLC_LONGN uy = __CLC_AS_LONGN(y);
- __CLC_LONGN ay = __CLC_AS_LONGN(__clc_fabs(y));
- __CLC_BIT_INTN ypos = ay == uy;
-
- // Extended precision log
- __CLC_GENTYPE v, vt;
- {
- __CLC_INTN exp = __CLC_CONVERT_INTN(ax >> 52) - 1023;
- __CLC_INTN mask_exp_1023 = exp == (__CLC_INTN)-1023;
- __CLC_GENTYPE xexp = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
- __CLC_LONGN temp_ux =
- __CLC_AS_LONGN(__CLC_AS_GENTYPE(0x3ff0000000000000L | mantissa) - 1.0);
- exp = __CLC_CONVERT_INTN((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
- __CLC_GENTYPE xexp1 = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
- xexp = __CLC_CONVERT_LONGN(mask_exp_1023) ? xexp1 : xexp;
- mantissa = __CLC_CONVERT_LONGN(mask_exp_1023) ? mantissa1 : mantissa;
-
- __CLC_LONGN rax = (mantissa & 0x000ff00000000000) +
- ((mantissa & 0x0000080000000000) << 1);
- __CLC_INTN index = __CLC_CONVERT_INTN(rax >> 44);
-
- __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L);
- __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L);
- __CLC_GENTYPE f = F - Y;
- __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index);
- __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index);
- __CLC_GENTYPE f_inv = (log_h + log_t) * f;
- __CLC_GENTYPE r1 =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L);
- __CLC_GENTYPE r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
- __CLC_GENTYPE r = r1 + r2;
-
- __CLC_GENTYPE poly = __clc_fma(
- r,
- __clc_fma(r,
- __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
- 1.0 / 4.0),
- 1.0 / 3.0);
- poly = poly * r * r * r;
-
- __CLC_GENTYPE hr1r1 = 0.5 * r1 * r1;
- __CLC_GENTYPE poly0h = r1 + hr1r1;
- __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1;
- poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
-
- log_h = __CLC_USE_TABLE(powlog_tbl_head, index);
- log_t = __CLC_USE_TABLE(powlog_tbl_tail, index);
-
- __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
- __CLC_GENTYPE resT = resT_t - poly0h;
- __CLC_GENTYPE resH = __clc_fma(xexp, real_log2_lead, log_h);
- __CLC_GENTYPE resT_h = poly0h;
-
- __CLC_GENTYPE H = resT + resH;
- __CLC_GENTYPE H_h =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(H) & 0xfffffffff8000000L);
- __CLC_GENTYPE T =
- (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
- H = H_h;
-
- __CLC_GENTYPE y_head =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(uy) & 0xfffffffff8000000L);
- __CLC_GENTYPE y_tail = y - y_head;
-
- __CLC_GENTYPE temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
- v = __clc_fma(y_head, H, temp);
- vt = __clc_fma(y_head, H, -v) + temp;
- }
-
- // Now calculate exp of (v,vt)
-
- __CLC_GENTYPE expv;
- {
- const __CLC_GENTYPE max_exp_arg = 709.782712893384;
- const __CLC_GENTYPE min_exp_arg = -745.1332191019411;
- const __CLC_GENTYPE sixtyfour_by_lnof2 = 92.33248261689366;
- const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081;
- const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10;
-
- // If v is so large that we need to return INFINITY, or so small that we
- // need to return 0, set v to known values that will produce that result. Do
- // not try to continue the computation with the original v and patch it up
- // afterwards because v may be so large that temp is out of range of int, in
- // which case that conversion, and a value based on that conversion being
- // passed to __clc_ldexp, results in undefined behavior.
- v = v > max_exp_arg ? 1000.0 : v;
- v = v < min_exp_arg ? -1000.0 : v;
-
- __CLC_GENTYPE temp = v * sixtyfour_by_lnof2;
- __CLC_INTN n = __CLC_CONVERT_INTN(temp);
- __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n);
- __CLC_INTN j = n & 0x0000003f;
- __CLC_INTN m = n >> 6;
-
- __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j);
- __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j);
- __CLC_GENTYPE f = f1 + f2;
-
- __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v);
- __CLC_GENTYPE r2 = dn * lnof2_by_64_tail;
- __CLC_GENTYPE r = (r1 + r2) + vt;
-
- __CLC_GENTYPE q =
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r, 1.38889490863777199667e-03,
- 8.33336798434219616221e-03),
- 4.16666666662260795726e-02),
- 1.66666666665260878863e-01),
- 5.00000000000000008883e-01);
- q = __clc_fma(r * r, q, r);
-
- expv = __clc_fma(f, q, f2) + f1;
- expv = __clc_ldexp(expv, m);
- }
-
- // See whether y is an integer.
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_LONGN inty;
- {
- __CLC_INTN yexp =
- __CLC_CONVERT_INTN(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
- inty = __CLC_CONVERT_LONGN(yexp < 1 ? 0 : 2);
- inty = __CLC_CONVERT_LONGN(yexp > 53) ? 2 : inty;
- __CLC_LONGN mask = ((__CLC_LONGN)1L << (53 - yexp)) - 1L;
- __CLC_LONGN inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1L : 2L;
- inty1 = (ay & mask) != 0 ? 0 : inty1;
- inty = __CLC_CONVERT_LONGN(!(yexp < 1) && !(yexp > 53)) ? inty1 : inty;
- }
-
- expv *= (inty == 1) && !xpos ? -1.0 : 1.0;
-
- __CLC_LONGN ret = __CLC_AS_LONGN(expv);
-
- // Now all the edge cases
- __CLC_BIT_INTN x_is_ninf = ux == (__CLC_LONGN)NINFBITPATT_DP64;
- __CLC_BIT_INTN x_is_pinf = ux == (__CLC_LONGN)PINFBITPATT_DP64;
- __CLC_BIT_INTN y_is_ninf = uy == (__CLC_LONGN)NINFBITPATT_DP64;
- __CLC_BIT_INTN y_is_pinf = uy == (__CLC_LONGN)PINFBITPATT_DP64;
- ret = !xpos && (inty == 0) ? QNANBITPATT_DP64 : ret;
- ret = ax < 0x3ff0000000000000L && y_is_ninf ? PINFBITPATT_DP64 : ret;
- ret = ax > 0x3ff0000000000000L && y_is_ninf ? 0L : ret;
- ret = ax < 0x3ff0000000000000L && y_is_pinf ? 0L : ret;
- ret = ax > 0x3ff0000000000000L && y_is_pinf ? PINFBITPATT_DP64 : ret;
- __CLC_LONGN xinf =
- xpos ? (__CLC_LONGN)PINFBITPATT_DP64 : (__CLC_LONGN)NINFBITPATT_DP64;
- ret = ((ax == 0L) && !ypos && (inty == 1)) ? xinf : ret;
- ret = ((ax == 0L) && !ypos && (inty != 1)) ? PINFBITPATT_DP64 : ret;
- __CLC_LONGN xzero = xpos ? (__CLC_LONGN)0L : (__CLC_LONGN)0x8000000000000000L;
- ret = ((ax == 0L) && ypos && (inty == 1)) ? xzero : ret;
- ret = ((ax == 0L) && ypos && (inty != 1)) ? 0L : ret;
- ret = ((ax == 0L) && y_is_ninf) ? PINFBITPATT_DP64 : ret;
- ret = ((ux == (__CLC_LONGN)0xbff0000000000000L) && (ay == PINFBITPATT_DP64))
- ? 0x3ff0000000000000L
- : ret;
- ret = (x_is_ninf && !ypos && (inty == 1)) ? (__CLC_LONGN)0x8000000000000000L
- : ret;
- ret = (x_is_ninf && !ypos && (inty != 1)) ? 0L : ret;
- ret =
- (x_is_ninf && ypos && (inty == 1)) ? (__CLC_LONGN)NINFBITPATT_DP64 : ret;
- ret =
- (x_is_ninf && ypos && (inty != 1)) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
- ret = x_is_pinf && !ypos ? 0L : ret;
- ret = x_is_pinf && ypos ? PINFBITPATT_DP64 : ret;
- ret = ax > PINFBITPATT_DP64 ? ux : ret;
- ret = ay > PINFBITPATT_DP64 ? uy : ret;
- ret = ay == 0L ? 0x3ff0000000000000L : ret;
- ret = ux == 0x3ff0000000000000L ? 0x3ff0000000000000L : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
- __CLC_GENTYPE y) {
- return __CLC_CONVERT_GENTYPE(
- __clc_pow(__CLC_CONVERT_FLOATN(x), __CLC_CONVERT_FLOATN(y)));
-}
-
-#endif
diff --git a/libclc/clc/lib/generic/math/clc_pow_base.inc b/libclc/clc/lib/generic/math/clc_pow_base.inc
new file mode 100644
index 0000000000000..56fdb177a844b
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_pow_base.inc
@@ -0,0 +1,542 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Computes pow using log and exp
+//
+// x^y = exp(y * log(x))
+//
+// We take care not to lose precision in the intermediate steps.
+//
+// When computing log, calculate it in splits:
+//
+// r = f * (p_inv_head + p_inv_tail)
+// r = rh + rt
+//
+// Calculate log polynomial using r, in end addition, do:
+//
+// poly = poly + ((rh-r) + rt)
+//
+// lth = -r
+// ltt = ((xexp * log2_t) - poly) + logT
+// lt = lth + ltt
+//
+// lh = (xexp * log2_h) + logH
+// l = lh + lt
+//
+// Calculate final log answer as gh and gt:
+//
+// gh = l & higher-half bits
+// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
+//
+// yh = y & higher-half bits
+// yt = y - yh
+//
+// Before entering computation of exp:
+//
+// vs = ((yt*gt + yt*gh) + yh*gt)
+// v = vs + yh*gh
+// vt = ((yh*gh - v) + vs)
+//
+// In calculation of exp, add vt to r that is used for poly.
+//
+// At the end of exp, do:
+//
+// ((((expT * poly) + expT) + expH*poly) + expH)
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+#ifdef COMPILING_POW
+
+_CLC_OVERLOAD _CLC_CONST static bool is_integer(__CLC_GENTYPE ay) {
+ return __clc_trunc(ay) == ay;
+}
+
+_CLC_OVERLOAD _CLC_CONST static bool is_even_integer(__CLC_GENTYPE ay) {
+ // Even integers are still integers after division by 2.
+ return is_integer(__CLC_FP_LIT(0.5) * ay);
+}
+
+_CLC_OVERLOAD _CLC_CONST static bool is_odd_integer(__CLC_GENTYPE ay) {
+ return is_integer(ay) && !is_even_integer(ay);
+}
+#endif
+
+#if __CLC_FPSIZE == 32
+
+_CLC_CONST
+static __CLC_GENTYPE fast_expylnx(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ return __clc_exp2(y * __clc_log2(ax));
+}
+
+#if defined(COMPILING_POW) || defined(COMPILING_POWR)
+
+_CLC_CONST
+static __CLC_GENTYPE compute_expylnx_float(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ return __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+}
+#endif
+
+#if defined(COMPILING_POW)
+
+_CLC_CONST
+static __CLC_GENTYPE pow_fixup(__CLC_GENTYPE x, __CLC_GENTYPE y,
+ __CLC_GENTYPE expylnx) {
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ bool is_odd_y = is_odd_integer(y);
+
+ __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0f);
+
+ // Edge cases, applied in order so that later checks override earlier ones.
+ if (x < 0.0f && !is_integer(y))
+ ret = __CLC_GENTYPE_NAN;
+
+ __CLC_GENTYPE ay = __clc_fabs(y);
+ if (__clc_isinf(ay)) {
+ // FIXME: Missing backend optimization to save on
+ // materialization cost of mixed sign constant infinities.
+ bool y_is_neg_inf = y != ay;
+ ret = ax == 1.0f ? ax : ((ax < 1.0f) ^ y_is_neg_inf ? 0.0f : ay);
+ }
+
+ if (__clc_isinf(ax) || x == 0.0f)
+ ret = __clc_copysign((x == 0.0f) ^ (y < 0.0f) ? 0.0f : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0f);
+
+ if (__clc_isunordered(x, y))
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
+ __CLC_GENTYPE y) {
+ if (x == 1.0f)
+ y = 1.0f;
+ if (y == 0.0f)
+ x = 1.0f;
+
+ __CLC_GENTYPE expylnx = compute_expylnx_float(x, y);
+ return pow_fixup(x, y, expylnx);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_pow_fast(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+ if (x == 1.0f)
+ y = 1.0f;
+ if (y == 0.0f)
+ x = 1.0f;
+
+ __CLC_GENTYPE expylnx = fast_expylnx(x, y);
+ return pow_fixup(x, y, expylnx);
+}
+
+#elif defined(COMPILING_POWR)
+
+_CLC_CONST
+static __CLC_GENTYPE powr_fixup(__CLC_GENTYPE x, __CLC_GENTYPE y,
+ __CLC_GENTYPE expylnx) {
+ __CLC_GENTYPE ret = expylnx;
+
+ // Edge cases; iz is (y < 0 ? inf : 0) and zi is the opposite pairing.
+ __CLC_GENTYPE iz = y < 0.0f ? __CLC_GENTYPE_INF : 0.0f;
+ __CLC_GENTYPE zi = y < 0.0f ? 0.0f : __CLC_GENTYPE_INF;
+
+ if (x == 0.0f)
+ ret = y == 0.0f ? __CLC_GENTYPE_NAN : iz;
+
+ if (x == __CLC_GENTYPE_INF && y != 0.0f)
+ ret = zi;
+
+ if (__clc_isinf(y) && x != 1.0f)
+ ret = x < 1.0f ? iz : zi;
+
+ if (__clc_isunordered(x, y))
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
+ __CLC_GENTYPE y) {
+ if (x < 0.0f)
+ x = __CLC_GENTYPE_NAN;
+
+ __CLC_GENTYPE expylnx = compute_expylnx_float(x, y);
+ return powr_fixup(x, y, expylnx);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_powr_fast(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+ if (x < 0.0f)
+ x = __CLC_GENTYPE_NAN;
+
+ __CLC_GENTYPE expylnx = fast_expylnx(x, y);
+ return powr_fixup(x, y, expylnx);
+}
+
+#elif defined(COMPILING_POWN)
+
+_CLC_CONST
+static __CLC_GENTYPE compute_expylnx_int(__CLC_GENTYPE x, __CLC_INTN ny) {
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_INTN nyh = ny & 0xffff0000;
+ __CLC_EP_PAIR y = __clc_ep_fast_add(__CLC_CONVERT_GENTYPE(nyh),
+ __CLC_CONVERT_GENTYPE(ny - nyh));
+ return __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+}
+
+_CLC_CONST
+static __CLC_GENTYPE pown_fixup(__CLC_GENTYPE x, __CLC_INTN ny,
+ __CLC_GENTYPE expylnx) {
+ bool is_odd_y = ny & 1;
+
+ __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0f);
+
+ // Zero or infinite x: the result takes the sign of x only for odd ny.
+ if (__clc_isinf(x) || x == 0.0f)
+ ret = __clc_copysign((x == 0.0f) ^ (ny < 0) ? 0.0f : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0f);
+ return ret;
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ if (ny == 0)
+ x = 1.0f;
+
+ __CLC_GENTYPE expylnx = compute_expylnx_int(x, ny);
+ return pown_fixup(x, ny, expylnx);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown_fast(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ if (ny == 0)
+ x = 1.0f;
+
+ __CLC_GENTYPE expylnx = fast_expylnx(x, __CLC_CONVERT_GENTYPE(ny));
+ return pown_fixup(x, ny, expylnx);
+}
+
+#elif defined(COMPILING_ROOTN)
+
+// root version of compute_expylnx_int
+_CLC_CONST
+static __CLC_GENTYPE compute_exp_inverse_y_lnx_int(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_INTN nyh = ny & 0xffff0000;
+ __CLC_EP_PAIR y = __clc_ep_fast_add(__CLC_CONVERT_GENTYPE(nyh),
+ __CLC_CONVERT_GENTYPE(ny - nyh));
+ y = __clc_ep_recip(y);
+ return __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+}
+
+_CLC_CONST
+static __CLC_GENTYPE rootn_fixup(__CLC_GENTYPE x, __CLC_INTN ny,
+ __CLC_GENTYPE expylnx) {
+ bool is_odd_y = ny & 1;
+
+ __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0f);
+
+ // Zero or infinite x first; the NaN check below overrides it when it fires.
+ if (__clc_isinf(x) || x == 0.0f)
+ ret = __clc_copysign((x == 0.0f) ^ (ny < 0) ? 0.0f : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0f);
+
+ if ((x < 0.0f && !is_odd_y) || ny == 0)
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+// Float rootn: compute_exp_inverse_y_lnx_int (non-fast path) + rootn_fixup.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ __CLC_GENTYPE expylnx = compute_exp_inverse_y_lnx_int(x, ny);
+ return rootn_fixup(x, ny, expylnx);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_rootn_fast(__CLC_GENTYPE x, __CLC_INTN ny) {
+ __CLC_GENTYPE y = __clc_recip_fast(__CLC_CONVERT_GENTYPE(ny));
+ __CLC_GENTYPE expylnx = fast_expylnx(x, y);
+ return rootn_fixup(x, ny, expylnx);
+}
+
+#else
+#error missing function macro
+#endif
+
+#elif __CLC_FPSIZE == 64
+
+#if defined(COMPILING_POW)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
+ __CLC_GENTYPE y) {
+ if (x == 1.0)
+ y = 1.0;
+ if (y == 0.0)
+ x = 1.0;
+
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_GENTYPE expylnx =
+ __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+
+ bool is_odd_y = is_odd_integer(y);
+
+ __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0);
+
+ // Now all the edge cases
+ if (x < 0.0 && !is_integer(y))
+ ret = __CLC_GENTYPE_NAN;
+
+ __CLC_GENTYPE ay = __clc_fabs(y);
+ if (__clc_isinf(ay)) {
+ // FIXME: Missing backend optimization to save on
+ // materialization cost of mixed sign constant infinities.
+ bool y_is_neg_inf = y != ay;
+ ret = ax == 1.0 ? ax : ((ax < 1.0) ^ y_is_neg_inf ? 0.0 : ay);
+ }
+
+ if (__clc_isinf(ax) || x == 0.0)
+ ret = __clc_copysign((x == 0.0) ^ (y < 0.0) ? 0.0 : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0);
+
+ if (__clc_isunordered(x, y))
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+#elif defined(COMPILING_POWR)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
+ __CLC_GENTYPE y) {
+ if (x < 0.0)
+ x = __CLC_GENTYPE_NAN;
+
+ __CLC_GENTYPE ret = __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(x)));
+
+ // Now all the edge cases
+ __CLC_GENTYPE iz = y < 0.0 ? __CLC_GENTYPE_INF : 0.0;
+ __CLC_GENTYPE zi = y < 0.0 ? 0.0 : __CLC_GENTYPE_INF;
+
+ if (x == 0.0)
+ ret = y == 0.0 ? __CLC_GENTYPE_NAN : iz;
+
+ if (x == __CLC_GENTYPE_INF && y != 0.0)
+ ret = zi;
+
+ if (__clc_isinf(y) && x != 1.0)
+ ret = x < 1.0 ? iz : zi;
+
+ if (y == 0.0)
+ ret = x == 0.0 || __clc_isinf(x) ? __CLC_GENTYPE_NAN : 1.0;
+
+ if (__clc_isunordered(x, y))
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+#elif defined(COMPILING_POWN)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ if (ny == 0)
+ x = 1.0;
+
+ __CLC_GENTYPE y = __CLC_CONVERT_GENTYPE(ny);
+
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_GENTYPE expylnx =
+ __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+
+ bool is_odd_y = ny & 1;
+
+ __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0);
+
+ // Now all the edge cases
+ if (__clc_isinf(ax) || x == 0.0)
+ ret = __clc_copysign((x == 0.0) ^ (ny < 0) ? 0.0 : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0);
+
+ return ret;
+}
+
+#elif defined(COMPILING_ROOTN)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ __CLC_EP_PAIR y = __clc_ep_recip(__CLC_CONVERT_GENTYPE(ny));
+
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_GENTYPE expylnx =
+ __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+
+ bool is_odd_y = ny & 1;
+
+ __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0);
+
+ // Now all the edge cases
+ if (__clc_isinf(ax) || x == 0.0)
+ ret = __clc_copysign((x == 0.0) ^ (ny < 0) ? 0.0 : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0);
+
+ if ((x < 0.0 && !is_odd_y) || ny == 0)
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+#else
+#error missing function macro
+#endif
+
+#elif __CLC_FPSIZE == 16
+
+#if defined(COMPILING_POW) || defined(COMPILING_POWR)
+
+_CLC_CONST
+static __CLC_GENTYPE compute_expylnx_f16(__CLC_GENTYPE ax, __CLC_GENTYPE y) {
+ __CLC_FLOATN x_float = __CLC_CONVERT_FLOATN(ax);
+ __CLC_FLOATN y_float = __CLC_CONVERT_FLOATN(y);
+ __CLC_FLOATN result = __clc_exp2_fast(y_float * __clc_log2_fast(x_float));
+ return __CLC_CONVERT_GENTYPE(result);
+}
+
+#endif // defined(COMPILING_POW) || defined(COMPILING_POWR)
+
+#if defined(COMPILING_POW)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
+ __CLC_GENTYPE y) {
+ if (x == 1.0h)
+ y = 1.0h;
+ if (y == 0.0h)
+ x = 1.0h;
+
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_GENTYPE p = compute_expylnx_f16(ax, y);
+
+ bool is_odd_y = is_odd_integer(y);
+ __CLC_GENTYPE ret = __clc_copysign(p, is_odd_y ? x : 1.0h);
+
+ // Now all the edge cases
+ if (x < 0.0h && !is_integer(y))
+ ret = __CLC_GENTYPE_NAN;
+
+ __CLC_GENTYPE ay = __clc_fabs(y);
+ if (__clc_isinf(ay)) {
+ // FIXME: Missing backend optimization to save on
+ // materialization cost of mixed sign constant infinities.
+ bool y_is_neg_inf = y != ay;
+ ret = ax == 1.0h ? ax : ((ax < 1.0h) ^ y_is_neg_inf ? 0.0h : ay);
+ }
+
+ if (__clc_isinf(ax) || x == 0.0h) {
+ ret = __clc_copysign((x == 0.0h) ^ (y < 0.0h) ? 0.0h : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0h);
+ }
+
+ if (__clc_isunordered(x, y))
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+#elif defined(COMPILING_POWR)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
+ __CLC_GENTYPE y) {
+ if (x < 0.0h)
+ x = __CLC_GENTYPE_NAN;
+
+ __CLC_GENTYPE ret = compute_expylnx_f16(x, y);
+
+ // Now all the edge cases
+ __CLC_GENTYPE iz = y < 0.0h ? __CLC_GENTYPE_INF : 0.0h;
+ __CLC_GENTYPE zi = y < 0.0h ? 0.0h : __CLC_GENTYPE_INF;
+
+ if (x == 0.0h)
+ ret = y == 0.0h ? __CLC_GENTYPE_NAN : iz;
+
+ if (x == __CLC_GENTYPE_INF && y != 0.0h)
+ ret = zi;
+
+ if (__clc_isinf(y) && x != 1.0h)
+ ret = x < 1.0h ? iz : zi;
+
+ if (__clc_isunordered(x, y))
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+#elif defined(COMPILING_POWN)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ if (ny == 0)
+ x = 1.0h;
+
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_FLOATN fy = __CLC_CONVERT_FLOATN(ny);
+ __CLC_FLOATN p =
+ __clc_exp2_fast(fy * __clc_log2_fast(__CLC_CONVERT_FLOATN(ax)));
+
+ bool is_odd_y = ny & 1;
+
+ __CLC_GENTYPE ret =
+ __clc_copysign(__CLC_CONVERT_GENTYPE(p), is_odd_y ? x : 1.0h);
+
+ // Now all the edge cases
+ if (__clc_isinf(ax) || x == 0.0h)
+ ret = __clc_copysign((x == 0.0h) ^ (ny < 0) ? 0.0h : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0h);
+
+ return ret;
+}
+
+#elif defined(COMPILING_ROOTN)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ __CLC_GENTYPE ax = __clc_fabs(x);
+
+ __CLC_FLOATN fy = __clc_recip_fast(__CLC_CONVERT_FLOATN(ny));
+
+ __CLC_FLOATN p =
+ __clc_exp2_fast(fy * __clc_log2_fast(__CLC_CONVERT_FLOATN(ax)));
+
+ bool is_odd_y = ny & 1;
+
+ __CLC_GENTYPE ret =
+ __clc_copysign(__CLC_CONVERT_GENTYPE(p), is_odd_y ? x : 1.0h);
+
+ // Now all the edge cases
+ if (__clc_isinf(ax) || x == 0.0h)
+ ret = __clc_copysign((x == 0.0h) ^ (ny < 0) ? 0.0h : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0h);
+
+ if ((x < 0.0h && !is_odd_y) || ny == 0)
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+#else
+#error missing function macro
+#endif
+
+#endif
+#endif // __CLC_SCALAR
diff --git a/libclc/clc/lib/generic/math/clc_pown.cl b/libclc/clc/lib/generic/math/clc_pown.cl
index 5aa9560174b99..c21738de8cf2f 100644
--- a/libclc/clc/lib/generic/math/clc_pown.cl
+++ b/libclc/clc/lib/generic/math/clc_pown.cl
@@ -6,16 +6,34 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/clc_convert.h>
-#include <clc/internal/clc.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
-#include <clc/relational/clc_select.h>
+#include "clc/clc_convert.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_pown.h"
+#include "clc/math/clc_trunc.h"
+#include "clc/relational/clc_isinf.h"
-#define __CLC_BODY <clc_pown.inc>
-#include <clc/math/gentype.inc>
+#define COMPILING_POWN
+#define __CLC_BODY "clc_pow_base.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_FUNCTION __clc_pown
+#define __CLC_BODY \
+ "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_pown_fast
+#define __CLC_BODY \
+ "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
diff --git a/libclc/clc/lib/generic/math/clc_pown.inc b/libclc/clc/lib/generic/math/clc_pown.inc
deleted file mode 100644
index 1a681b5e4b397..0000000000000
--- a/libclc/clc/lib/generic/math/clc_pown.inc
+++ /dev/null
@@ -1,402 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Computes pow using log and exp
-//
-// x^y = exp(y * log(x))
-//
-// We take care not to lose precision in the intermediate steps.
-//
-// When computing log, calculate it in splits:
-//
-// r = f * (p_invead + p_inv_tail)
-// r = rh + rt
-//
-// Calculate log polynomial using r, in end addition, do:
-//
-// poly = poly + ((rh-r) + rt)
-//
-// lth = -r
-// ltt = ((xexp * log2_t) - poly) + logT
-// lt = lth + ltt
-//
-// lh = (xexp * log2_h) + logH
-// l = lh + lt
-//
-// Calculate final log answer as gh and gt:
-//
-// gh = l & higher-half bits
-// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
-//
-// yh = y & higher-half bits
-// yt = y - yh
-//
-// Before entering computation of exp:
-//
-// vs = ((yt*gt + yt*gh) + yh*gt)
-// v = vs + yh*gh
-// vt = ((yh*gh - v) + vs)
-//
-// In calculation of exp, add vt to r that is used for poly.
-//
-// At the end of exp, do:
-//
-// ((((expT * poly) + expT) + expH*poly) + expH)
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
- __CLC_INTN ny) {
- __CLC_GENTYPE y = __CLC_CONVERT_GENTYPE(ny);
-
- __CLC_GENTYPE absx = __clc_fabs(x);
- __CLC_INTN ix = __CLC_AS_INTN(x);
- __CLC_INTN ax = __CLC_AS_INTN(absx);
- __CLC_INTN xpos = ix == ax;
-
- __CLC_INTN iy = __CLC_AS_INTN(y);
- __CLC_INTN ay = __CLC_AS_INTN(__clc_fabs(y));
- __CLC_INTN ypos = iy == ay;
-
- // Extra precise log calculation
- // First handle case that x is close to 1
- __CLC_GENTYPE r = 1.0f - absx;
- __CLC_INTN near1 = __clc_fabs(r) < 0x1.0p-4f;
- __CLC_GENTYPE r2 = r * r;
-
- // Coefficients are just 1/3, 1/4, 1/5 and 1/6
- __CLC_GENTYPE poly = __clc_mad(
- r,
- __clc_mad(r,
- __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
- 0x1.99999ap-3f),
- 0x1.000000p-2f),
- 0x1.555556p-2f);
-
- poly *= r2 * r;
-
- __CLC_GENTYPE lth_near1 = -r2 * 0.5f;
- __CLC_GENTYPE ltt_near1 = -poly;
- __CLC_GENTYPE lt_near1 = lth_near1 + ltt_near1;
- __CLC_GENTYPE lh_near1 = -r;
- __CLC_GENTYPE l_near1 = lh_near1 + lt_near1;
-
- // Computations for x not near 1
- __CLC_INTN m = __CLC_CONVERT_INTN(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
- __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m);
- __CLC_INTN ixs = __CLC_AS_INTN(__CLC_AS_GENTYPE(ax | 0x3f800000) - 1.0f);
- __CLC_GENTYPE mfs = __CLC_CONVERT_GENTYPE((ixs >> EXPSHIFTBITS_SP32) - 253);
- __CLC_INTN c = m == -127;
- __CLC_INTN ixn = c ? ixs : ax;
- __CLC_GENTYPE mfn = c ? mfs : mf;
-
- __CLC_INTN indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
- // F - Y
- __CLC_GENTYPE f = __CLC_AS_GENTYPE(0x3f000000 | indx) -
- __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32));
-
- indx = indx >> 16;
- __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx);
- __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx);
- r = rh + rt;
-
- poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
- (r * r);
- poly += (rh - r) + rt;
-
- const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
- const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
- __CLC_GENTYPE logel = __CLC_USE_TABLE(loge_tbl_lo, indx);
- __CLC_GENTYPE logeh = __CLC_USE_TABLE(loge_tbl_hi, indx);
- __CLC_GENTYPE lth = -r;
- __CLC_GENTYPE ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + logeh;
- __CLC_GENTYPE lt = lth + ltt;
- __CLC_GENTYPE lh = __clc_mad(mfn, LOG2_HEAD, logel);
- __CLC_GENTYPE l = lh + lt;
-
- // Select near 1 or not
- lth = near1 ? lth_near1 : lth;
- ltt = near1 ? ltt_near1 : ltt;
- lt = near1 ? lt_near1 : lt;
- lh = near1 ? lh_near1 : lh;
- l = near1 ? l_near1 : l;
-
- __CLC_GENTYPE gh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(l) & 0xfffff000);
- __CLC_GENTYPE gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
- __CLC_GENTYPE yh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(iy) & 0xfffff000);
-
- __CLC_GENTYPE yt = __CLC_CONVERT_GENTYPE(ny - __CLC_CONVERT_INTN(yh));
-
- __CLC_GENTYPE ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
- __CLC_GENTYPE ylogx = __clc_mad(yh, gh, ylogx_s);
- __CLC_GENTYPE ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
-
- // Extra precise exp of ylogx
- // 64/log2 : 92.332482616893657
- const __CLC_GENTYPE R_64_BY_LOG2 = 0x1.715476p+6f;
- __CLC_INTN n = __CLC_CONVERT_INTN(ylogx * R_64_BY_LOG2);
- __CLC_GENTYPE nf = __CLC_CONVERT_GENTYPE(n);
-
- __CLC_INTN j = n & 0x3f;
- m = n >> 6;
- __CLC_INTN m2 = m << EXPSHIFTBITS_SP32;
-
- // log2/64 lead: 0.0108032227
- const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f;
- // log2/64 tail: 0.0000272020388
- const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
- r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
- ylogx_t;
-
- // Truncated Taylor series for e^r
- poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
- 0x1.000000p-1f),
- r * r, r);
-
- __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j);
- __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j);
-
- __CLC_GENTYPE expylogx =
- __clc_mad(exp_head, poly, __clc_mad(exp_tail, poly, exp_tail)) + exp_head;
- __CLC_GENTYPE sexpylogx =
- expylogx * __CLC_AS_GENTYPE((__CLC_INTN)0x1 << (m + 149));
- __CLC_GENTYPE texpylogx = __CLC_AS_GENTYPE(__CLC_AS_INTN(expylogx) + m2);
- expylogx = m < -125 ? sexpylogx : texpylogx;
-
- // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
- expylogx =
- __clc_select(expylogx, __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32),
- ylogx > 0x1.62e430p+6f ||
- (ylogx == 0x1.62e430p+6f && ylogx_t > -0x1.05c610p-22f));
-
- // Result is 0 if ylogx < -149*log2
- expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
- // Classify y:
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_INTN inty = 2 - (ny & 1);
-
- __CLC_GENTYPE signval =
- __CLC_AS_GENTYPE((__CLC_AS_UINTN(expylogx) ^ SIGNBIT_SP32));
- expylogx = ((inty == 1) && !xpos) ? signval : expylogx;
- __CLC_INTN ret = __CLC_AS_INTN(expylogx);
-
- // Corner case handling
- __CLC_BIT_INTN x_is_ninf = ix == (__CLC_INTN)NINFBITPATT_SP32;
-
- __CLC_INTN xinf =
- xpos ? (__CLC_INTN)PINFBITPATT_SP32 : (__CLC_INTN)NINFBITPATT_SP32;
- ret = ((ax == 0) && !ypos && (inty == 1)) ? xinf : ret;
- ret = ((ax == 0) && !ypos && (inty == 2)) ? PINFBITPATT_SP32 : ret;
- ret = ((ax == 0) && ypos && (inty == 2)) ? 0 : ret;
- __CLC_INTN xzero = !xpos ? (__CLC_INTN)0x80000000 : (__CLC_INTN)0;
- ret = ((ax == 0) && ypos && (inty == 1)) ? xzero : ret;
- ret = (x_is_ninf && !ypos && (inty == 1)) ? (__CLC_INTN)0x80000000 : ret;
- ret = (x_is_ninf && !ypos && (inty != 1)) ? 0 : ret;
- ret = (x_is_ninf && ypos && (inty == 1)) ? (__CLC_INTN)NINFBITPATT_SP32 : ret;
- ret = (x_is_ninf && ypos && (inty != 1)) ? (__CLC_INTN)PINFBITPATT_SP32 : ret;
- ret = ((ix == PINFBITPATT_SP32) && !ypos) ? 0 : ret;
- ret = ((ix == PINFBITPATT_SP32) && ypos) ? (__CLC_INTN)PINFBITPATT_SP32 : ret;
- ret = ax > PINFBITPATT_SP32 ? ix : ret;
- ret = ny == 0 ? 0x3f800000 : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
- __CLC_INTN ny) {
- const __CLC_GENTYPE real_log2_tail = 5.76999904754328540596e-08;
- const __CLC_GENTYPE real_log2_lead = 6.93147122859954833984e-01;
-
- __CLC_GENTYPE y = __CLC_CONVERT_GENTYPE(ny);
-
- __CLC_LONGN ux = __CLC_AS_LONGN(x);
- __CLC_LONGN ax = __CLC_AS_LONGN(__clc_fabs(x));
- __CLC_BIT_INTN xpos = ax == ux;
-
- __CLC_LONGN uy = __CLC_AS_LONGN(y);
- __CLC_LONGN ay = __CLC_AS_LONGN(__clc_fabs(y));
- __CLC_BIT_INTN ypos = ay == uy;
-
- // Extended precision log
- __CLC_GENTYPE v, vt;
- {
- __CLC_INTN exp = __CLC_CONVERT_INTN(ax >> 52) - 1023;
- __CLC_INTN mask_exp_1023 = exp == -1023;
- __CLC_GENTYPE xexp = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
- __CLC_LONGN temp_ux =
- __CLC_AS_LONGN(__CLC_AS_GENTYPE(0x3ff0000000000000L | mantissa) - 1.0);
- exp = __CLC_CONVERT_INTN((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
- __CLC_GENTYPE xexp1 = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
- xexp = __CLC_CONVERT_LONGN(mask_exp_1023) ? xexp1 : xexp;
- mantissa = __CLC_CONVERT_LONGN(mask_exp_1023) ? mantissa1 : mantissa;
-
- __CLC_LONGN rax = (mantissa & 0x000ff00000000000) +
- ((mantissa & 0x0000080000000000) << 1);
- __CLC_INTN index = __CLC_CONVERT_INTN(rax >> 44);
-
- __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L);
- __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L);
- __CLC_GENTYPE f = F - Y;
- __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index);
- __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index);
- __CLC_GENTYPE f_inv = (log_h + log_t) * f;
- __CLC_GENTYPE r1 =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L);
- __CLC_GENTYPE r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
- __CLC_GENTYPE r = r1 + r2;
-
- __CLC_GENTYPE poly = __clc_fma(
- r,
- __clc_fma(r,
- __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
- 1.0 / 4.0),
- 1.0 / 3.0);
- poly = poly * r * r * r;
-
- __CLC_GENTYPE hr1r1 = 0.5 * r1 * r1;
- __CLC_GENTYPE poly0h = r1 + hr1r1;
- __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1;
- poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
-
- log_h = __CLC_USE_TABLE(powlog_tbl_head, index);
- log_t = __CLC_USE_TABLE(powlog_tbl_tail, index);
-
- __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
- __CLC_GENTYPE resT = resT_t - poly0h;
- __CLC_GENTYPE resH = __clc_fma(xexp, real_log2_lead, log_h);
- __CLC_GENTYPE resT_h = poly0h;
-
- __CLC_GENTYPE H = resT + resH;
- __CLC_GENTYPE H_h =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(H) & 0xfffffffff8000000L);
- __CLC_GENTYPE T =
- (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
- H = H_h;
-
- __CLC_GENTYPE y_head =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(uy) & 0xfffffffff8000000L);
- __CLC_GENTYPE y_tail = y - y_head;
-
- __CLC_BIT_INTN mask_2_24 = ay > 0x4170000000000000; // 2^24
- __CLC_INTN nyh = __CLC_CONVERT_INTN(y_head);
- __CLC_INTN nyt = ny - nyh;
- __CLC_GENTYPE y_tail1 = __CLC_CONVERT_GENTYPE(nyt);
- y_tail = mask_2_24 ? y_tail1 : y_tail;
-
- __CLC_GENTYPE temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
- v = __clc_fma(y_head, H, temp);
- vt = __clc_fma(y_head, H, -v) + temp;
- }
-
- // Now calculate exp of (v,vt)
-
- __CLC_GENTYPE expv;
- {
- const __CLC_GENTYPE max_exp_arg = 709.782712893384;
- const __CLC_GENTYPE min_exp_arg = -745.1332191019411;
- const __CLC_GENTYPE sixtyfour_by_lnof2 = 92.33248261689366;
- const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081;
- const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10;
-
- // If v is so large that we need to return INFINITY, or so small that we
- // need to return 0, set v to known values that will produce that result. Do
- // not try to continue the computation with the original v and patch it up
- // afterwards because v may be so large that temp is out of range of int, in
- // which case that conversion, and a value based on that conversion being
- // passed to __clc_ldexp, results in undefined behavior.
- v = v > max_exp_arg ? 1000.0 : v;
- v = v < min_exp_arg ? -1000.0 : v;
-
- __CLC_GENTYPE temp = v * sixtyfour_by_lnof2;
- __CLC_INTN n = __CLC_CONVERT_INTN(temp);
- __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n);
- __CLC_INTN j = n & 0x0000003f;
- __CLC_INTN m = n >> 6;
-
- __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j);
- __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j);
- __CLC_GENTYPE f = f1 + f2;
-
- __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v);
- __CLC_GENTYPE r2 = dn * lnof2_by_64_tail;
- __CLC_GENTYPE r = (r1 + r2) + vt;
-
- __CLC_GENTYPE q =
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r, 1.38889490863777199667e-03,
- 8.33336798434219616221e-03),
- 4.16666666662260795726e-02),
- 1.66666666665260878863e-01),
- 5.00000000000000008883e-01);
- q = __clc_fma(r * r, q, r);
-
- expv = __clc_fma(f, q, f2) + f1;
- expv = __clc_ldexp(expv, m);
- }
-
- // See whether y is an integer.
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_LONGN inty = __CLC_CONVERT_LONGN(2 - (ny & 1));
-
- expv *= ((inty == 1) && !xpos) ? -1.0 : 1.0;
-
- __CLC_LONGN ret = __CLC_AS_LONGN(expv);
-
- // Now all the edge cases
- __CLC_BIT_INTN x_is_ninf = ux == (__CLC_LONGN)NINFBITPATT_DP64;
- __CLC_BIT_INTN x_is_pinf = ux == (__CLC_LONGN)PINFBITPATT_DP64;
- __CLC_LONGN xinf =
- xpos ? (__CLC_LONGN)PINFBITPATT_DP64 : (__CLC_LONGN)NINFBITPATT_DP64;
-
- ret = ((ax == 0L) && !ypos && (inty == 1)) ? xinf : ret;
- ret = ((ax == 0L) && !ypos && (inty == 2)) ? (__CLC_LONGN)PINFBITPATT_DP64
- : ret;
- ret = ((ax == 0L) && ypos && (inty == 2)) ? 0L : ret;
- __CLC_LONGN xzero = !xpos ? (__CLC_LONGN)0x8000000000000000L : 0L;
- ret = ((ax == 0L) && ypos && (inty == 1)) ? xzero : ret;
- ret = (x_is_ninf && !ypos && (inty == 1)) ? (__CLC_LONGN)0x8000000000000000L
- : ret;
- ret = (x_is_ninf && !ypos && (inty != 1)) ? 0L : ret;
- ret =
- (x_is_ninf && ypos && (inty == 1)) ? (__CLC_LONGN)NINFBITPATT_DP64 : ret;
- ret =
- (x_is_ninf && ypos && (inty != 1)) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
- ret = (x_is_pinf && !ypos) ? 0L : ret;
- ret = (x_is_pinf && ypos) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
- ret = ax > (__CLC_LONGN)PINFBITPATT_DP64 ? ux : ret;
- ret = __CLC_CONVERT_LONGN(ny == 0) ? (__CLC_LONGN)0x3ff0000000000000L : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x, __CLC_INTN y) {
- return __CLC_CONVERT_GENTYPE(__clc_pown(__CLC_CONVERT_FLOATN(x), y));
-}
-
-#endif
diff --git a/libclc/clc/lib/generic/math/clc_powr.cl b/libclc/clc/lib/generic/math/clc_powr.cl
index 0556ec97d6f3c..287e1a31b5b2a 100644
--- a/libclc/clc/lib/generic/math/clc_powr.cl
+++ b/libclc/clc/lib/generic/math/clc_powr.cl
@@ -6,16 +6,36 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/clc_convert.h>
-#include <clc/internal/clc.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
-#include <clc/relational/clc_select.h>
+#include "clc/clc_convert.h"
+#include "clc/float/definitions.h"
+#include "clc/internal/clc.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_log.h"
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_recip_fast.h"
+#include "clc/math/clc_trunc.h"
+#include "clc/math/math.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_isunordered.h"
-#define __CLC_BODY <clc_powr.inc>
-#include <clc/math/gentype.inc>
+#define COMPILING_POWR
+#define __CLC_BODY <clc_pow_base.inc>
+#include "clc/math/gentype.inc"
+
+#define __CLC_FUNCTION __clc_powr
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_powr_fast
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
diff --git a/libclc/clc/lib/generic/math/clc_powr.inc b/libclc/clc/lib/generic/math/clc_powr.inc
deleted file mode 100644
index b94dbfdcbdeb7..0000000000000
--- a/libclc/clc/lib/generic/math/clc_powr.inc
+++ /dev/null
@@ -1,414 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Computes pow using log and exp
-//
-// x^y = exp(y * log(x))
-//
-// We take care not to lose precision in the intermediate steps
-//
-// When computing log, calculate it in splits:
-//
-// r = f * (p_invead + p_inv_tail)
-// r = rh + rt
-//
-// Calculate log polynomial using r, in end addition, do:
-//
-// poly = poly + ((rh-r) + rt)
-//
-// lth = -r
-// ltt = ((xexp * log2_t) - poly) + logT
-// lt = lth + ltt
-//
-// lh = (xexp * log2_h) + logH
-// l = lh + lt
-//
-// Calculate final log answer as gh and gt:
-//
-// gh = l & higher-half bits
-// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
-//
-// yh = y & higher-half bits
-// yt = y - yh
-//
-// Before entering computation of exp:
-//
-// vs = ((yt*gt + yt*gh) + yh*gt)
-// v = vs + yh*gh
-// vt = ((yh*gh - v) + vs)
-//
-// In calculation of exp, add vt to r that is used for poly.
-//
-// At the end of exp, do
-//
-// ((((expT * poly) + expT) + expH*poly) + expH)
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
- __CLC_GENTYPE y) {
- __CLC_GENTYPE absx = __clc_fabs(x);
- __CLC_INTN ix = __CLC_AS_INTN(x);
- __CLC_INTN ax = __CLC_AS_INTN(absx);
- __CLC_INTN xpos = ix == ax;
-
- __CLC_INTN iy = __CLC_AS_INTN(y);
- __CLC_INTN ay = __CLC_AS_INTN(__clc_fabs(y));
- __CLC_INTN ypos = iy == ay;
-
- // Extra precise log calculation
- // First handle case that x is close to 1
- __CLC_GENTYPE r = 1.0f - absx;
- __CLC_INTN near1 = __clc_fabs(r) < 0x1.0p-4f;
- __CLC_GENTYPE r2 = r * r;
-
- // Coefficients are just 1/3, 1/4, 1/5 and 1/6
- __CLC_GENTYPE poly = __clc_mad(
- r,
- __clc_mad(r,
- __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
- 0x1.99999ap-3f),
- 0x1.000000p-2f),
- 0x1.555556p-2f);
-
- poly *= r2 * r;
-
- __CLC_GENTYPE lth_near1 = -r2 * 0.5f;
- __CLC_GENTYPE ltt_near1 = -poly;
- __CLC_GENTYPE lt_near1 = lth_near1 + ltt_near1;
- __CLC_GENTYPE lh_near1 = -r;
- __CLC_GENTYPE l_near1 = lh_near1 + lt_near1;
-
- // Computations for x not near 1
- __CLC_INTN m = __CLC_CONVERT_INTN(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
- __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m);
- __CLC_INTN ixs = __CLC_AS_INTN(__CLC_AS_GENTYPE(ax | 0x3f800000) - 1.0f);
- __CLC_GENTYPE mfs = __CLC_CONVERT_GENTYPE((ixs >> EXPSHIFTBITS_SP32) - 253);
- __CLC_INTN c = m == -127;
- __CLC_INTN ixn = c ? ixs : ax;
- __CLC_GENTYPE mfn = c ? mfs : mf;
-
- __CLC_INTN indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
- // F - Y
- __CLC_GENTYPE f = __CLC_AS_GENTYPE(0x3f000000 | indx) -
- __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32));
-
- indx = indx >> 16;
- __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx);
- __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx);
- r = rh + rt;
-
- poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
- (r * r);
- poly += (rh - r) + rt;
-
- const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
- const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
- __CLC_GENTYPE logel = __CLC_USE_TABLE(loge_tbl_lo, indx);
- __CLC_GENTYPE logeh = __CLC_USE_TABLE(loge_tbl_hi, indx);
- __CLC_GENTYPE lth = -r;
- __CLC_GENTYPE ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + logeh;
- __CLC_GENTYPE lt = lth + ltt;
- __CLC_GENTYPE lh = __clc_mad(mfn, LOG2_HEAD, logel);
- __CLC_GENTYPE l = lh + lt;
-
- // Select near 1 or not
- lth = near1 ? lth_near1 : lth;
- ltt = near1 ? ltt_near1 : ltt;
- lt = near1 ? lt_near1 : lt;
- lh = near1 ? lh_near1 : lh;
- l = near1 ? l_near1 : l;
-
- __CLC_GENTYPE gh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(l) & 0xfffff000);
- __CLC_GENTYPE gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
- __CLC_GENTYPE yh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(iy) & 0xfffff000);
-
- __CLC_GENTYPE yt = y - yh;
-
- __CLC_GENTYPE ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
- __CLC_GENTYPE ylogx = __clc_mad(yh, gh, ylogx_s);
- __CLC_GENTYPE ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
-
- // Extra precise exp of ylogx
- // 64/log2 : 92.332482616893657
- const __CLC_GENTYPE R_64_BY_LOG2 = 0x1.715476p+6f;
- __CLC_INTN n = __CLC_CONVERT_INTN(ylogx * R_64_BY_LOG2);
- __CLC_GENTYPE nf = __CLC_CONVERT_GENTYPE(n);
-
- __CLC_INTN j = n & 0x3f;
- m = n >> 6;
- __CLC_INTN m2 = m << EXPSHIFTBITS_SP32;
- // log2/64 lead: 0.0108032227
- const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f;
- // log2/64 tail: 0.0000272020388
- const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
- r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
- ylogx_t;
-
- // Truncated Taylor series for e^r
- poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
- 0x1.000000p-1f),
- r * r, r);
-
- __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j);
- __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j);
-
- __CLC_GENTYPE expylogx =
- __clc_mad(exp_head, poly, __clc_mad(exp_tail, poly, exp_tail)) + exp_head;
- __CLC_GENTYPE sexpylogx =
- expylogx * __CLC_AS_GENTYPE((__CLC_INTN)0x1 << (m + 149));
- __CLC_GENTYPE texpylogx = __CLC_AS_GENTYPE(__CLC_AS_INTN(expylogx) + m2);
- expylogx = m < -125 ? sexpylogx : texpylogx;
-
- // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
- expylogx =
- __clc_select(expylogx, __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32),
- (ylogx > 0x1.62e430p+6f) ||
- (ylogx == 0x1.62e430p+6f && ylogx_t > -0x1.05c610p-22f));
-
- // Result is 0 if ylogx < -149*log2
- expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
- // Classify y:
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_INTN yexp = (__CLC_INTN)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
- __CLC_INTN mask = ((__CLC_INTN)1 << (24 - yexp)) - 1;
- __CLC_INTN yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
- __CLC_INTN inty = yodd ? 1 : 2;
- inty = (iy & mask) != 0 ? 0 : inty;
- inty = yexp < 1 ? 0 : inty;
- inty = yexp > 24 ? 2 : inty;
-
- __CLC_GENTYPE signval =
- __CLC_AS_GENTYPE((__CLC_AS_UINTN(expylogx) ^ SIGNBIT_SP32));
- expylogx = ((inty == 1) && !xpos) ? signval : expylogx;
- __CLC_INTN ret = __CLC_AS_INTN(expylogx);
-
- // Corner case handling
- __CLC_BIT_INTN y_is_ninf = iy == (__CLC_INTN)NINFBITPATT_SP32;
- __CLC_BIT_INTN y_is_pinf = iy == (__CLC_INTN)PINFBITPATT_SP32;
- __CLC_BIT_INTN x_is_inf = ax == (__CLC_INTN)PINFBITPATT_SP32;
-
- ret = ax < 0x3f800000 && y_is_ninf ? PINFBITPATT_SP32 : ret;
- ret = ax < 0x3f800000 && y_is_pinf ? 0 : ret;
- ret = ax == 0x3f800000 && ay < PINFBITPATT_SP32 ? 0x3f800000 : ret;
- ret = ax == 0x3f800000 && ay == PINFBITPATT_SP32 ? QNANBITPATT_SP32 : ret;
- ret = ax > 0x3f800000 && y_is_ninf ? 0 : ret;
- ret = ax > 0x3f800000 && y_is_pinf ? PINFBITPATT_SP32 : ret;
- ret = ((ix < PINFBITPATT_SP32) && (ay == 0)) ? 0x3f800000 : ret;
- ret = (x_is_inf && !ypos) ? 0 : ret;
- ret = (x_is_inf && ypos) ? PINFBITPATT_SP32 : ret;
- ret = (x_is_inf && y_is_pinf) ? PINFBITPATT_SP32 : ret;
- ret = (x_is_inf && (ay == 0)) ? QNANBITPATT_SP32 : ret;
- ret = ((ax == 0) && !ypos) ? PINFBITPATT_SP32 : ret;
- ret = ((ax == 0) && ypos) ? 0 : ret;
- ret = ((ax == 0) && (ay == 0)) ? QNANBITPATT_SP32 : ret;
- ret = ((ax != 0) && !xpos) ? QNANBITPATT_SP32 : ret;
- ret = ax > PINFBITPATT_SP32 ? ix : ret;
- ret = ay > PINFBITPATT_SP32 ? iy : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
- __CLC_GENTYPE y) {
- const __CLC_GENTYPE real_log2_tail = 5.76999904754328540596e-08;
- const __CLC_GENTYPE real_log2_lead = 6.93147122859954833984e-01;
-
- __CLC_LONGN ux = __CLC_AS_LONGN(x);
- __CLC_LONGN ax = __CLC_AS_LONGN(__clc_fabs(x));
- __CLC_BIT_INTN xpos = ax == ux;
-
- __CLC_LONGN uy = __CLC_AS_LONGN(y);
- __CLC_LONGN ay = __CLC_AS_LONGN(__clc_fabs(y));
- __CLC_BIT_INTN ypos = ay == uy;
-
- // Extended precision log
- __CLC_GENTYPE v, vt;
- {
- __CLC_INTN exp = __CLC_CONVERT_INTN(ax >> 52) - 1023;
- __CLC_INTN mask_exp_1023 = exp == (__CLC_INTN)-1023;
- __CLC_GENTYPE xexp = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
- __CLC_LONGN temp_ux =
- __CLC_AS_LONGN(__CLC_AS_GENTYPE(0x3ff0000000000000L | mantissa) - 1.0);
- exp = __CLC_CONVERT_INTN((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
- __CLC_GENTYPE xexp1 = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
- xexp = __CLC_CONVERT_LONGN(mask_exp_1023) ? xexp1 : xexp;
- mantissa = __CLC_CONVERT_LONGN(mask_exp_1023) ? mantissa1 : mantissa;
-
- __CLC_LONGN rax = (mantissa & 0x000ff00000000000) +
- ((mantissa & 0x0000080000000000) << 1);
- __CLC_INTN index = __CLC_CONVERT_INTN(rax >> 44);
-
- __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L);
- __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L);
- __CLC_GENTYPE f = F - Y;
- __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index);
- __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index);
- __CLC_GENTYPE f_inv = (log_h + log_t) * f;
- __CLC_GENTYPE r1 =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L);
- __CLC_GENTYPE r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
- __CLC_GENTYPE r = r1 + r2;
-
- __CLC_GENTYPE poly = __clc_fma(
- r,
- __clc_fma(r,
- __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
- 1.0 / 4.0),
- 1.0 / 3.0);
- poly = poly * r * r * r;
-
- __CLC_GENTYPE hr1r1 = 0.5 * r1 * r1;
- __CLC_GENTYPE poly0h = r1 + hr1r1;
- __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1;
- poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
-
- log_h = __CLC_USE_TABLE(powlog_tbl_head, index);
- log_t = __CLC_USE_TABLE(powlog_tbl_tail, index);
-
- __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
- __CLC_GENTYPE resT = resT_t - poly0h;
- __CLC_GENTYPE resH = __clc_fma(xexp, real_log2_lead, log_h);
- __CLC_GENTYPE resT_h = poly0h;
-
- __CLC_GENTYPE H = resT + resH;
- __CLC_GENTYPE H_h =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(H) & 0xfffffffff8000000L);
- __CLC_GENTYPE T =
- (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
- H = H_h;
-
- __CLC_GENTYPE y_head =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(uy) & 0xfffffffff8000000L);
- __CLC_GENTYPE y_tail = y - y_head;
-
- __CLC_GENTYPE temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
- v = __clc_fma(y_head, H, temp);
- vt = __clc_fma(y_head, H, -v) + temp;
- }
-
- // Now calculate exp of (v,vt)
-
- __CLC_GENTYPE expv;
- {
- const __CLC_GENTYPE max_exp_arg = 709.782712893384;
- const __CLC_GENTYPE min_exp_arg = -745.1332191019411;
- const __CLC_GENTYPE sixtyfour_by_lnof2 = 92.33248261689366;
- const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081;
- const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10;
-
- // If v is so large that we need to return INFINITY, or so small that we
- // need to return 0, set v to known values that will produce that result. Do
- // not try to continue the computation with the original v and patch it up
- // afterwards because v may be so large that temp is out of range of int, in
- // which case that conversion, and a value based on that conversion being
- // passed to __clc_ldexp, results in undefined behavior.
- v = v > max_exp_arg ? 1000.0 : v;
- v = v < min_exp_arg ? -1000.0 : v;
-
- __CLC_GENTYPE temp = v * sixtyfour_by_lnof2;
- __CLC_INTN n = __CLC_CONVERT_INTN(temp);
- __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n);
- __CLC_INTN j = n & 0x0000003f;
- __CLC_INTN m = n >> 6;
-
- __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j);
- __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j);
- __CLC_GENTYPE f = f1 + f2;
-
- __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v);
- __CLC_GENTYPE r2 = dn * lnof2_by_64_tail;
- __CLC_GENTYPE r = (r1 + r2) + vt;
-
- __CLC_GENTYPE q =
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r, 1.38889490863777199667e-03,
- 8.33336798434219616221e-03),
- 4.16666666662260795726e-02),
- 1.66666666665260878863e-01),
- 5.00000000000000008883e-01);
- q = __clc_fma(r * r, q, r);
-
- expv = __clc_fma(f, q, f2) + f1;
- expv = __clc_ldexp(expv, m);
- }
-
- // See whether y is an integer.
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_LONGN inty;
- {
- __CLC_INTN yexp =
- __CLC_CONVERT_INTN(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
- inty = __CLC_CONVERT_LONGN(yexp < 1 ? 0 : 2);
- inty = __CLC_CONVERT_LONGN(yexp > 53) ? 2 : inty;
- __CLC_LONGN mask = ((__CLC_LONGN)1L << (53 - yexp)) - 1L;
- __CLC_LONGN inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1L : 2L;
- inty1 = (ay & mask) != 0 ? 0 : inty1;
- inty = __CLC_CONVERT_LONGN(!(yexp < 1) && !(yexp > 53)) ? inty1 : inty;
- }
-
- expv *= ((inty == 1) && !xpos) ? -1.0 : 1.0;
-
- __CLC_LONGN ret = __CLC_AS_LONGN(expv);
-
- // Now all the edge cases
- __CLC_BIT_INTN y_is_ninf = uy == (__CLC_LONGN)NINFBITPATT_DP64;
- __CLC_BIT_INTN y_is_pinf = uy == (__CLC_LONGN)PINFBITPATT_DP64;
- __CLC_BIT_INTN x_is_inf = ax == (__CLC_LONGN)PINFBITPATT_DP64;
-
- ret = ax < 0x3ff0000000000000L && y_is_ninf ? PINFBITPATT_DP64 : ret;
- ret = ax < 0x3ff0000000000000L && y_is_pinf ? 0L : ret;
- ret = ax == 0x3ff0000000000000L && ay < PINFBITPATT_DP64 ? 0x3ff0000000000000L
- : ret;
- ret = ax == 0x3ff0000000000000L && ay == PINFBITPATT_DP64 ? QNANBITPATT_DP64
- : ret;
- ret = ax > 0x3ff0000000000000L && y_is_ninf ? 0L : ret;
- ret = ax > 0x3ff0000000000000L && y_is_pinf ? PINFBITPATT_DP64 : ret;
- ret = ux < PINFBITPATT_DP64 && ay == 0L ? 0x3ff0000000000000L : ret;
- ret = (x_is_inf && !ypos) ? 0L : ret;
- ret = (x_is_inf && ypos) ? PINFBITPATT_DP64 : ret;
- ret = (x_is_inf && y_is_pinf) ? PINFBITPATT_DP64 : ret;
- ret = ((ax == PINFBITPATT_DP64) && (ay == 0L)) ? QNANBITPATT_DP64 : ret;
- ret = ((ax == 0L) && !ypos) ? PINFBITPATT_DP64 : ret;
- ret = ((ax == 0L) && ypos) ? 0L : ret;
- ret = ((ax == 0L) && (ay == 0L)) ? QNANBITPATT_DP64 : ret;
- ret = ((ax != 0L) && !xpos) ? QNANBITPATT_DP64 : ret;
- ret = ax > PINFBITPATT_DP64 ? ux : ret;
- ret = ay > PINFBITPATT_DP64 ? uy : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
- __CLC_GENTYPE y) {
- return __CLC_CONVERT_GENTYPE(
- __clc_powr(__CLC_CONVERT_FLOATN(x), __CLC_CONVERT_FLOATN(y)));
-}
-
-#endif
diff --git a/libclc/clc/lib/generic/math/clc_rootn.cl b/libclc/clc/lib/generic/math/clc_rootn.cl
index da397cf66da62..29a51baac87c3 100644
--- a/libclc/clc/lib/generic/math/clc_rootn.cl
+++ b/libclc/clc/lib/generic/math/clc_rootn.cl
@@ -6,16 +6,35 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/clc_convert.h>
-#include <clc/float/definitions.h>
-#include <clc/internal/clc.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
+#include "clc/clc_convert.h"
+#include "clc/float/definitions.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_recip_fast.h"
+#include "clc/math/clc_rootn.h"
+#include "clc/math/clc_trunc.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_isunordered.h"
-#define __CLC_BODY <clc_rootn.inc>
+#define COMPILING_ROOTN
+#define __CLC_BODY <clc_pow_base.inc>
#include <clc/math/gentype.inc>
+
+#define __CLC_FUNCTION __clc_rootn
+#define __CLC_BODY \
+ "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_rootn_fast
+#define __CLC_BODY \
+ "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_rootn.inc b/libclc/clc/lib/generic/math/clc_rootn.inc
deleted file mode 100644
index fd3d0becb0dff..0000000000000
--- a/libclc/clc/lib/generic/math/clc_rootn.inc
+++ /dev/null
@@ -1,405 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Computes pow using log and exp
-//
-// x^y = exp(y * log(x))
-//
-// We take care not to lose precision in the intermediate steps.
-//
-// When computing log, calculate it in splits:
-//
-// r = f * (p_invead + p_inv_tail)
-// r = rh + rt
-//
-// Calculate log polynomial using r, in end addition, do:
-//
-// poly = poly + ((rh-r) + rt)
-//
-// lth = -r
-// ltt = ((xexp * log2_t) - poly) + logT
-// lt = lth + ltt
-//
-// lh = (xexp * log2_h) + logH
-// l = lh + lt
-//
-// Calculate final log answer as gh and gt:
-//
-// gh = l & higher-half bits
-// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
-//
-// yh = y & higher-half bits
-// yt = y - yh
-//
-// Before entering computation of exp:
-//
-// vs = ((yt*gt + yt*gh) + yh*gt)
-// v = vs + yh*gh
-// vt = ((yh*gh - v) + vs)
-//
-// In calculation of exp, add vt to r that is used for poly.
-//
-// At the end of exp, do:
-//
-// ((((expT * poly) + expT) + expH*poly) + expH)
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
- __CLC_INTN ny) {
- __CLC_GENTYPE y = MATH_RECIP(__CLC_CONVERT_GENTYPE(ny));
-
- __CLC_INTN ix = __CLC_AS_INTN(x);
- __CLC_INTN ax = ix & EXSIGNBIT_SP32;
- __CLC_INTN xpos = ix == ax;
-
- __CLC_INTN iy = __CLC_AS_INTN(y);
- __CLC_INTN ay = iy & EXSIGNBIT_SP32;
- __CLC_INTN ypos = iy == ay;
-
- // Extra precise log calculation
- // First handle case that x is close to 1
- __CLC_GENTYPE r = 1.0f - __CLC_AS_GENTYPE(ax);
- __CLC_INTN near1 = __clc_fabs(r) < 0x1.0p-4f;
- __CLC_GENTYPE r2 = r * r;
-
- // Coefficients are just 1/3, 1/4, 1/5 and 1/6
- __CLC_GENTYPE poly = __clc_mad(
- r,
- __clc_mad(r,
- __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
- 0x1.99999ap-3f),
- 0x1.000000p-2f),
- 0x1.555556p-2f);
-
- poly *= r2 * r;
-
- __CLC_GENTYPE lth_near1 = -r2 * 0.5f;
- __CLC_GENTYPE ltt_near1 = -poly;
- __CLC_GENTYPE lt_near1 = lth_near1 + ltt_near1;
- __CLC_GENTYPE lh_near1 = -r;
- __CLC_GENTYPE l_near1 = lh_near1 + lt_near1;
-
- // Computations for x not near 1
- __CLC_INTN m = __CLC_CONVERT_INTN(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
- __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m);
- __CLC_INTN ixs = __CLC_AS_INTN(__CLC_AS_GENTYPE(ax | 0x3f800000) - 1.0f);
- __CLC_GENTYPE mfs = __CLC_CONVERT_GENTYPE((ixs >> EXPSHIFTBITS_SP32) - 253);
- __CLC_INTN c = m == -127;
- __CLC_INTN ixn = c ? ixs : ax;
- __CLC_GENTYPE mfn = c ? mfs : mf;
-
- __CLC_INTN indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
- // F - Y
- __CLC_GENTYPE f = __CLC_AS_GENTYPE(0x3f000000 | indx) -
- __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32));
-
- indx = indx >> 16;
- __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx);
- __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx);
- ;
- r = rh + rt;
-
- poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
- (r * r);
- poly += (rh - r) + rt;
-
- const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
- const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
- __CLC_GENTYPE lth = -r;
- __CLC_GENTYPE ltt =
- __clc_mad(mfn, LOG2_TAIL, -poly) + __CLC_USE_TABLE(loge_tbl_hi, indx);
- __CLC_GENTYPE lt = lth + ltt;
- __CLC_GENTYPE lh =
- __clc_mad(mfn, LOG2_HEAD, __CLC_USE_TABLE(loge_tbl_lo, indx));
- __CLC_GENTYPE l = lh + lt;
-
- // Select near 1 or not
- lth = near1 ? lth_near1 : lth;
- ltt = near1 ? ltt_near1 : ltt;
- lt = near1 ? lt_near1 : lt;
- lh = near1 ? lh_near1 : lh;
- l = near1 ? l_near1 : l;
-
- __CLC_GENTYPE gh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(l) & 0xfffff000);
- __CLC_GENTYPE gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
- __CLC_GENTYPE yh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(iy) & 0xfffff000);
-
- __CLC_GENTYPE fny = __CLC_CONVERT_GENTYPE(ny);
- __CLC_GENTYPE fnyh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(fny) & 0xfffff000);
- __CLC_GENTYPE fnyt = __CLC_CONVERT_GENTYPE(ny - __CLC_CONVERT_INTN(fnyh));
- __CLC_GENTYPE yt =
- MATH_DIVIDE(__clc_mad(-fnyt, yh, __clc_mad(-fnyh, yh, 1.0f)), fny);
-
- __CLC_GENTYPE ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
- __CLC_GENTYPE ylogx = __clc_mad(yh, gh, ylogx_s);
- __CLC_GENTYPE ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
-
- // Extra precise exp of ylogx
- const __CLC_GENTYPE R_64_BY_LOG2 =
- 0x1.715476p+6f; // 64/log2 : 92.332482616893657
- __CLC_INTN n = __CLC_CONVERT_INTN(ylogx * R_64_BY_LOG2);
- __CLC_GENTYPE nf = __CLC_CONVERT_GENTYPE(n);
-
- __CLC_INTN j = n & 0x3f;
- m = n >> 6;
- __CLC_INTN m2 = m << EXPSHIFTBITS_SP32;
-
- // log2/64 lead: 0.0108032227
- const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f;
- // log2/64 tail: 0.0000272020388
- const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
- r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
- ylogx_t;
-
- // Truncated Taylor series for e^r
- poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
- 0x1.000000p-1f),
- r * r, r);
-
- __CLC_GENTYPE exph = __CLC_USE_TABLE(exp_tbl_ep_head, j);
- __CLC_GENTYPE expt = __CLC_USE_TABLE(exp_tbl_ep_tail, j);
-
- __CLC_GENTYPE expylogx =
- __clc_mad(exph, poly, __clc_mad(expt, poly, expt)) + exph;
- __CLC_GENTYPE sexpylogx =
- __clc_fp32_subnormals_supported()
- ? expylogx * __CLC_AS_GENTYPE((__CLC_INTN)0x1 << (m + 149))
- : 0.0f;
-
- __CLC_GENTYPE texpylogx = __CLC_AS_GENTYPE(__CLC_AS_INTN(expylogx) + m2);
- expylogx = m < -125 ? sexpylogx : texpylogx;
-
- // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
- expylogx = ((ylogx > 0x1.62e430p+6f) |
- (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f))
- ? __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32)
- : expylogx;
-
- // Result is 0 if ylogx < -149*log2
- expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
- // Classify y:
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_INTN inty = 2 - (ny & 1);
-
- __CLC_GENTYPE signval =
- __CLC_AS_GENTYPE((__CLC_AS_UINTN(expylogx) ^ SIGNBIT_SP32));
- expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
- __CLC_INTN ret = __CLC_AS_INTN(expylogx);
-
- // Corner case handling
- __CLC_BIT_INTN x_is_ninf = ix == (__CLC_INTN)NINFBITPATT_SP32;
- __CLC_BIT_INTN x_is_pinf = ix == (__CLC_INTN)PINFBITPATT_SP32;
-
- ret = (!xpos & (inty == 2)) ? __CLC_AS_INTN(__CLC_GENTYPE_NAN) : ret;
- __CLC_INTN xinf =
- xpos ? (__CLC_INTN)PINFBITPATT_SP32 : (__CLC_INTN)NINFBITPATT_SP32;
- ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
- ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret;
- ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret;
- __CLC_INTN xzero = xpos ? 0 : (__CLC_INTN)0x80000000;
- ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
- ret = (x_is_ninf & ypos & (inty == 1)) ? (__CLC_INTN)NINFBITPATT_SP32 : ret;
- ret = (x_is_ninf & !ypos & (inty == 1)) ? (__CLC_INTN)0x80000000 : ret;
- ret = (x_is_pinf & !ypos) ? 0 : ret;
- ret = (x_is_pinf & ypos) ? PINFBITPATT_SP32 : ret;
- ret = ax > PINFBITPATT_SP32 ? ix : ret;
- ret = ny == 0 ? __CLC_AS_INTN(__CLC_GENTYPE_NAN) : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
- __CLC_INTN ny) {
- const __CLC_GENTYPE real_log2_tail = 5.76999904754328540596e-08;
- const __CLC_GENTYPE real_log2_lead = 6.93147122859954833984e-01;
-
- __CLC_GENTYPE dny = __CLC_CONVERT_GENTYPE(ny);
- __CLC_GENTYPE y = 1.0 / dny;
-
- __CLC_LONGN ux = __CLC_AS_LONGN(x);
- __CLC_LONGN ax = __CLC_AS_LONGN(__clc_fabs(x));
- __CLC_BIT_INTN xpos = ax == ux;
-
- __CLC_LONGN uy = __CLC_AS_LONGN(y);
- __CLC_LONGN ay = __CLC_AS_LONGN(__clc_fabs(y));
- __CLC_BIT_INTN ypos = ay == uy;
-
- // Extended precision log
- __CLC_GENTYPE v, vt;
- {
- __CLC_INTN exp = __CLC_CONVERT_INTN(ax >> 52) - 1023;
- __CLC_INTN mask_exp_1023 = exp == -1023;
- __CLC_GENTYPE xexp = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
- __CLC_LONGN temp_ux =
- __CLC_AS_LONGN(__CLC_AS_GENTYPE(0x3ff0000000000000L | mantissa) - 1.0);
- exp = __CLC_CONVERT_INTN((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
- __CLC_GENTYPE xexp1 = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
- xexp = __CLC_CONVERT_LONGN(mask_exp_1023) ? xexp1 : xexp;
- mantissa = __CLC_CONVERT_LONGN(mask_exp_1023) ? mantissa1 : mantissa;
-
- __CLC_LONGN rax = (mantissa & 0x000ff00000000000) +
- ((mantissa & 0x0000080000000000) << 1);
- __CLC_INTN index = __CLC_CONVERT_INTN(rax >> 44);
-
- __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L);
- __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L);
- __CLC_GENTYPE f = F - Y;
- __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index);
- __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index);
- __CLC_GENTYPE f_inv = (log_h + log_t) * f;
- __CLC_GENTYPE r1 =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L);
- __CLC_GENTYPE r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
- __CLC_GENTYPE r = r1 + r2;
-
- __CLC_GENTYPE poly = __clc_fma(
- r,
- __clc_fma(r,
- __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
- 1.0 / 4.0),
- 1.0 / 3.0);
- poly = poly * r * r * r;
-
- __CLC_GENTYPE hr1r1 = 0.5 * r1 * r1;
- __CLC_GENTYPE poly0h = r1 + hr1r1;
- __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1;
- poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
-
- log_h = __CLC_USE_TABLE(powlog_tbl_head, index);
- log_t = __CLC_USE_TABLE(powlog_tbl_tail, index);
-
- __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
- __CLC_GENTYPE resT = resT_t - poly0h;
- __CLC_GENTYPE resH = __clc_fma(xexp, real_log2_lead, log_h);
- __CLC_GENTYPE resT_h = poly0h;
-
- __CLC_GENTYPE H = resT + resH;
- __CLC_GENTYPE H_h =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(H) & 0xfffffffff8000000L);
- __CLC_GENTYPE T =
- (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
- H = H_h;
-
- __CLC_GENTYPE y_head =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(uy) & 0xfffffffff8000000L);
- __CLC_GENTYPE y_tail = y - y_head;
-
- __CLC_GENTYPE fnyh =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(dny) & 0xfffffffffff00000);
- __CLC_GENTYPE fnyt = __CLC_CONVERT_GENTYPE(ny - __CLC_CONVERT_INTN(fnyh));
- y_tail = __clc_fma(-fnyt, y_head, __clc_fma(-fnyh, y_head, 1.0)) / dny;
-
- __CLC_GENTYPE temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
- v = __clc_fma(y_head, H, temp);
- vt = __clc_fma(y_head, H, -v) + temp;
- }
-
- // Now calculate exp of (v,vt)
-
- __CLC_GENTYPE expv;
- {
- const __CLC_GENTYPE max_exp_arg = 709.782712893384;
- const __CLC_GENTYPE min_exp_arg = -745.1332191019411;
- const __CLC_GENTYPE sixtyfour_by_lnof2 = 92.33248261689366;
- const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081;
- const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10;
-
- // If v is so large that we need to return INFINITY, or so small that we
- // need to return 0, set v to known values that will produce that result. Do
- // not try to continue the computation with the original v and patch it up
- // afterwards because v may be so large that temp is out of range of int, in
- // which case that conversion, and a value based on that conversion being
- // passed to __clc_ldexp, results in undefined behavior.
- v = v > max_exp_arg ? 1000.0 : v;
- v = v < min_exp_arg ? -1000.0 : v;
-
- __CLC_GENTYPE temp = v * sixtyfour_by_lnof2;
- __CLC_INTN n = __CLC_CONVERT_INTN(temp);
- __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n);
- __CLC_INTN j = n & 0x0000003f;
- __CLC_INTN m = n >> 6;
-
- __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j);
- __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j);
- __CLC_GENTYPE f = f1 + f2;
-
- __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v);
- __CLC_GENTYPE r2 = dn * lnof2_by_64_tail;
- __CLC_GENTYPE r = (r1 + r2) + vt;
-
- __CLC_GENTYPE q =
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r, 1.38889490863777199667e-03,
- 8.33336798434219616221e-03),
- 4.16666666662260795726e-02),
- 1.66666666665260878863e-01),
- 5.00000000000000008883e-01);
- q = __clc_fma(r * r, q, r);
-
- expv = __clc_fma(f, q, f2) + f1;
- expv = __clc_ldexp(expv, m);
- }
-
- // See whether y is an integer.
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_LONGN inty = __CLC_CONVERT_LONGN(2 - (ny & 1));
-
- expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
-
- __CLC_LONGN ret = __CLC_AS_LONGN(expv);
-
- // Now all the edge cases
- __CLC_BIT_INTN x_is_ninf = ux == (__CLC_LONGN)NINFBITPATT_DP64;
- __CLC_BIT_INTN x_is_pinf = ux == (__CLC_LONGN)PINFBITPATT_DP64;
- ret = (!xpos & (inty == 2)) ? __CLC_AS_LONGN(__CLC_GENTYPE_NAN) : ret;
- __CLC_LONGN xinf =
- xpos ? (__CLC_LONGN)PINFBITPATT_DP64 : (__CLC_LONGN)NINFBITPATT_DP64;
- ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
- ret =
- ((ax == 0L) & !ypos & (inty == 2)) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
- ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret;
- __CLC_LONGN xzero = xpos ? 0L : (__CLC_LONGN)0x8000000000000000L;
- ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
- ret = (x_is_ninf & ypos & (inty == 1)) ? (__CLC_LONGN)NINFBITPATT_DP64 : ret;
- ret = (x_is_ninf & !ypos & (inty == 1)) ? (__CLC_LONGN)0x8000000000000000L
- : ret;
- ret = (x_is_pinf & !ypos) ? 0L : ret;
- ret = (x_is_pinf & ypos) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
- ret = ax > (__CLC_LONGN)PINFBITPATT_DP64 ? ux : ret;
- ret = __CLC_CONVERT_LONGN(ny == 0) ? __CLC_AS_LONGN(__CLC_GENTYPE_NAN) : ret;
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
- __CLC_INTN y) {
- return __CLC_CONVERT_GENTYPE(__clc_rootn(__CLC_CONVERT_FLOATN(x), y));
-}
-
-#endif
diff --git a/libclc/opencl/lib/generic/math/pow.cl b/libclc/opencl/lib/generic/math/pow.cl
index 2fd3a50733ed5..07aa50abe48d6 100644
--- a/libclc/opencl/lib/generic/math/pow.cl
+++ b/libclc/opencl/lib/generic/math/pow.cl
@@ -6,8 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/math/clc_pow.h>
+#include "clc/math/clc_pow.h"
#define __CLC_FUNCTION pow
-#define __CLC_BODY <clc/shared/binary_def.inc>
-#include <clc/math/gentype.inc>
+#define __CLC_BODY "clc/shared/binary_def.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+#undef __CLC_IMPL_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __pow_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_pow_fast
+#define __CLC_BODY "clc/shared/binary_def.inc"
+#include "clc/math/gentype.inc"
diff --git a/libclc/opencl/lib/generic/math/pown.cl b/libclc/opencl/lib/generic/math/pown.cl
index e48bc10a636ab..7f51666530b1a 100644
--- a/libclc/opencl/lib/generic/math/pown.cl
+++ b/libclc/opencl/lib/generic/math/pown.cl
@@ -6,8 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/math/clc_pown.h>
+#include "clc/math/clc_pown.h"
#define __CLC_FUNCTION pown
#define __CLC_BODY <clc/shared/binary_def_with_int_second_arg.inc>
-#include <clc/math/gentype.inc>
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+#undef __CLC_IMPL_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __pown_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_pown_fast
+#define __CLC_BODY <clc/shared/binary_def_with_int_second_arg.inc>
+#include "clc/math/gentype.inc"
diff --git a/libclc/opencl/lib/generic/math/powr.cl b/libclc/opencl/lib/generic/math/powr.cl
index 168e30aa57b08..e74297589a2b9 100644
--- a/libclc/opencl/lib/generic/math/powr.cl
+++ b/libclc/opencl/lib/generic/math/powr.cl
@@ -6,8 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/math/clc_powr.h>
+#include "clc/math/clc_powr.h"
#define __CLC_FUNCTION powr
#define __CLC_BODY <clc/shared/binary_def.inc>
-#include <clc/math/gentype.inc>
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+#undef __CLC_IMPL_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __powr_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_powr_fast
+#define __CLC_BODY <clc/shared/binary_def.inc>
+#include "clc/math/gentype.inc"
diff --git a/libclc/opencl/lib/generic/math/rootn.cl b/libclc/opencl/lib/generic/math/rootn.cl
index 8c9c7f4cc72f5..1c329e49dedaf 100644
--- a/libclc/opencl/lib/generic/math/rootn.cl
+++ b/libclc/opencl/lib/generic/math/rootn.cl
@@ -6,8 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/math/clc_rootn.h>
+#include "clc/math/clc_rootn.h"
#define __CLC_FUNCTION rootn
#define __CLC_BODY <clc/shared/binary_def_with_int_second_arg.inc>
-#include <clc/math/gentype.inc>
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+#undef __CLC_IMPL_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __rootn_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_rootn_fast
+#define __CLC_BODY <clc/shared/binary_def_with_int_second_arg.inc>
+#include "clc/math/gentype.inc"
>From 04dc71ffc20fed0ade86c930da1014c838715693 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 17 Mar 2026 08:12:32 +0100
Subject: [PATCH 2/3] Rename defines
---
libclc/clc/lib/generic/math/clc_pow.cl | 2 +-
libclc/clc/lib/generic/math/clc_pow_base.inc | 32 ++++++++++----------
libclc/clc/lib/generic/math/clc_pown.cl | 2 +-
libclc/clc/lib/generic/math/clc_powr.cl | 2 +-
libclc/clc/lib/generic/math/clc_rootn.cl | 2 +-
5 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/libclc/clc/lib/generic/math/clc_pow.cl b/libclc/clc/lib/generic/math/clc_pow.cl
index 14fbfb68359f5..eba5daa1a1e41 100644
--- a/libclc/clc/lib/generic/math/clc_pow.cl
+++ b/libclc/clc/lib/generic/math/clc_pow.cl
@@ -25,7 +25,7 @@
#include "clc/relational/clc_isinf.h"
#include "clc/relational/clc_isunordered.h"
-#define COMPILING_POW
+#define __CLC_COMPILING_POW
#define __CLC_BODY "clc_pow_base.inc"
#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_pow_base.inc b/libclc/clc/lib/generic/math/clc_pow_base.inc
index 56fdb177a844b..016a506c41487 100644
--- a/libclc/clc/lib/generic/math/clc_pow_base.inc
+++ b/libclc/clc/lib/generic/math/clc_pow_base.inc
@@ -52,7 +52,7 @@
#ifdef __CLC_SCALAR
-#ifdef COMPILING_POW
+#ifdef __CLC_COMPILING_POW
_CLC_OVERLOAD _CLC_CONST static bool is_integer(__CLC_GENTYPE ay) {
return __clc_trunc(ay) == ay;
@@ -76,7 +76,7 @@ static __CLC_GENTYPE fast_expylnx(__CLC_GENTYPE x, __CLC_GENTYPE y) {
return __clc_exp2(y * __clc_log2(ax));
}
-#if defined(COMPILING_POW) || defined(COMPILING_POWR)
+#if defined(__CLC_COMPILING_POW) || defined(__CLC_COMPILING_POWR)
_CLC_CONST
static __CLC_GENTYPE compute_expylnx_float(__CLC_GENTYPE x, __CLC_GENTYPE y) {
@@ -85,7 +85,7 @@ static __CLC_GENTYPE compute_expylnx_float(__CLC_GENTYPE x, __CLC_GENTYPE y) {
}
#endif
-#if defined(COMPILING_POW)
+#if defined(__CLC_COMPILING_POW)
_CLC_CONST
static __CLC_GENTYPE pow_fixup(__CLC_GENTYPE x, __CLC_GENTYPE y,
@@ -139,7 +139,7 @@ __clc_pow_fast(__CLC_GENTYPE x, __CLC_GENTYPE y) {
return pow_fixup(x, y, expylnx);
}
-#elif defined(COMPILING_POWR)
+#elif defined(__CLC_COMPILING_POWR)
_CLC_CONST
static __CLC_GENTYPE powr_fixup(__CLC_GENTYPE x, __CLC_GENTYPE y,
@@ -183,7 +183,7 @@ __clc_powr_fast(__CLC_GENTYPE x, __CLC_GENTYPE y) {
return powr_fixup(x, y, expylnx);
}
-#elif defined(COMPILING_POWN)
+#elif defined(__CLC_COMPILING_POWN)
_CLC_CONST
static __CLC_GENTYPE compute_expylnx_int(__CLC_GENTYPE x, __CLC_INTN ny) {
@@ -226,7 +226,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown_fast(__CLC_GENTYPE x,
return pown_fixup(x, ny, expylnx);
}
-#elif defined(COMPILING_ROOTN)
+#elif defined(__CLC_COMPILING_ROOTN)
// root version of compute_expylnx_int
_CLC_CONST
@@ -278,7 +278,7 @@ __clc_rootn_fast(__CLC_GENTYPE x, __CLC_INTN ny) {
#elif __CLC_FPSIZE == 64
-#if defined(COMPILING_POW)
+#if defined(__CLC_COMPILING_POW)
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
__CLC_GENTYPE y) {
@@ -317,7 +317,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
return ret;
}
-#elif defined(COMPILING_POWR)
+#elif defined(__CLC_COMPILING_POWR)
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
__CLC_GENTYPE y) {
@@ -348,7 +348,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
return ret;
}
-#elif defined(COMPILING_POWN)
+#elif defined(__CLC_COMPILING_POWN)
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
__CLC_INTN ny) {
@@ -373,7 +373,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
return ret;
}
-#elif defined(COMPILING_ROOTN)
+#elif defined(__CLC_COMPILING_ROOTN)
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
__CLC_INTN ny) {
@@ -404,7 +404,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
#elif __CLC_FPSIZE == 16
-#if defined(COMPILING_POW) || defined(COMPILING_POWR)
+#if defined(__CLC_COMPILING_POW) || defined(__CLC_COMPILING_POWR)
_CLC_CONST
static __CLC_GENTYPE compute_expylnx_f16(__CLC_GENTYPE ax, __CLC_GENTYPE y) {
@@ -414,9 +414,9 @@ static __CLC_GENTYPE compute_expylnx_f16(__CLC_GENTYPE ax, __CLC_GENTYPE y) {
return __CLC_CONVERT_GENTYPE(result);
}
-#endif // defined(COMPILING_POW) || defined(COMPILING_POWR)
+#endif // defined(__CLC_COMPILING_POW) || defined(__CLC_COMPILING_POWR)
-#if defined(COMPILING_POW)
+#if defined(__CLC_COMPILING_POW)
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
__CLC_GENTYPE y) {
@@ -454,7 +454,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
return ret;
}
-#elif defined(COMPILING_POWR)
+#elif defined(__CLC_COMPILING_POWR)
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
__CLC_GENTYPE y) {
@@ -482,7 +482,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
return ret;
}
-#elif defined(COMPILING_POWN)
+#elif defined(__CLC_COMPILING_POWN)
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
__CLC_INTN ny) {
@@ -507,7 +507,7 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
return ret;
}
-#elif defined(COMPILING_ROOTN)
+#elif defined(__CLC_COMPILING_ROOTN)
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
__CLC_INTN ny) {
diff --git a/libclc/clc/lib/generic/math/clc_pown.cl b/libclc/clc/lib/generic/math/clc_pown.cl
index c21738de8cf2f..6ddf5fd59b754 100644
--- a/libclc/clc/lib/generic/math/clc_pown.cl
+++ b/libclc/clc/lib/generic/math/clc_pown.cl
@@ -20,7 +20,7 @@
#include "clc/math/clc_trunc.h"
#include "clc/relational/clc_isinf.h"
-#define COMPILING_POWN
+#define __CLC_COMPILING_POWN
#define __CLC_BODY "clc_pow_base.inc"
#include "clc/math/gentype.inc"
#undef __CLC_FUNCTION
diff --git a/libclc/clc/lib/generic/math/clc_powr.cl b/libclc/clc/lib/generic/math/clc_powr.cl
index 287e1a31b5b2a..fc153dfebf9a0 100644
--- a/libclc/clc/lib/generic/math/clc_powr.cl
+++ b/libclc/clc/lib/generic/math/clc_powr.cl
@@ -25,7 +25,7 @@
#include "clc/relational/clc_isinf.h"
#include "clc/relational/clc_isunordered.h"
-#define COMPILING_POWR
+#define __CLC_COMPILING_POWR
#define __CLC_BODY <clc_pow_base.inc>
#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_rootn.cl b/libclc/clc/lib/generic/math/clc_rootn.cl
index 29a51baac87c3..9c50531542cbb 100644
--- a/libclc/clc/lib/generic/math/clc_rootn.cl
+++ b/libclc/clc/lib/generic/math/clc_rootn.cl
@@ -23,7 +23,7 @@
#include "clc/relational/clc_isinf.h"
#include "clc/relational/clc_isunordered.h"
-#define COMPILING_ROOTN
+#define __CLC_COMPILING_ROOTN
#define __CLC_BODY <clc_pow_base.inc>
#include <clc/math/gentype.inc>
>From 49ec3365f786d8541c795bce893dc3259d6263bf Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 17 Mar 2026 08:32:45 +0100
Subject: [PATCH 3/3] New scalarize file
---
.../clc/shared/binary_def_scalarize_loop.inc | 69 +++++++++++++++++++
...def_with_int_second_arg_scalarize_loop.inc | 37 ----------
libclc/clc/lib/generic/math/clc_pown.cl | 10 +--
libclc/clc/lib/generic/math/clc_rootn.cl | 10 +--
4 files changed, 81 insertions(+), 45 deletions(-)
create mode 100644 libclc/clc/include/clc/shared/binary_def_scalarize_loop.inc
delete mode 100644 libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc
diff --git a/libclc/clc/include/clc/shared/binary_def_scalarize_loop.inc b/libclc/clc/include/clc/shared/binary_def_scalarize_loop.inc
new file mode 100644
index 0000000000000..cd79e0f6aafed
--- /dev/null
+++ b/libclc/clc/include/clc/shared/binary_def_scalarize_loop.inc
@@ -0,0 +1,69 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/utils.h"
+
+#ifndef __CLC_IMPL_FUNCTION
+#define __CLC_IMPL_FUNCTION __CLC_FUNCTION
+#endif
+
+#ifndef __CLC_MIN_VECSIZE
+#define __CLC_MIN_VECSIZE 2
+#endif
+
+#ifndef __CLC_RET_TYPE
+#define __CLC_RET_TYPE __CLC_GENTYPE
+#define __CLC_RET_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_ARG1_TYPE
+#define __CLC_ARG1_TYPE __CLC_GENTYPE
+#define __CLC_ARG1_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_ARG2_TYPE
+#define __CLC_ARG2_TYPE __CLC_GENTYPE
+#define __CLC_ARG2_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#if __CLC_MIN_VECSIZE == 1
+
+_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE __CLC_FUNCTION(__CLC_ARG1_TYPE x,
+ __CLC_ARG2_TYPE y) {
+ return __CLC_IMPL_FUNCTION(x, y);
+}
+
+#elif __CLC_VECSIZE_OR_1 >= 2
+
+_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE __CLC_FUNCTION(__CLC_ARG1_TYPE x,
+ __CLC_ARG2_TYPE y) {
+ union {
+ __CLC_ARG1_TYPE vec;
+ __CLC_ARG1_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+ } u_x;
+
+ union {
+ __CLC_ARG2_TYPE vec;
+ __CLC_ARG2_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+ } u_y;
+
+ union {
+ __CLC_RET_TYPE vec;
+ __CLC_RET_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+ } u_result;
+
+ u_x.vec = x;
+ u_y.vec = y;
+ for (int i = 0; i < __CLC_VECSIZE_OR_1; ++i) {
+ u_result.arr[i] = __CLC_IMPL_FUNCTION(u_x.arr[i], u_y.arr[i]);
+ }
+
+ return u_result.vec;
+}
+
+#endif // __CLC_VECSIZE_OR_1 >= 2
diff --git a/libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc b/libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc
deleted file mode 100644
index 498df2fc420de..0000000000000
--- a/libclc/clc/include/clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc
+++ /dev/null
@@ -1,37 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clc/utils.h"
-
-#if __CLC_VECSIZE_OR_1 >= 2
-
-#ifndef __CLC_IMPL_FUNCTION
-#define __CLC_IMPL_FUNCTION __CLC_FUNCTION
-#endif
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
- __CLC_INTN y) {
- union {
- __CLC_GENTYPE vec;
- __CLC_SCALAR_GENTYPE arr[__CLC_VECSIZE_OR_1];
- } u_x, u_result;
-
- union {
- __CLC_INTN vec;
- int arr[__CLC_VECSIZE_OR_1];
- } u_y;
-
- u_x.vec = x;
- u_y.vec = y;
- for (int i = 0; i < __CLC_VECSIZE_OR_1; ++i) {
- u_result.arr[i] = __CLC_IMPL_FUNCTION(u_x.arr[i], u_y.arr[i]);
- }
- return u_result.vec;
-}
-
-#endif // __CLC_VECSIZE_OR_1 >= 2
diff --git a/libclc/clc/lib/generic/math/clc_pown.cl b/libclc/clc/lib/generic/math/clc_pown.cl
index 6ddf5fd59b754..b3d5389b66896 100644
--- a/libclc/clc/lib/generic/math/clc_pown.cl
+++ b/libclc/clc/lib/generic/math/clc_pown.cl
@@ -25,15 +25,17 @@
#include "clc/math/gentype.inc"
#undef __CLC_FUNCTION
+#define __CLC_ARG2_TYPE __CLC_INTN
+#define __CLC_ARG2_SCALAR_TYPE int
#define __CLC_FUNCTION __clc_pown
-#define __CLC_BODY \
- "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
#include "clc/math/gentype.inc"
#undef __CLC_FUNCTION
#define __CLC_FLOAT_ONLY
+#define __CLC_ARG2_TYPE __CLC_INTN
+#define __CLC_ARG2_SCALAR_TYPE int
#define __CLC_FUNCTION __clc_pown_fast
-#define __CLC_BODY \
- "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
#include "clc/math/gentype.inc"
#undef __CLC_FUNCTION
diff --git a/libclc/clc/lib/generic/math/clc_rootn.cl b/libclc/clc/lib/generic/math/clc_rootn.cl
index 9c50531542cbb..00c5a27fc44d5 100644
--- a/libclc/clc/lib/generic/math/clc_rootn.cl
+++ b/libclc/clc/lib/generic/math/clc_rootn.cl
@@ -28,13 +28,15 @@
#include <clc/math/gentype.inc>
#define __CLC_FUNCTION __clc_rootn
-#define __CLC_BODY \
- "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#define __CLC_ARG2_TYPE __CLC_INTN
+#define __CLC_ARG2_SCALAR_TYPE int
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
#include "clc/math/gentype.inc"
#undef __CLC_FUNCTION
#define __CLC_FLOAT_ONLY
+#define __CLC_ARG2_TYPE __CLC_INTN
+#define __CLC_ARG2_SCALAR_TYPE int
#define __CLC_FUNCTION __clc_rootn_fast
-#define __CLC_BODY \
- "clc/shared/binary_def_with_int_second_arg_scalarize_loop.inc"
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
#include "clc/math/gentype.inc"
More information about the cfe-commits
mailing list