[libclc] c8eb865 - [libclc] Move mad to the CLC library (#123607)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Jan 20 08:27:54 PST 2025
Author: Fraser Cormack
Date: 2025-01-20T16:27:51Z
New Revision: c8eb865747ea0006470a0ab130056293fcb536cb
URL: https://github.com/llvm/llvm-project/commit/c8eb865747ea0006470a0ab130056293fcb536cb
DIFF: https://github.com/llvm/llvm-project/commit/c8eb865747ea0006470a0ab130056293fcb536cb.diff
LOG: [libclc] Move mad to the CLC library (#123607)
All targets build `__clc_mad` -- even SPIR-V targets -- since it
compiles to the optimal `llvm.fmuladd` intrinsic. There is no change to
the bytecode generated for non-SPIR-V targets.
The `mix` builtin, which is implemented as a wrapper around `mad`, is
left as an OpenCL-layer wrapper of `__clc_mad`. I don't know if it's
worth having a specific CLC version of `mix`.
The changes to the other CLC files/functions are moving uses of `mad` to
`__clc_mad`, and reformatting. There is an additional instance of
`trunc` becoming `__clc_trunc`, which was missed before.
Added:
libclc/clc/include/clc/math/clc_mad.h
libclc/clc/include/clc/math/ternary_decl.inc
libclc/clc/lib/generic/math/clc_mad.cl
libclc/clc/lib/generic/math/clc_mad.inc
Modified:
libclc/clc/include/clc/clcmacro.h
libclc/clc/lib/clspv/SOURCES
libclc/clc/lib/generic/SOURCES
libclc/clc/lib/spirv/SOURCES
libclc/clc/lib/spirv64/SOURCES
libclc/generic/lib/common/mix.cl
libclc/generic/lib/common/mix.inc
libclc/generic/lib/math/clc_exp10.cl
libclc/generic/lib/math/clc_hypot.cl
libclc/generic/lib/math/clc_pow.cl
libclc/generic/lib/math/clc_pown.cl
libclc/generic/lib/math/clc_powr.cl
libclc/generic/lib/math/clc_rootn.cl
libclc/generic/lib/math/mad.cl
libclc/generic/lib/math/sincos_helpers.cl
libclc/generic/lib/math/sincospiF_piby4.h
Removed:
libclc/generic/include/clc/math/ternary_decl.inc
libclc/generic/lib/math/mad.inc
################################################################################
diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h
index 3c3a69f4f848bbb..44928b2e428bfaa 100644
--- a/libclc/clc/include/clc/clcmacro.h
+++ b/libclc/clc/include/clc/clcmacro.h
@@ -184,6 +184,33 @@
return BUILTIN(x); \
}
+#define _CLC_DEFINE_TERNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, \
+ ARG2_TYPE, ARG3_TYPE) \
+ _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y, \
+ ARG3_TYPE z) { \
+ return BUILTIN(x, y, z); \
+ } \
+ _CLC_DEF _CLC_OVERLOAD RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \
+ ARG3_TYPE##2 z) { \
+ return BUILTIN(x, y, z); \
+ } \
+ _CLC_DEF _CLC_OVERLOAD RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, \
+ ARG3_TYPE##3 z) { \
+ return BUILTIN(x, y, z); \
+ } \
+ _CLC_DEF _CLC_OVERLOAD RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, \
+ ARG3_TYPE##4 z) { \
+ return BUILTIN(x, y, z); \
+ } \
+ _CLC_DEF _CLC_OVERLOAD RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, \
+ ARG3_TYPE##8 z) { \
+ return BUILTIN(x, y, z); \
+ } \
+ _CLC_DEF _CLC_OVERLOAD RET_TYPE##16 FUNCTION( \
+ ARG1_TYPE##16 x, ARG2_TYPE##16 y, ARG3_TYPE##16 z) { \
+ return BUILTIN(x, y, z); \
+ }
+
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
diff --git a/libclc/clc/include/clc/math/clc_mad.h b/libclc/clc/include/clc/math/clc_mad.h
new file mode 100644
index 000000000000000..3eb718e87f3705d
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_mad.h
@@ -0,0 +1,12 @@
+#ifndef __CLC_MATH_CLC_MAD_H__
+#define __CLC_MATH_CLC_MAD_H__
+
+#define __CLC_BODY <clc/math/ternary_decl.inc>
+#define __CLC_FUNCTION __clc_mad
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_MAD_H__
diff --git a/libclc/clc/include/clc/math/ternary_decl.inc b/libclc/clc/include/clc/math/ternary_decl.inc
new file mode 100644
index 000000000000000..6c1507b3fcf74bf
--- /dev/null
+++ b/libclc/clc/include/clc/math/ternary_decl.inc
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
+ __CLC_GENTYPE b,
+ __CLC_GENTYPE c);
diff --git a/libclc/clc/lib/clspv/SOURCES b/libclc/clc/lib/clspv/SOURCES
index e6573f586080cfa..209dc0ca61e2b8b 100644
--- a/libclc/clc/lib/clspv/SOURCES
+++ b/libclc/clc/lib/clspv/SOURCES
@@ -1,6 +1,7 @@
../generic/math/clc_ceil.cl
../generic/math/clc_fabs.cl
../generic/math/clc_floor.cl
+../generic/math/clc_mad.cl
../generic/math/clc_rint.cl
../generic/math/clc_trunc.cl
../generic/shared/clc_clamp.cl
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index d74bff20ba87ba0..877a0a390a7452a 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -7,6 +7,7 @@ integer/clc_abs_
diff .cl
math/clc_ceil.cl
math/clc_fabs.cl
math/clc_floor.cl
+math/clc_mad.cl
math/clc_rint.cl
math/clc_trunc.cl
relational/clc_all.cl
diff --git a/libclc/clc/lib/generic/math/clc_mad.cl b/libclc/clc/lib/generic/math/clc_mad.cl
new file mode 100644
index 000000000000000..58618cf24771772
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_mad.cl
@@ -0,0 +1,4 @@
+#include <clc/internal/clc.h>
+
+#define __CLC_BODY <clc_mad.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/math/clc_mad.inc b/libclc/clc/lib/generic/math/clc_mad.inc
new file mode 100644
index 000000000000000..a4dbdf1bc83a56f
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_mad.inc
@@ -0,0 +1,5 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_mad(__CLC_GENTYPE a, __CLC_GENTYPE b,
+ __CLC_GENTYPE c) {
+#pragma OPENCL FP_CONTRACT ON
+ return a * b + c;
+}
diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES
index ac855ea5184edec..905afa03d8a56c1 100644
--- a/libclc/clc/lib/spirv/SOURCES
+++ b/libclc/clc/lib/spirv/SOURCES
@@ -5,6 +5,7 @@
../generic/math/clc_ceil.cl
../generic/math/clc_fabs.cl
../generic/math/clc_floor.cl
+../generic/math/clc_mad.cl
../generic/math/clc_rint.cl
../generic/math/clc_trunc.cl
../generic/shared/clc_clamp.cl
diff --git a/libclc/clc/lib/spirv64/SOURCES b/libclc/clc/lib/spirv64/SOURCES
index ac855ea5184edec..905afa03d8a56c1 100644
--- a/libclc/clc/lib/spirv64/SOURCES
+++ b/libclc/clc/lib/spirv64/SOURCES
@@ -5,6 +5,7 @@
../generic/math/clc_ceil.cl
../generic/math/clc_fabs.cl
../generic/math/clc_floor.cl
+../generic/math/clc_mad.cl
../generic/math/clc_rint.cl
../generic/math/clc_trunc.cl
../generic/shared/clc_clamp.cl
diff --git a/libclc/generic/include/clc/math/ternary_decl.inc b/libclc/generic/include/clc/math/ternary_decl.inc
deleted file mode 100644
index 0598684ea4069fd..000000000000000
--- a/libclc/generic/include/clc/math/ternary_decl.inc
+++ /dev/null
@@ -1 +0,0 @@
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c);
diff --git a/libclc/generic/lib/common/mix.cl b/libclc/generic/lib/common/mix.cl
index 7f3d5b61497b2e5..756e8619f1b3f24 100644
--- a/libclc/generic/lib/common/mix.cl
+++ b/libclc/generic/lib/common/mix.cl
@@ -1,4 +1,5 @@
#include <clc/clc.h>
+#include <clc/math/clc_mad.h>
#define __CLC_BODY <mix.inc>
#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/common/mix.inc b/libclc/generic/lib/common/mix.inc
index 1e8b936149bbfcb..fd45a810b0ed9d5 100644
--- a/libclc/generic/lib/common/mix.inc
+++ b/libclc/generic/lib/common/mix.inc
@@ -1,9 +1,11 @@
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE a) {
- return mad( y - x, a, x );
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y,
+ __CLC_GENTYPE a) {
+ return __clc_mad(y - x, a, x);
}
#ifndef __CLC_SCALAR
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_SCALAR_GENTYPE a) {
- return mix(x, y, (__CLC_GENTYPE)a);
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y,
+ __CLC_SCALAR_GENTYPE a) {
+ return mix(x, y, (__CLC_GENTYPE)a);
}
#endif
diff --git a/libclc/generic/lib/math/clc_exp10.cl b/libclc/generic/lib/math/clc_exp10.cl
index 6ea8743e39c5f72..572aa396942b7db 100644
--- a/libclc/generic/lib/math/clc_exp10.cl
+++ b/libclc/generic/lib/math/clc_exp10.cl
@@ -22,6 +22,7 @@
#include <clc/clc.h>
#include <clc/clcmacro.h>
+#include <clc/math/clc_mad.h>
#include <clc/relational/clc_isnan.h>
#include "config.h"
@@ -53,98 +54,109 @@
//
// e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
-_CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x)
-{
- const float X_MAX = 0x1.344134p+5f; // 128*log2/log10 : 38.53183944498959
- const float X_MIN = -0x1.66d3e8p+5f; // -149*log2/log10 : -44.8534693539332
-
- const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f; // 64*log10/log2 : 212.6033980727912
- const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f; // log2/(64 * log10) lead : 0.004699707
- const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f; // log2/(64 * log10) tail : 0.00000388665057
- const float R_LN10 = 0x1.26bb1cp+1f;
-
- int return_nan = __clc_isnan(x);
- int return_inf = x > X_MAX;
- int return_zero = x < X_MIN;
-
- int n = convert_int(x * R_64_BY_LOG10_2);
-
- float fn = (float)n;
- int j = n & 0x3f;
- int m = n >> 6;
- int m2 = m << EXPSHIFTBITS_SP32;
- float r;
-
- r = R_LN10 * mad(fn, -R_LOG10_2_BY_64_TL, mad(fn, -R_LOG10_2_BY_64_LD, x));
-
- // Truncated Taylor series for e^r
- float z2 = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
-
- float two_to_jby64 = USE_TABLE(exp_tbl, j);
- z2 = mad(two_to_jby64, z2, two_to_jby64);
-
- float z2s = z2 * as_float(0x1 << (m + 149));
- float z2n = as_float(as_int(z2) + m2);
- z2 = m <= -126 ? z2s : z2n;
-
-
- z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2;
- z2 = return_zero ? 0.0f : z2;
- z2 = return_nan ? x : z2;
- return z2;
+_CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x) {
+ // 128*log2/log10 : 38.53183944498959
+ const float X_MAX = 0x1.344134p+5f;
+ // -149*log2/log10 : -44.8534693539332
+ const float X_MIN = -0x1.66d3e8p+5f;
+ // 64*log10/log2 : 212.6033980727912
+ const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f;
+ // log2/(64 * log10) lead : 0.004699707
+ const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f;
+ // log2/(64 * log10) tail : 0.00000388665057
+ const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f;
+ const float R_LN10 = 0x1.26bb1cp+1f;
+
+ int return_nan = __clc_isnan(x);
+ int return_inf = x > X_MAX;
+ int return_zero = x < X_MIN;
+
+ int n = convert_int(x * R_64_BY_LOG10_2);
+
+ float fn = (float)n;
+ int j = n & 0x3f;
+ int m = n >> 6;
+ int m2 = m << EXPSHIFTBITS_SP32;
+ float r;
+
+ r = R_LN10 *
+ __clc_mad(fn, -R_LOG10_2_BY_64_TL, __clc_mad(fn, -R_LOG10_2_BY_64_LD, x));
+
+ // Truncated Taylor series for e^r
+ float z2 = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f),
+ r, 0x1.000000p-1f),
+ r * r, r);
+
+ float two_to_jby64 = USE_TABLE(exp_tbl, j);
+ z2 = __clc_mad(two_to_jby64, z2, two_to_jby64);
+
+ float z2s = z2 * as_float(0x1 << (m + 149));
+ float z2n = as_float(as_int(z2) + m2);
+ z2 = m <= -126 ? z2s : z2n;
+
+ z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2;
+ z2 = return_zero ? 0.0f : z2;
+ z2 = return_nan ? x : z2;
+ return z2;
}
_CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_exp10, float)
#ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x)
-{
- const double X_MAX = 0x1.34413509f79ffp+8; // 1024*ln(2)/ln(10)
- const double X_MIN = -0x1.434e6420f4374p+8; // -1074*ln(2)/ln(10)
-
- const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7; // 64*ln(10)/ln(2)
- const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8; // head ln(2)/(64*ln(10))
- const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37; // tail ln(2)/(64*ln(10))
- const double R_LN10 = 0x1.26bb1bbb55516p+1; // ln(10)
-
- int n = convert_int(x * R_64_BY_LOG10_2);
-
- double dn = (double)n;
-
- int j = n & 0x3f;
- int m = n >> 6;
-
- double r = R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));
-
- // 6 term tail of Taylor expansion of e^r
- double z2 = r * fma(r,
- fma(r,
- fma(r,
- fma(r,
- fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
- 0x1.5555555555555p-5),
- 0x1.5555555555555p-3),
- 0x1.0000000000000p-1),
- 1.0);
-
- double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
- z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;
-
- int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));
-
- int n1 = m >> 2;
- int n2 = m-n1;
- double z3= z2 * as_double(((long)n1 + 1023) << 52);
- z3 *= as_double(((long)n2 + 1023) << 52);
-
- z2 = ldexp(z2, m);
- z2 = small_value ? z3: z2;
-
- z2 = __clc_isnan(x) ? x : z2;
-
- z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2;
- z2 = x < X_MIN ? 0.0 : z2;
-
- return z2;
+_CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x) {
+ // 1024*ln(2)/ln(10)
+ const double X_MAX = 0x1.34413509f79ffp+8;
+ // -1074*ln(2)/ln(10)
+ const double X_MIN = -0x1.434e6420f4374p+8;
+ // 64*ln(10)/ln(2)
+ const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7;
+ // head ln(2)/(64*ln(10))
+ const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8;
+ // tail ln(2)/(64*ln(10))
+ const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37;
+ // ln(10)
+ const double R_LN10 = 0x1.26bb1bbb55516p+1;
+
+ int n = convert_int(x * R_64_BY_LOG10_2);
+
+ double dn = (double)n;
+
+ int j = n & 0x3f;
+ int m = n >> 6;
+
+ double r =
+ R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));
+
+ // 6 term tail of Taylor expansion of e^r
+ double z2 =
+ r *
+ fma(r,
+ fma(r,
+ fma(r,
+ fma(r, fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
+ 0x1.5555555555555p-5),
+ 0x1.5555555555555p-3),
+ 0x1.0000000000000p-1),
+ 1.0);
+
+ double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+ z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;
+
+ int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));
+
+ int n1 = m >> 2;
+ int n2 = m - n1;
+ double z3 = z2 * as_double(((long)n1 + 1023) << 52);
+ z3 *= as_double(((long)n2 + 1023) << 52);
+
+ z2 = ldexp(z2, m);
+ z2 = small_value ? z3 : z2;
+
+ z2 = __clc_isnan(x) ? x : z2;
+
+ z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2;
+ z2 = x < X_MIN ? 0.0 : z2;
+
+ return z2;
}
_CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_exp10, double)
#endif
diff --git a/libclc/generic/lib/math/clc_hypot.cl b/libclc/generic/lib/math/clc_hypot.cl
index a17e661603fa67c..d889969d6d8c239 100644
--- a/libclc/generic/lib/math/clc_hypot.cl
+++ b/libclc/generic/lib/math/clc_hypot.cl
@@ -23,6 +23,7 @@
#include <clc/clc.h>
#include <clc/clcmacro.h>
#include <clc/integer/clc_abs.h>
+#include <clc/math/clc_mad.h>
#include <clc/relational/clc_isnan.h>
#include <clc/shared/clc_clamp.h>
#include <math/clc_hypot.h>
@@ -48,7 +49,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) {
float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
float fx = as_float(ux) * fi_exp;
float fy = as_float(uy) * fi_exp;
- retval = sqrt(mad(fx, fx, fy * fy)) * fx_exp;
+ retval = sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;
retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval;
retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32
diff --git a/libclc/generic/lib/math/clc_pow.cl b/libclc/generic/lib/math/clc_pow.cl
index 2e2dade0d6b8faa..4abfaf1c10df428 100644
--- a/libclc/generic/lib/math/clc_pow.cl
+++ b/libclc/generic/lib/math/clc_pow.cl
@@ -23,6 +23,7 @@
#include <clc/clc.h>
#include <clc/clcmacro.h>
#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_mad.h>
#include "config.h"
#include "math.h"
@@ -66,334 +67,351 @@
((((expT * poly) + expT) + expH*poly) + expH)
*/
-_CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y)
-{
-
- int ix = as_int(x);
- int ax = ix & EXSIGNBIT_SP32;
- int xpos = ix == ax;
-
- int iy = as_int(y);
- int ay = iy & EXSIGNBIT_SP32;
- int ypos = iy == ay;
-
- /* Extra precise log calculation
- * First handle case that x is close to 1
- */
- float r = 1.0f - as_float(ax);
- int near1 = __clc_fabs(r) < 0x1.0p-4f;
- float r2 = r*r;
-
- /* Coefficients are just 1/3, 1/4, 1/5 and 1/6 */
- float poly = mad(r,
- mad(r,
- mad(r,
- mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
- 0x1.99999ap-3f),
- 0x1.000000p-2f),
- 0x1.555556p-2f);
-
- poly *= r2*r;
-
- float lth_near1 = -r2 * 0.5f;
- float ltt_near1 = -poly;
- float lt_near1 = lth_near1 + ltt_near1;
- float lh_near1 = -r;
- float l_near1 = lh_near1 + lt_near1;
-
- /* Computations for x not near 1 */
- int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
- float mf = (float)m;
- int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
- float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
- int c = m == -127;
- int ixn = c ? ixs : ax;
- float mfn = c ? mfs : mf;
-
- int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
- /* F - Y */
- float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32));
-
- indx = indx >> 16;
- float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
- float rh = f * tv.s0;
- float rt = f * tv.s1;
- r = rh + rt;
-
- poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r);
- poly += (rh - r) + rt;
-
- const float LOG2_HEAD = 0x1.62e000p-1f; /* 0.693115234 */
- const float LOG2_TAIL = 0x1.0bfbe8p-15f; /* 0.0000319461833 */
- tv = USE_TABLE(loge_tbl, indx);
- float lth = -r;
- float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1;
- float lt = lth + ltt;
- float lh = mad(mfn, LOG2_HEAD, tv.s0);
- float l = lh + lt;
-
- /* Select near 1 or not */
- lth = near1 ? lth_near1 : lth;
- ltt = near1 ? ltt_near1 : ltt;
- lt = near1 ? lt_near1 : lt;
- lh = near1 ? lh_near1 : lh;
- l = near1 ? l_near1 : l;
-
- float gh = as_float(as_int(l) & 0xfffff000);
- float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
- float yh = as_float(iy & 0xfffff000);
-
- float yt = y - yh;
-
- float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt));
- float ylogx = mad(yh, gh, ylogx_s);
- float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s;
-
- /* Extra precise exp of ylogx */
- const float R_64_BY_LOG2 = 0x1.715476p+6f; /* 64/log2 : 92.332482616893657 */
- int n = convert_int(ylogx * R_64_BY_LOG2);
- float nf = (float) n;
-
- int j = n & 0x3f;
- m = n >> 6;
- int m2 = m << EXPSHIFTBITS_SP32;
-
- const float R_LOG2_BY_64_LD = 0x1.620000p-7f; /* log2/64 lead: 0.0108032227 */
- const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; /* log2/64 tail: 0.0000272020388 */
- r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t;
-
- /* Truncated Taylor series for e^r */
- poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
-
- tv = USE_TABLE(exp_tbl_ep, j);
-
- float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0;
- float sexpylogx = expylogx * as_float(0x1 << (m + 149));
- float texpylogx = as_float(as_int(expylogx) + m2);
- expylogx = m < -125 ? sexpylogx : texpylogx;
-
- /* Result is +-Inf if (ylogx + ylogx_t) > 128*log2 */
- expylogx = (ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f) ? as_float(PINFBITPATT_SP32) : expylogx;
-
- /* Result is 0 if ylogx < -149*log2 */
- expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
- /* Classify y:
- * inty = 0 means not an integer.
- * inty = 1 means odd integer.
- * inty = 2 means even integer.
- */
-
- int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
- int mask = (1 << (24 - yexp)) - 1;
- int yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
- int inty = yodd ? 1 : 2;
- inty = (iy & mask) != 0 ? 0 : inty;
- inty = yexp < 1 ? 0 : inty;
- inty = yexp > 24 ? 2 : inty;
-
- float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
- expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
- int ret = as_int(expylogx);
-
- /* Corner case handling */
- ret = (!xpos & (inty == 0)) ? QNANBITPATT_SP32 : ret;
- ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
- ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret;
- ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret;
- ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
- int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
- ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
- ret = ((ax == 0) & !ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret;
- int xzero = xpos ? 0 : 0x80000000;
- ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
- ret = ((ax == 0) & ypos & (inty != 1)) ? 0 : ret;
- ret = ((ax == 0) & (iy == NINFBITPATT_SP32)) ? PINFBITPATT_SP32 : ret;
- ret = ((ix == 0xbf800000) & (ay == PINFBITPATT_SP32)) ? 0x3f800000 : ret;
- ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret;
- ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty != 1)) ? 0 : ret;
- ret = ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret;
- ret = ((ix == NINFBITPATT_SP32) & ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret;
- ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
- ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
- ret = (ax > PINFBITPATT_SP32) ? ix : ret;
- ret = (ay > PINFBITPATT_SP32) ? iy : ret;
- ret = ay == 0 ? 0x3f800000 : ret;
- ret = ix == 0x3f800000 ? 0x3f800000 : ret;
-
- return as_float(ret);
+_CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y) {
+
+ int ix = as_int(x);
+ int ax = ix & EXSIGNBIT_SP32;
+ int xpos = ix == ax;
+
+ int iy = as_int(y);
+ int ay = iy & EXSIGNBIT_SP32;
+ int ypos = iy == ay;
+
+ /* Extra precise log calculation
+ * First handle case that x is close to 1
+ */
+ float r = 1.0f - as_float(ax);
+ int near1 = __clc_fabs(r) < 0x1.0p-4f;
+ float r2 = r * r;
+
+ /* Coefficients are just 1/3, 1/4, 1/5 and 1/6 */
+ float poly = __clc_mad(
+ r,
+ __clc_mad(r,
+ __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
+ 0x1.99999ap-3f),
+ 0x1.000000p-2f),
+ 0x1.555556p-2f);
+
+ poly *= r2 * r;
+
+ float lth_near1 = -r2 * 0.5f;
+ float ltt_near1 = -poly;
+ float lt_near1 = lth_near1 + ltt_near1;
+ float lh_near1 = -r;
+ float l_near1 = lh_near1 + lt_near1;
+
+ /* Computations for x not near 1 */
+ int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+ float mf = (float)m;
+ int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
+ float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
+ int c = m == -127;
+ int ixn = c ? ixs : ax;
+ float mfn = c ? mfs : mf;
+
+ int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
+
+ /* F - Y */
+ float f = as_float(0x3f000000 | indx) -
+ as_float(0x3f000000 | (ixn & MANTBITS_SP32));
+
+ indx = indx >> 16;
+ float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
+ float rh = f * tv.s0;
+ float rt = f * tv.s1;
+ r = rh + rt;
+
+ poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
+ (r * r);
+ poly += (rh - r) + rt;
+
+ const float LOG2_HEAD = 0x1.62e000p-1f; /* 0.693115234 */
+ const float LOG2_TAIL = 0x1.0bfbe8p-15f; /* 0.0000319461833 */
+ tv = USE_TABLE(loge_tbl, indx);
+ float lth = -r;
+ float ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + tv.s1;
+ float lt = lth + ltt;
+ float lh = __clc_mad(mfn, LOG2_HEAD, tv.s0);
+ float l = lh + lt;
+
+ /* Select near 1 or not */
+ lth = near1 ? lth_near1 : lth;
+ ltt = near1 ? ltt_near1 : ltt;
+ lt = near1 ? lt_near1 : lt;
+ lh = near1 ? lh_near1 : lh;
+ l = near1 ? l_near1 : l;
+
+ float gh = as_float(as_int(l) & 0xfffff000);
+ float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
+
+ float yh = as_float(iy & 0xfffff000);
+
+ float yt = y - yh;
+
+ float ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
+ float ylogx = __clc_mad(yh, gh, ylogx_s);
+ float ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
+
+ /* Extra precise exp of ylogx */
+ /* 64/log2 : 92.332482616893657 */
+ const float R_64_BY_LOG2 = 0x1.715476p+6f;
+ int n = convert_int(ylogx * R_64_BY_LOG2);
+ float nf = (float)n;
+
+ int j = n & 0x3f;
+ m = n >> 6;
+ int m2 = m << EXPSHIFTBITS_SP32;
+
+ /* log2/64 lead: 0.0108032227 */
+ const float R_LOG2_BY_64_LD = 0x1.620000p-7f;
+ /* log2/64 tail: 0.0000272020388 */
+ const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
+ r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
+ ylogx_t;
+
+ /* Truncated Taylor series for e^r */
+ poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
+ 0x1.000000p-1f),
+ r * r, r);
+
+ tv = USE_TABLE(exp_tbl_ep, j);
+
+ float expylogx =
+ __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0;
+ float sexpylogx = expylogx * as_float(0x1 << (m + 149));
+ float texpylogx = as_float(as_int(expylogx) + m2);
+ expylogx = m < -125 ? sexpylogx : texpylogx;
+
+ /* Result is +-Inf if (ylogx + ylogx_t) > 128*log2 */
+ expylogx = (ylogx > 0x1.62e430p+6f) |
+ (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)
+ ? as_float(PINFBITPATT_SP32)
+ : expylogx;
+
+ /* Result is 0 if ylogx < -149*log2 */
+ expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
+
+ /* Classify y:
+ * inty = 0 means not an integer.
+ * inty = 1 means odd integer.
+ * inty = 2 means even integer.
+ */
+
+ int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
+ int mask = (1 << (24 - yexp)) - 1;
+ int yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
+ int inty = yodd ? 1 : 2;
+ inty = (iy & mask) != 0 ? 0 : inty;
+ inty = yexp < 1 ? 0 : inty;
+ inty = yexp > 24 ? 2 : inty;
+
+ float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
+ expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
+ int ret = as_int(expylogx);
+
+ /* Corner case handling */
+ ret = (!xpos & (inty == 0)) ? QNANBITPATT_SP32 : ret;
+ ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+ ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret;
+ ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret;
+ ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+ int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
+ ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
+ ret = ((ax == 0) & !ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret;
+ int xzero = xpos ? 0 : 0x80000000;
+ ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
+ ret = ((ax == 0) & ypos & (inty != 1)) ? 0 : ret;
+ ret = ((ax == 0) & (iy == NINFBITPATT_SP32)) ? PINFBITPATT_SP32 : ret;
+ ret = ((ix == 0xbf800000) & (ay == PINFBITPATT_SP32)) ? 0x3f800000 : ret;
+ ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret;
+ ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty != 1)) ? 0 : ret;
+ ret =
+ ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret;
+ ret =
+ ((ix == NINFBITPATT_SP32) & ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret;
+ ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
+ ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
+ ret = (ax > PINFBITPATT_SP32) ? ix : ret;
+ ret = (ay > PINFBITPATT_SP32) ? iy : ret;
+ ret = ay == 0 ? 0x3f800000 : ret;
+ ret = ix == 0x3f800000 ? 0x3f800000 : ret;
+
+ return as_float(ret);
}
_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_pow, float, float)
#ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y)
-{
- const double real_log2_tail = 5.76999904754328540596e-08;
- const double real_log2_lead = 6.93147122859954833984e-01;
-
- long ux = as_long(x);
- long ax = ux & (~SIGNBIT_DP64);
- int xpos = ax == ux;
-
- long uy = as_long(y);
- long ay = uy & (~SIGNBIT_DP64);
- int ypos = ay == uy;
-
- // Extended precision log
- double v, vt;
- {
- int exp = (int)(ax >> 52) - 1023;
- int mask_exp_1023 = exp == -1023;
- double xexp = (double) exp;
- long mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
- long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
- exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
- double xexp1 = (double) exp;
- long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
- xexp = mask_exp_1023 ? xexp1 : xexp;
- mantissa = mask_exp_1023 ? mantissa1 : mantissa;
-
- long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1);
- int index = rax >> 44;
-
- double F = as_double(rax | 0x3FE0000000000000L);
- double Y = as_double(mantissa | 0x3FE0000000000000L);
- double f = F - Y;
- double2 tv = USE_TABLE(log_f_inv_tbl, index);
- double log_h = tv.s0;
- double log_t = tv.s1;
- double f_inv = (log_h + log_t) * f;
- double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
- double r2 = fma(-F, r1, f) * (log_h + log_t);
- double r = r1 + r2;
-
- double poly = fma(r,
- fma(r,
- fma(r,
- fma(r, 1.0/7.0, 1.0/6.0),
- 1.0/5.0),
- 1.0/4.0),
- 1.0/3.0);
- poly = poly * r * r * r;
-
- double hr1r1 = 0.5*r1*r1;
- double poly0h = r1 + hr1r1;
- double poly0t = r1 - poly0h + hr1r1;
- poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t;
-
- tv = USE_TABLE(powlog_tbl, index);
- log_h = tv.s0;
- log_t = tv.s1;
-
- double resT_t = fma(xexp, real_log2_tail, + log_t) - poly;
- double resT = resT_t - poly0h;
- double resH = fma(xexp, real_log2_lead, log_h);
- double resT_h = poly0h;
-
- double H = resT + resH;
- double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
- double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
- H = H_h;
-
- double y_head = as_double(uy & 0xfffffffff8000000L);
- double y_tail = y - y_head;
-
- double temp = fma(y_tail, H, fma(y_head, T, y_tail*T));
- v = fma(y_head, H, temp);
- vt = fma(y_head, H, -v) + temp;
- }
-
- // Now calculate exp of (v,vt)
-
- double expv;
- {
- const double max_exp_arg = 709.782712893384;
- const double min_exp_arg = -745.1332191019411;
- const double sixtyfour_by_lnof2 = 92.33248261689366;
- const double lnof2_by_64_head = 0.010830424260348081;
- const double lnof2_by_64_tail = -4.359010638708991e-10;
-
- double temp = v * sixtyfour_by_lnof2;
- int n = (int)temp;
- double dn = (double)n;
- int j = n & 0x0000003f;
- int m = n >> 6;
-
- double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
- double f1 = tv.s0;
- double f2 = tv.s1;
- double f = f1 + f2;
-
- double r1 = fma(dn, -lnof2_by_64_head, v);
- double r2 = dn * lnof2_by_64_tail;
- double r = (r1 + r2) + vt;
-
- double q = fma(r,
- fma(r,
- fma(r,
- fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
- 4.16666666662260795726e-02),
- 1.66666666665260878863e-01),
- 5.00000000000000008883e-01);
- q = fma(r*r, q, r);
-
- expv = fma(f, q, f2) + f1;
- expv = ldexp(expv, m);
-
- expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
- expv = v < min_exp_arg ? 0.0 : expv;
- }
-
- // See whether y is an integer.
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- int inty;
- {
- int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
- inty = yexp < 1 ? 0 : 2;
- inty = yexp > 53 ? 2 : inty;
- long mask = (1L << (53 - yexp)) - 1L;
- int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2;
- inty1 = (ay & mask) != 0 ? 0 : inty1;
- inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty;
- }
-
- expv *= (inty == 1) & !xpos ? -1.0 : 1.0;
-
- long ret = as_long(expv);
-
- // Now all the edge cases
- ret = !xpos & (inty == 0) ? QNANBITPATT_DP64 : ret;
- ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
- ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret;
- ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret;
- ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
- long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
- ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
- ret = ((ax == 0L) & !ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret;
- long xzero = xpos ? 0L : 0x8000000000000000L;
- ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
- ret = ((ax == 0L) & ypos & (inty != 1)) ? 0L : ret;
- ret = ((ax == 0L) & (uy == NINFBITPATT_DP64)) ? PINFBITPATT_DP64 : ret;
- ret = ((ux == 0xbff0000000000000L) & (ay == PINFBITPATT_DP64)) ? 0x3ff0000000000000L : ret;
- ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L : ret;
- ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty != 1)) ? 0L : ret;
- ret = ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret;
- ret = ((ux == NINFBITPATT_DP64) & ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret;
- ret = (ux == PINFBITPATT_DP64) & !ypos ? 0L : ret;
- ret = (ux == PINFBITPATT_DP64) & ypos ? PINFBITPATT_DP64 : ret;
- ret = ax > PINFBITPATT_DP64 ? ux : ret;
- ret = ay > PINFBITPATT_DP64 ? uy : ret;
- ret = ay == 0L ? 0x3ff0000000000000L : ret;
- ret = ux == 0x3ff0000000000000L ? 0x3ff0000000000000L : ret;
-
- return as_double(ret);
+_CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) {
+ const double real_log2_tail = 5.76999904754328540596e-08;
+ const double real_log2_lead = 6.93147122859954833984e-01;
+
+ long ux = as_long(x);
+ long ax = ux & (~SIGNBIT_DP64);
+ int xpos = ax == ux;
+
+ long uy = as_long(y);
+ long ay = uy & (~SIGNBIT_DP64);
+ int ypos = ay == uy;
+
+ // Extended precision log
+ double v, vt;
+ {
+ int exp = (int)(ax >> 52) - 1023;
+ int mask_exp_1023 = exp == -1023;
+ double xexp = (double)exp;
+ long mantissa = ax & 0x000FFFFFFFFFFFFFL;
+
+ long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
+ exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
+ double xexp1 = (double)exp;
+ long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
+
+ xexp = mask_exp_1023 ? xexp1 : xexp;
+ mantissa = mask_exp_1023 ? mantissa1 : mantissa;
+
+ long rax = (mantissa & 0x000ff00000000000) +
+ ((mantissa & 0x0000080000000000) << 1);
+ int index = rax >> 44;
+
+ double F = as_double(rax | 0x3FE0000000000000L);
+ double Y = as_double(mantissa | 0x3FE0000000000000L);
+ double f = F - Y;
+ double2 tv = USE_TABLE(log_f_inv_tbl, index);
+ double log_h = tv.s0;
+ double log_t = tv.s1;
+ double f_inv = (log_h + log_t) * f;
+ double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
+ double r2 = fma(-F, r1, f) * (log_h + log_t);
+ double r = r1 + r2;
+
+ double poly = fma(
+ r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+ 1.0 / 3.0);
+ poly = poly * r * r * r;
+
+ double hr1r1 = 0.5 * r1 * r1;
+ double poly0h = r1 + hr1r1;
+ double poly0t = r1 - poly0h + hr1r1;
+ poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+
+ tv = USE_TABLE(powlog_tbl, index);
+ log_h = tv.s0;
+ log_t = tv.s1;
+
+ double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+ double resT = resT_t - poly0h;
+ double resH = fma(xexp, real_log2_lead, log_h);
+ double resT_h = poly0h;
+
+ double H = resT + resH;
+ double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
+ double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
+ H = H_h;
+
+ double y_head = as_double(uy & 0xfffffffff8000000L);
+ double y_tail = y - y_head;
+
+ double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
+ v = fma(y_head, H, temp);
+ vt = fma(y_head, H, -v) + temp;
+ }
+
+ // Now calculate exp of (v,vt)
+
+ double expv;
+ {
+ const double max_exp_arg = 709.782712893384;
+ const double min_exp_arg = -745.1332191019411;
+ const double sixtyfour_by_lnof2 = 92.33248261689366;
+ const double lnof2_by_64_head = 0.010830424260348081;
+ const double lnof2_by_64_tail = -4.359010638708991e-10;
+
+ double temp = v * sixtyfour_by_lnof2;
+ int n = (int)temp;
+ double dn = (double)n;
+ int j = n & 0x0000003f;
+ int m = n >> 6;
+
+ double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+ double f1 = tv.s0;
+ double f2 = tv.s1;
+ double f = f1 + f2;
+
+ double r1 = fma(dn, -lnof2_by_64_head, v);
+ double r2 = dn * lnof2_by_64_tail;
+ double r = (r1 + r2) + vt;
+
+ double q = fma(
+ r,
+ fma(r,
+ fma(r,
+ fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+ 4.16666666662260795726e-02),
+ 1.66666666665260878863e-01),
+ 5.00000000000000008883e-01);
+ q = fma(r * r, q, r);
+
+ expv = fma(f, q, f2) + f1;
+ expv = ldexp(expv, m);
+
+ expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
+ expv = v < min_exp_arg ? 0.0 : expv;
+ }
+
+ // See whether y is an integer.
+ // inty = 0 means not an integer.
+ // inty = 1 means odd integer.
+ // inty = 2 means even integer.
+
+ int inty;
+ {
+ int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
+ inty = yexp < 1 ? 0 : 2;
+ inty = yexp > 53 ? 2 : inty;
+ long mask = (1L << (53 - yexp)) - 1L;
+ int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2;
+ inty1 = (ay & mask) != 0 ? 0 : inty1;
+ inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty;
+ }
+
+ expv *= (inty == 1) & !xpos ? -1.0 : 1.0;
+
+ long ret = as_long(expv);
+
+ // Now all the edge cases
+ ret = !xpos & (inty == 0) ? QNANBITPATT_DP64 : ret;
+ ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64
+ : ret;
+ ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret;
+ ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret;
+ ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64
+ : ret;
+ long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
+ ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
+ ret = ((ax == 0L) & !ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret;
+ long xzero = xpos ? 0L : 0x8000000000000000L;
+ ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
+ ret = ((ax == 0L) & ypos & (inty != 1)) ? 0L : ret;
+ ret = ((ax == 0L) & (uy == NINFBITPATT_DP64)) ? PINFBITPATT_DP64 : ret;
+ ret = ((ux == 0xbff0000000000000L) & (ay == PINFBITPATT_DP64))
+ ? 0x3ff0000000000000L
+ : ret;
+ ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L
+ : ret;
+ ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty != 1)) ? 0L : ret;
+ ret =
+ ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret;
+ ret =
+ ((ux == NINFBITPATT_DP64) & ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret;
+ ret = (ux == PINFBITPATT_DP64) & !ypos ? 0L : ret;
+ ret = (ux == PINFBITPATT_DP64) & ypos ? PINFBITPATT_DP64 : ret;
+ ret = ax > PINFBITPATT_DP64 ? ux : ret;
+ ret = ay > PINFBITPATT_DP64 ? uy : ret;
+ ret = ay == 0L ? 0x3ff0000000000000L : ret;
+ ret = ux == 0x3ff0000000000000L ? 0x3ff0000000000000L : ret;
+
+ return as_double(ret);
}
_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_pow, double, double)
#endif
diff --git a/libclc/generic/lib/math/clc_pown.cl b/libclc/generic/lib/math/clc_pown.cl
index 031bf9b25e6a1ae..c02089266460274 100644
--- a/libclc/generic/lib/math/clc_pown.cl
+++ b/libclc/generic/lib/math/clc_pown.cl
@@ -23,6 +23,7 @@
#include <clc/clc.h>
#include <clc/clcmacro.h>
#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_mad.h>
#include "config.h"
#include "math.h"
@@ -64,308 +65,321 @@
// At the end of exp, do
// ((((expT * poly) + expT) + expH*poly) + expH)
-_CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny)
-{
- float y = (float)ny;
-
- int ix = as_int(x);
- int ax = ix & EXSIGNBIT_SP32;
- int xpos = ix == ax;
-
- int iy = as_int(y);
- int ay = iy & EXSIGNBIT_SP32;
- int ypos = iy == ay;
-
- // Extra precise log calculation
- // First handle case that x is close to 1
- float r = 1.0f - as_float(ax);
- int near1 = __clc_fabs(r) < 0x1.0p-4f;
- float r2 = r*r;
-
- // Coefficients are just 1/3, 1/4, 1/5 and 1/6
- float poly = mad(r,
- mad(r,
- mad(r,
- mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
- 0x1.99999ap-3f),
- 0x1.000000p-2f),
- 0x1.555556p-2f);
-
- poly *= r2*r;
-
- float lth_near1 = -r2 * 0.5f;
- float ltt_near1 = -poly;
- float lt_near1 = lth_near1 + ltt_near1;
- float lh_near1 = -r;
- float l_near1 = lh_near1 + lt_near1;
-
- // Computations for x not near 1
- int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
- float mf = (float)m;
- int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
- float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
- int c = m == -127;
- int ixn = c ? ixs : ax;
- float mfn = c ? mfs : mf;
-
- int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
- // F - Y
- float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32));
-
- indx = indx >> 16;
- float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
- float rh = f * tv.s0;
- float rt = f * tv.s1;
- r = rh + rt;
-
- poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r);
- poly += (rh - r) + rt;
-
- const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
- const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
- tv = USE_TABLE(loge_tbl, indx);
- float lth = -r;
- float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1;
- float lt = lth + ltt;
- float lh = mad(mfn, LOG2_HEAD, tv.s0);
- float l = lh + lt;
-
- // Select near 1 or not
- lth = near1 ? lth_near1 : lth;
- ltt = near1 ? ltt_near1 : ltt;
- lt = near1 ? lt_near1 : lt;
- lh = near1 ? lh_near1 : lh;
- l = near1 ? l_near1 : l;
-
- float gh = as_float(as_int(l) & 0xfffff000);
- float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
- float yh = as_float(iy & 0xfffff000);
-
- float yt = (float)(ny - (int)yh);
-
- float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt));
- float ylogx = mad(yh, gh, ylogx_s);
- float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s;
-
- // Extra precise exp of ylogx
- const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657
- int n = convert_int(ylogx * R_64_BY_LOG2);
- float nf = (float) n;
-
- int j = n & 0x3f;
- m = n >> 6;
- int m2 = m << EXPSHIFTBITS_SP32;
-
- const float R_LOG2_BY_64_LD = 0x1.620000p-7f; // log2/64 lead: 0.0108032227
- const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388
- r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t;
-
- // Truncated Taylor series for e^r
- poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
-
- tv = USE_TABLE(exp_tbl_ep, j);
-
- float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0;
- float sexpylogx = expylogx * as_float(0x1 << (m + 149));
- float texpylogx = as_float(as_int(expylogx) + m2);
- expylogx = m < -125 ? sexpylogx : texpylogx;
-
- // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
- expylogx = ((ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) ? as_float(PINFBITPATT_SP32) : expylogx;
-
- // Result is 0 if ylogx < -149*log2
- expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
- // Classify y:
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- int inty = 2 - (ny & 1);
-
- float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
- expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
- int ret = as_int(expylogx);
-
- // Corner case handling
- int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
- ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
- ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret;
- ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret;
- int xzero = !xpos ? 0x80000000 : 0L;
- ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
- ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret;
- ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty != 1)) ? 0 : ret;
- ret = ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret;
- ret = ((ix == NINFBITPATT_SP32) & ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret;
- ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
- ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
- ret = ax > PINFBITPATT_SP32 ? ix : ret;
- ret = ny == 0 ? 0x3f800000 : ret;
-
- return as_float(ret);
+_CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny) {
+ float y = (float)ny;
+
+ int ix = as_int(x);
+ int ax = ix & EXSIGNBIT_SP32;
+ int xpos = ix == ax;
+
+ int iy = as_int(y);
+ int ay = iy & EXSIGNBIT_SP32;
+ int ypos = iy == ay;
+
+ // Extra precise log calculation
+ // First handle case that x is close to 1
+ float r = 1.0f - as_float(ax);
+ int near1 = __clc_fabs(r) < 0x1.0p-4f;
+ float r2 = r * r;
+
+ // Coefficients are just 1/3, 1/4, 1/5 and 1/6
+ float poly = __clc_mad(
+ r,
+ __clc_mad(r,
+ __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
+ 0x1.99999ap-3f),
+ 0x1.000000p-2f),
+ 0x1.555556p-2f);
+
+ poly *= r2 * r;
+
+ float lth_near1 = -r2 * 0.5f;
+ float ltt_near1 = -poly;
+ float lt_near1 = lth_near1 + ltt_near1;
+ float lh_near1 = -r;
+ float l_near1 = lh_near1 + lt_near1;
+
+ // Computations for x not near 1
+ int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+ float mf = (float)m;
+ int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
+ float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
+ int c = m == -127;
+ int ixn = c ? ixs : ax;
+ float mfn = c ? mfs : mf;
+
+ int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
+
+ // F - Y
+ float f = as_float(0x3f000000 | indx) -
+ as_float(0x3f000000 | (ixn & MANTBITS_SP32));
+
+ indx = indx >> 16;
+ float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
+ float rh = f * tv.s0;
+ float rt = f * tv.s1;
+ r = rh + rt;
+
+ poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
+ (r * r);
+ poly += (rh - r) + rt;
+
+ const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
+ const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
+ tv = USE_TABLE(loge_tbl, indx);
+ float lth = -r;
+ float ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + tv.s1;
+ float lt = lth + ltt;
+ float lh = __clc_mad(mfn, LOG2_HEAD, tv.s0);
+ float l = lh + lt;
+
+ // Select near 1 or not
+ lth = near1 ? lth_near1 : lth;
+ ltt = near1 ? ltt_near1 : ltt;
+ lt = near1 ? lt_near1 : lt;
+ lh = near1 ? lh_near1 : lh;
+ l = near1 ? l_near1 : l;
+
+ float gh = as_float(as_int(l) & 0xfffff000);
+ float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
+
+ float yh = as_float(iy & 0xfffff000);
+
+ float yt = (float)(ny - (int)yh);
+
+ float ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
+ float ylogx = __clc_mad(yh, gh, ylogx_s);
+ float ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
+
+ // Extra precise exp of ylogx
+ // 64/log2 : 92.332482616893657
+ const float R_64_BY_LOG2 = 0x1.715476p+6f;
+ int n = convert_int(ylogx * R_64_BY_LOG2);
+ float nf = (float)n;
+
+ int j = n & 0x3f;
+ m = n >> 6;
+ int m2 = m << EXPSHIFTBITS_SP32;
+
+ // log2/64 lead: 0.0108032227
+ const float R_LOG2_BY_64_LD = 0x1.620000p-7f;
+ // log2/64 tail: 0.0000272020388
+ const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
+ r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
+ ylogx_t;
+
+ // Truncated Taylor series for e^r
+ poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
+ 0x1.000000p-1f),
+ r * r, r);
+
+ tv = USE_TABLE(exp_tbl_ep, j);
+
+ float expylogx =
+ __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0;
+ float sexpylogx = expylogx * as_float(0x1 << (m + 149));
+ float texpylogx = as_float(as_int(expylogx) + m2);
+ expylogx = m < -125 ? sexpylogx : texpylogx;
+
+ // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
+ expylogx = ((ylogx > 0x1.62e430p+6f) |
+ (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f))
+ ? as_float(PINFBITPATT_SP32)
+ : expylogx;
+
+ // Result is 0 if ylogx < -149*log2
+ expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
+
+ // Classify y:
+ // inty = 0 means not an integer.
+ // inty = 1 means odd integer.
+ // inty = 2 means even integer.
+
+ int inty = 2 - (ny & 1);
+
+ float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
+ expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
+ int ret = as_int(expylogx);
+
+ // Corner case handling
+ int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
+ ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
+ ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret;
+ ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret;
+ int xzero = !xpos ? 0x80000000 : 0L;
+ ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
+ ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret;
+ ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty != 1)) ? 0 : ret;
+ ret =
+ ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret;
+ ret =
+ ((ix == NINFBITPATT_SP32) & ypos & (inty != 1)) ? PINFBITPATT_SP32 : ret;
+ ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
+ ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
+ ret = ax > PINFBITPATT_SP32 ? ix : ret;
+ ret = ny == 0 ? 0x3f800000 : ret;
+
+ return as_float(ret);
}
_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_pown, float, int)
#ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny)
-{
- const double real_log2_tail = 5.76999904754328540596e-08;
- const double real_log2_lead = 6.93147122859954833984e-01;
-
- double y = (double) ny;
-
- long ux = as_long(x);
- long ax = ux & (~SIGNBIT_DP64);
- int xpos = ax == ux;
-
- long uy = as_long(y);
- long ay = uy & (~SIGNBIT_DP64);
- int ypos = ay == uy;
-
- // Extended precision log
- double v, vt;
- {
- int exp = (int)(ax >> 52) - 1023;
- int mask_exp_1023 = exp == -1023;
- double xexp = (double) exp;
- long mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
- long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
- exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
- double xexp1 = (double) exp;
- long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
- xexp = mask_exp_1023 ? xexp1 : xexp;
- mantissa = mask_exp_1023 ? mantissa1 : mantissa;
-
- long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1);
- int index = rax >> 44;
-
- double F = as_double(rax | 0x3FE0000000000000L);
- double Y = as_double(mantissa | 0x3FE0000000000000L);
- double f = F - Y;
- double2 tv = USE_TABLE(log_f_inv_tbl, index);
- double log_h = tv.s0;
- double log_t = tv.s1;
- double f_inv = (log_h + log_t) * f;
- double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
- double r2 = fma(-F, r1, f) * (log_h + log_t);
- double r = r1 + r2;
-
- double poly = fma(r,
- fma(r,
- fma(r,
- fma(r, 1.0/7.0, 1.0/6.0),
- 1.0/5.0),
- 1.0/4.0),
- 1.0/3.0);
- poly = poly * r * r * r;
-
- double hr1r1 = 0.5*r1*r1;
- double poly0h = r1 + hr1r1;
- double poly0t = r1 - poly0h + hr1r1;
- poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t;
-
- tv = USE_TABLE(powlog_tbl, index);
- log_h = tv.s0;
- log_t = tv.s1;
-
- double resT_t = fma(xexp, real_log2_tail, + log_t) - poly;
- double resT = resT_t - poly0h;
- double resH = fma(xexp, real_log2_lead, log_h);
- double resT_h = poly0h;
-
- double H = resT + resH;
- double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
- double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
- H = H_h;
-
- double y_head = as_double(uy & 0xfffffffff8000000L);
- double y_tail = y - y_head;
-
- int mask_2_24 = ay > 0x4170000000000000; // 2^24
- int nyh = convert_int(y_head);
- int nyt = ny - nyh;
- double y_tail1 = (double)nyt;
- y_tail = mask_2_24 ? y_tail1 : y_tail;
-
- double temp = fma(y_tail, H, fma(y_head, T, y_tail*T));
- v = fma(y_head, H, temp);
- vt = fma(y_head, H, -v) + temp;
- }
-
- // Now calculate exp of (v,vt)
-
- double expv;
- {
- const double max_exp_arg = 709.782712893384;
- const double min_exp_arg = -745.1332191019411;
- const double sixtyfour_by_lnof2 = 92.33248261689366;
- const double lnof2_by_64_head = 0.010830424260348081;
- const double lnof2_by_64_tail = -4.359010638708991e-10;
-
- double temp = v * sixtyfour_by_lnof2;
- int n = (int)temp;
- double dn = (double)n;
- int j = n & 0x0000003f;
- int m = n >> 6;
-
- double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
- double f1 = tv.s0;
- double f2 = tv.s1;
- double f = f1 + f2;
-
- double r1 = fma(dn, -lnof2_by_64_head, v);
- double r2 = dn * lnof2_by_64_tail;
- double r = (r1 + r2) + vt;
-
- double q = fma(r,
- fma(r,
- fma(r,
- fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
- 4.16666666662260795726e-02),
- 1.66666666665260878863e-01),
- 5.00000000000000008883e-01);
- q = fma(r*r, q, r);
-
- expv = fma(f, q, f2) + f1;
- expv = ldexp(expv, m);
-
- expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
- expv = v < min_exp_arg ? 0.0 : expv;
- }
-
- // See whether y is an integer.
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- int inty = 2 - (ny & 1);
-
- expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
-
- long ret = as_long(expv);
-
- // Now all the edge cases
- long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
- ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
- ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret;
- ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret;
- long xzero = !xpos ? 0x8000000000000000L : 0L;
- ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
- ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L : ret;
- ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty != 1)) ? 0L : ret;
- ret = ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret;
- ret = ((ux == NINFBITPATT_DP64) & ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret;
- ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret;
- ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret;
- ret = ax > PINFBITPATT_DP64 ? ux : ret;
- ret = ny == 0 ? 0x3ff0000000000000L : ret;
-
- return as_double(ret);
+_CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) {
+ const double real_log2_tail = 5.76999904754328540596e-08;
+ const double real_log2_lead = 6.93147122859954833984e-01;
+
+ double y = (double)ny;
+
+ long ux = as_long(x);
+ long ax = ux & (~SIGNBIT_DP64);
+ int xpos = ax == ux;
+
+ long uy = as_long(y);
+ long ay = uy & (~SIGNBIT_DP64);
+ int ypos = ay == uy;
+
+ // Extended precision log
+ double v, vt;
+ {
+ int exp = (int)(ax >> 52) - 1023;
+ int mask_exp_1023 = exp == -1023;
+ double xexp = (double)exp;
+ long mantissa = ax & 0x000FFFFFFFFFFFFFL;
+
+ long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
+ exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
+ double xexp1 = (double)exp;
+ long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
+
+ xexp = mask_exp_1023 ? xexp1 : xexp;
+ mantissa = mask_exp_1023 ? mantissa1 : mantissa;
+
+ long rax = (mantissa & 0x000ff00000000000) +
+ ((mantissa & 0x0000080000000000) << 1);
+ int index = rax >> 44;
+
+ double F = as_double(rax | 0x3FE0000000000000L);
+ double Y = as_double(mantissa | 0x3FE0000000000000L);
+ double f = F - Y;
+ double2 tv = USE_TABLE(log_f_inv_tbl, index);
+ double log_h = tv.s0;
+ double log_t = tv.s1;
+ double f_inv = (log_h + log_t) * f;
+ double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
+ double r2 = fma(-F, r1, f) * (log_h + log_t);
+ double r = r1 + r2;
+
+ double poly = fma(
+ r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+ 1.0 / 3.0);
+ poly = poly * r * r * r;
+
+ double hr1r1 = 0.5 * r1 * r1;
+ double poly0h = r1 + hr1r1;
+ double poly0t = r1 - poly0h + hr1r1;
+ poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+
+ tv = USE_TABLE(powlog_tbl, index);
+ log_h = tv.s0;
+ log_t = tv.s1;
+
+ double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+ double resT = resT_t - poly0h;
+ double resH = fma(xexp, real_log2_lead, log_h);
+ double resT_h = poly0h;
+
+ double H = resT + resH;
+ double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
+ double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
+ H = H_h;
+
+ double y_head = as_double(uy & 0xfffffffff8000000L);
+ double y_tail = y - y_head;
+
+ int mask_2_24 = ay > 0x4170000000000000; // 2^24
+ int nyh = convert_int(y_head);
+ int nyt = ny - nyh;
+ double y_tail1 = (double)nyt;
+ y_tail = mask_2_24 ? y_tail1 : y_tail;
+
+ double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
+ v = fma(y_head, H, temp);
+ vt = fma(y_head, H, -v) + temp;
+ }
+
+ // Now calculate exp of (v,vt)
+
+ double expv;
+ {
+ const double max_exp_arg = 709.782712893384;
+ const double min_exp_arg = -745.1332191019411;
+ const double sixtyfour_by_lnof2 = 92.33248261689366;
+ const double lnof2_by_64_head = 0.010830424260348081;
+ const double lnof2_by_64_tail = -4.359010638708991e-10;
+
+ double temp = v * sixtyfour_by_lnof2;
+ int n = (int)temp;
+ double dn = (double)n;
+ int j = n & 0x0000003f;
+ int m = n >> 6;
+
+ double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+ double f1 = tv.s0;
+ double f2 = tv.s1;
+ double f = f1 + f2;
+
+ double r1 = fma(dn, -lnof2_by_64_head, v);
+ double r2 = dn * lnof2_by_64_tail;
+ double r = (r1 + r2) + vt;
+
+ double q = fma(
+ r,
+ fma(r,
+ fma(r,
+ fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+ 4.16666666662260795726e-02),
+ 1.66666666665260878863e-01),
+ 5.00000000000000008883e-01);
+ q = fma(r * r, q, r);
+
+ expv = fma(f, q, f2) + f1;
+ expv = ldexp(expv, m);
+
+ expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
+ expv = v < min_exp_arg ? 0.0 : expv;
+ }
+
+ // See whether y is an integer.
+ // inty = 0 means not an integer.
+ // inty = 1 means odd integer.
+ // inty = 2 means even integer.
+
+ int inty = 2 - (ny & 1);
+
+ expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
+
+ long ret = as_long(expv);
+
+ // Now all the edge cases
+ long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
+ ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
+ ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret;
+ ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret;
+ long xzero = !xpos ? 0x8000000000000000L : 0L;
+ ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
+ ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L
+ : ret;
+ ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty != 1)) ? 0L : ret;
+ ret =
+ ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret;
+ ret =
+ ((ux == NINFBITPATT_DP64) & ypos & (inty != 1)) ? PINFBITPATT_DP64 : ret;
+ ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret;
+ ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret;
+ ret = ax > PINFBITPATT_DP64 ? ux : ret;
+ ret = ny == 0 ? 0x3ff0000000000000L : ret;
+
+ return as_double(ret);
}
_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_pown, double, int)
#endif
diff --git a/libclc/generic/lib/math/clc_powr.cl b/libclc/generic/lib/math/clc_powr.cl
index c431f529f3b97d3..9516be34456b8da 100644
--- a/libclc/generic/lib/math/clc_powr.cl
+++ b/libclc/generic/lib/math/clc_powr.cl
@@ -23,6 +23,7 @@
#include <clc/clc.h>
#include <clc/clcmacro.h>
#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_mad.h>
#include "config.h"
#include "math.h"
@@ -64,319 +65,332 @@
// At the end of exp, do
// ((((expT * poly) + expT) + expH*poly) + expH)
-_CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y)
-{
- int ix = as_int(x);
- int ax = ix & EXSIGNBIT_SP32;
- int xpos = ix == ax;
-
- int iy = as_int(y);
- int ay = iy & EXSIGNBIT_SP32;
- int ypos = iy == ay;
-
- // Extra precise log calculation
- // First handle case that x is close to 1
- float r = 1.0f - as_float(ax);
- int near1 = __clc_fabs(r) < 0x1.0p-4f;
- float r2 = r*r;
-
- // Coefficients are just 1/3, 1/4, 1/5 and 1/6
- float poly = mad(r,
- mad(r,
- mad(r,
- mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
- 0x1.99999ap-3f),
- 0x1.000000p-2f),
- 0x1.555556p-2f);
-
- poly *= r2*r;
-
- float lth_near1 = -r2 * 0.5f;
- float ltt_near1 = -poly;
- float lt_near1 = lth_near1 + ltt_near1;
- float lh_near1 = -r;
- float l_near1 = lh_near1 + lt_near1;
-
- // Computations for x not near 1
- int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
- float mf = (float)m;
- int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
- float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
- int c = m == -127;
- int ixn = c ? ixs : ax;
- float mfn = c ? mfs : mf;
-
- int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
- // F - Y
- float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32));
-
- indx = indx >> 16;
- float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
- float rh = f * tv.s0;
- float rt = f * tv.s1;
- r = rh + rt;
-
- poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r);
- poly += (rh - r) + rt;
-
- const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
- const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
- tv = USE_TABLE(loge_tbl, indx);
- float lth = -r;
- float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1;
- float lt = lth + ltt;
- float lh = mad(mfn, LOG2_HEAD, tv.s0);
- float l = lh + lt;
-
- // Select near 1 or not
- lth = near1 ? lth_near1 : lth;
- ltt = near1 ? ltt_near1 : ltt;
- lt = near1 ? lt_near1 : lt;
- lh = near1 ? lh_near1 : lh;
- l = near1 ? l_near1 : l;
-
- float gh = as_float(as_int(l) & 0xfffff000);
- float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
- float yh = as_float(iy & 0xfffff000);
-
- float yt = y - yh;
-
- float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt));
- float ylogx = mad(yh, gh, ylogx_s);
- float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s;
-
- // Extra precise exp of ylogx
- const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657
- int n = convert_int(ylogx * R_64_BY_LOG2);
- float nf = (float) n;
-
- int j = n & 0x3f;
- m = n >> 6;
- int m2 = m << EXPSHIFTBITS_SP32;
-
- const float R_LOG2_BY_64_LD = 0x1.620000p-7f; // log2/64 lead: 0.0108032227
- const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388
- r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t;
-
- // Truncated Taylor series for e^r
- poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
-
- tv = USE_TABLE(exp_tbl_ep, j);
-
- float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0;
- float sexpylogx = expylogx * as_float(0x1 << (m + 149));
- float texpylogx = as_float(as_int(expylogx) + m2);
- expylogx = m < -125 ? sexpylogx : texpylogx;
-
- // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
- expylogx = ((ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) ? as_float(PINFBITPATT_SP32) : expylogx;
-
- // Result is 0 if ylogx < -149*log2
- expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
- // Classify y:
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
- int mask = (1 << (24 - yexp)) - 1;
- int yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
- int inty = yodd ? 1 : 2;
- inty = (iy & mask) != 0 ? 0 : inty;
- inty = yexp < 1 ? 0 : inty;
- inty = yexp > 24 ? 2 : inty;
-
- float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
- expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
- int ret = as_int(expylogx);
-
- // Corner case handling
- ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
- ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret;
- ret = ax == 0x3f800000 & ay < PINFBITPATT_SP32 ? 0x3f800000 : ret;
- ret = ax == 0x3f800000 & ay == PINFBITPATT_SP32 ? QNANBITPATT_SP32 : ret;
- ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret;
- ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
- ret = ((ix < PINFBITPATT_SP32) & (ay == 0)) ? 0x3f800000 : ret;
- ret = ((ax == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
- ret = ((ax == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
- ret = ((ax == PINFBITPATT_SP32) & (iy == PINFBITPATT_SP32)) ? PINFBITPATT_SP32 : ret;
- ret = ((ax == PINFBITPATT_SP32) & (ay == 0)) ? QNANBITPATT_SP32 : ret;
- ret = ((ax == 0) & !ypos) ? PINFBITPATT_SP32 : ret;
- ret = ((ax == 0) & ypos) ? 0 : ret;
- ret = ((ax == 0) & (ay == 0)) ? QNANBITPATT_SP32 : ret;
- ret = ((ax != 0) & !xpos) ? QNANBITPATT_SP32 : ret;
- ret = ax > PINFBITPATT_SP32 ? ix : ret;
- ret = ay > PINFBITPATT_SP32 ? iy : ret;
-
- return as_float(ret);
+_CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y) {
+ int ix = as_int(x);
+ int ax = ix & EXSIGNBIT_SP32;
+ int xpos = ix == ax;
+
+ int iy = as_int(y);
+ int ay = iy & EXSIGNBIT_SP32;
+ int ypos = iy == ay;
+
+ // Extra precise log calculation
+ // First handle case that x is close to 1
+ float r = 1.0f - as_float(ax);
+ int near1 = __clc_fabs(r) < 0x1.0p-4f;
+ float r2 = r * r;
+
+ // Coefficients are just 1/3, 1/4, 1/5 and 1/6
+ float poly = __clc_mad(
+ r,
+ __clc_mad(r,
+ __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
+ 0x1.99999ap-3f),
+ 0x1.000000p-2f),
+ 0x1.555556p-2f);
+
+ poly *= r2 * r;
+
+ float lth_near1 = -r2 * 0.5f;
+ float ltt_near1 = -poly;
+ float lt_near1 = lth_near1 + ltt_near1;
+ float lh_near1 = -r;
+ float l_near1 = lh_near1 + lt_near1;
+
+ // Computations for x not near 1
+ int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+ float mf = (float)m;
+ int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
+ float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
+ int c = m == -127;
+ int ixn = c ? ixs : ax;
+ float mfn = c ? mfs : mf;
+
+ int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
+
+ // F - Y
+ float f = as_float(0x3f000000 | indx) -
+ as_float(0x3f000000 | (ixn & MANTBITS_SP32));
+
+ indx = indx >> 16;
+ float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
+ float rh = f * tv.s0;
+ float rt = f * tv.s1;
+ r = rh + rt;
+
+ poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
+ (r * r);
+ poly += (rh - r) + rt;
+
+ const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
+ const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
+ tv = USE_TABLE(loge_tbl, indx);
+ float lth = -r;
+ float ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + tv.s1;
+ float lt = lth + ltt;
+ float lh = __clc_mad(mfn, LOG2_HEAD, tv.s0);
+ float l = lh + lt;
+
+ // Select near 1 or not
+ lth = near1 ? lth_near1 : lth;
+ ltt = near1 ? ltt_near1 : ltt;
+ lt = near1 ? lt_near1 : lt;
+ lh = near1 ? lh_near1 : lh;
+ l = near1 ? l_near1 : l;
+
+ float gh = as_float(as_int(l) & 0xfffff000);
+ float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
+
+ float yh = as_float(iy & 0xfffff000);
+
+ float yt = y - yh;
+
+ float ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
+ float ylogx = __clc_mad(yh, gh, ylogx_s);
+ float ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
+
+ // Extra precise exp of ylogx
+ // 64/log2 : 92.332482616893657
+ const float R_64_BY_LOG2 = 0x1.715476p+6f;
+ int n = convert_int(ylogx * R_64_BY_LOG2);
+ float nf = (float)n;
+
+ int j = n & 0x3f;
+ m = n >> 6;
+ int m2 = m << EXPSHIFTBITS_SP32;
+ // log2/64 lead: 0.0108032227
+ const float R_LOG2_BY_64_LD = 0x1.620000p-7f;
+ // log2/64 tail: 0.0000272020388
+ const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
+ r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
+ ylogx_t;
+
+ // Truncated Taylor series for e^r
+ poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
+ 0x1.000000p-1f),
+ r * r, r);
+
+ tv = USE_TABLE(exp_tbl_ep, j);
+
+ float expylogx =
+ __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0;
+ float sexpylogx = expylogx * as_float(0x1 << (m + 149));
+ float texpylogx = as_float(as_int(expylogx) + m2);
+ expylogx = m < -125 ? sexpylogx : texpylogx;
+
+ // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
+ expylogx = ((ylogx > 0x1.62e430p+6f) |
+ (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f))
+ ? as_float(PINFBITPATT_SP32)
+ : expylogx;
+
+ // Result is 0 if ylogx < -149*log2
+ expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
+
+ // Classify y:
+ // inty = 0 means not an integer.
+ // inty = 1 means odd integer.
+ // inty = 2 means even integer.
+
+ int yexp = (int)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
+ int mask = (1 << (24 - yexp)) - 1;
+ int yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
+ int inty = yodd ? 1 : 2;
+ inty = (iy & mask) != 0 ? 0 : inty;
+ inty = yexp < 1 ? 0 : inty;
+ inty = yexp > 24 ? 2 : inty;
+
+ float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
+ expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
+ int ret = as_int(expylogx);
+
+ // Corner case handling
+ ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+ ret = ax < 0x3f800000 & iy == PINFBITPATT_SP32 ? 0 : ret;
+ ret = ax == 0x3f800000 & ay < PINFBITPATT_SP32 ? 0x3f800000 : ret;
+ ret = ax == 0x3f800000 & ay == PINFBITPATT_SP32 ? QNANBITPATT_SP32 : ret;
+ ret = ax > 0x3f800000 & iy == NINFBITPATT_SP32 ? 0 : ret;
+ ret = ax > 0x3f800000 & iy == PINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret;
+ ret = ((ix < PINFBITPATT_SP32) & (ay == 0)) ? 0x3f800000 : ret;
+ ret = ((ax == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
+ ret = ((ax == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
+ ret = ((ax == PINFBITPATT_SP32) & (iy == PINFBITPATT_SP32)) ? PINFBITPATT_SP32
+ : ret;
+ ret = ((ax == PINFBITPATT_SP32) & (ay == 0)) ? QNANBITPATT_SP32 : ret;
+ ret = ((ax == 0) & !ypos) ? PINFBITPATT_SP32 : ret;
+ ret = ((ax == 0) & ypos) ? 0 : ret;
+ ret = ((ax == 0) & (ay == 0)) ? QNANBITPATT_SP32 : ret;
+ ret = ((ax != 0) & !xpos) ? QNANBITPATT_SP32 : ret;
+ ret = ax > PINFBITPATT_SP32 ? ix : ret;
+ ret = ay > PINFBITPATT_SP32 ? iy : ret;
+
+ return as_float(ret);
}
_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_powr, float, float)
#ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y)
-{
- const double real_log2_tail = 5.76999904754328540596e-08;
- const double real_log2_lead = 6.93147122859954833984e-01;
-
- long ux = as_long(x);
- long ax = ux & (~SIGNBIT_DP64);
- int xpos = ax == ux;
-
- long uy = as_long(y);
- long ay = uy & (~SIGNBIT_DP64);
- int ypos = ay == uy;
-
- // Extended precision log
- double v, vt;
- {
- int exp = (int)(ax >> 52) - 1023;
- int mask_exp_1023 = exp == -1023;
- double xexp = (double) exp;
- long mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
- long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
- exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
- double xexp1 = (double) exp;
- long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
- xexp = mask_exp_1023 ? xexp1 : xexp;
- mantissa = mask_exp_1023 ? mantissa1 : mantissa;
-
- long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1);
- int index = rax >> 44;
-
- double F = as_double(rax | 0x3FE0000000000000L);
- double Y = as_double(mantissa | 0x3FE0000000000000L);
- double f = F - Y;
- double2 tv = USE_TABLE(log_f_inv_tbl, index);
- double log_h = tv.s0;
- double log_t = tv.s1;
- double f_inv = (log_h + log_t) * f;
- double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
- double r2 = fma(-F, r1, f) * (log_h + log_t);
- double r = r1 + r2;
-
- double poly = fma(r,
- fma(r,
- fma(r,
- fma(r, 1.0/7.0, 1.0/6.0),
- 1.0/5.0),
- 1.0/4.0),
- 1.0/3.0);
- poly = poly * r * r * r;
-
- double hr1r1 = 0.5*r1*r1;
- double poly0h = r1 + hr1r1;
- double poly0t = r1 - poly0h + hr1r1;
- poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t;
-
- tv = USE_TABLE(powlog_tbl, index);
- log_h = tv.s0;
- log_t = tv.s1;
-
- double resT_t = fma(xexp, real_log2_tail, + log_t) - poly;
- double resT = resT_t - poly0h;
- double resH = fma(xexp, real_log2_lead, log_h);
- double resT_h = poly0h;
-
- double H = resT + resH;
- double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
- double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
- H = H_h;
-
- double y_head = as_double(uy & 0xfffffffff8000000L);
- double y_tail = y - y_head;
-
- double temp = fma(y_tail, H, fma(y_head, T, y_tail*T));
- v = fma(y_head, H, temp);
- vt = fma(y_head, H, -v) + temp;
- }
-
- // Now calculate exp of (v,vt)
-
- double expv;
- {
- const double max_exp_arg = 709.782712893384;
- const double min_exp_arg = -745.1332191019411;
- const double sixtyfour_by_lnof2 = 92.33248261689366;
- const double lnof2_by_64_head = 0.010830424260348081;
- const double lnof2_by_64_tail = -4.359010638708991e-10;
-
- double temp = v * sixtyfour_by_lnof2;
- int n = (int)temp;
- double dn = (double)n;
- int j = n & 0x0000003f;
- int m = n >> 6;
-
- double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
- double f1 = tv.s0;
- double f2 = tv.s1;
- double f = f1 + f2;
-
- double r1 = fma(dn, -lnof2_by_64_head, v);
- double r2 = dn * lnof2_by_64_tail;
- double r = (r1 + r2) + vt;
-
- double q = fma(r,
- fma(r,
- fma(r,
- fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
- 4.16666666662260795726e-02),
- 1.66666666665260878863e-01),
- 5.00000000000000008883e-01);
- q = fma(r*r, q, r);
-
- expv = fma(f, q, f2) + f1;
- expv = ldexp(expv, m);
-
- expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
- expv = v < min_exp_arg ? 0.0 : expv;
- }
-
- // See whether y is an integer.
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- int inty;
- {
- int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
- inty = yexp < 1 ? 0 : 2;
- inty = yexp > 53 ? 2 : inty;
- long mask = (1L << (53 - yexp)) - 1L;
- int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2;
- inty1 = (ay & mask) != 0 ? 0 : inty1;
- inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty;
- }
-
- expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
-
- long ret = as_long(expv);
-
- // Now all the edge cases
- ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
- ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret;
- ret = ax == 0x3ff0000000000000L & ay < PINFBITPATT_DP64 ? 0x3ff0000000000000L : ret;
- ret = ax == 0x3ff0000000000000L & ay == PINFBITPATT_DP64 ? QNANBITPATT_DP64 : ret;
- ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret;
- ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64 : ret;
- ret = ux < PINFBITPATT_DP64 & ay == 0L ? 0x3ff0000000000000L : ret;
- ret = ((ax == PINFBITPATT_DP64) & !ypos) ? 0L : ret;
- ret = ((ax == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret;
- ret = ((ax == PINFBITPATT_DP64) & (uy == PINFBITPATT_DP64)) ? PINFBITPATT_DP64 : ret;
- ret = ((ax == PINFBITPATT_DP64) & (ay == 0L)) ? QNANBITPATT_DP64 : ret;
- ret = ((ax == 0L) & !ypos) ? PINFBITPATT_DP64 : ret;
- ret = ((ax == 0L) & ypos) ? 0L : ret;
- ret = ((ax == 0L) & (ay == 0L)) ? QNANBITPATT_DP64 : ret;
- ret = ((ax != 0L) & !xpos) ? QNANBITPATT_DP64 : ret;
- ret = ax > PINFBITPATT_DP64 ? ux : ret;
- ret = ay > PINFBITPATT_DP64 ? uy : ret;
-
- return as_double(ret);
+_CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) {
+ const double real_log2_tail = 5.76999904754328540596e-08;
+ const double real_log2_lead = 6.93147122859954833984e-01;
+
+ long ux = as_long(x);
+ long ax = ux & (~SIGNBIT_DP64);
+ int xpos = ax == ux;
+
+ long uy = as_long(y);
+ long ay = uy & (~SIGNBIT_DP64);
+ int ypos = ay == uy;
+
+ // Extended precision log
+ double v, vt;
+ {
+ int exp = (int)(ax >> 52) - 1023;
+ int mask_exp_1023 = exp == -1023;
+ double xexp = (double)exp;
+ long mantissa = ax & 0x000FFFFFFFFFFFFFL;
+
+ long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
+ exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
+ double xexp1 = (double)exp;
+ long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
+
+ xexp = mask_exp_1023 ? xexp1 : xexp;
+ mantissa = mask_exp_1023 ? mantissa1 : mantissa;
+
+ long rax = (mantissa & 0x000ff00000000000) +
+ ((mantissa & 0x0000080000000000) << 1);
+ int index = rax >> 44;
+
+ double F = as_double(rax | 0x3FE0000000000000L);
+ double Y = as_double(mantissa | 0x3FE0000000000000L);
+ double f = F - Y;
+ double2 tv = USE_TABLE(log_f_inv_tbl, index);
+ double log_h = tv.s0;
+ double log_t = tv.s1;
+ double f_inv = (log_h + log_t) * f;
+ double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
+ double r2 = fma(-F, r1, f) * (log_h + log_t);
+ double r = r1 + r2;
+
+ double poly = fma(
+ r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+ 1.0 / 3.0);
+ poly = poly * r * r * r;
+
+ double hr1r1 = 0.5 * r1 * r1;
+ double poly0h = r1 + hr1r1;
+ double poly0t = r1 - poly0h + hr1r1;
+ poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+
+ tv = USE_TABLE(powlog_tbl, index);
+ log_h = tv.s0;
+ log_t = tv.s1;
+
+ double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+ double resT = resT_t - poly0h;
+ double resH = fma(xexp, real_log2_lead, log_h);
+ double resT_h = poly0h;
+
+ double H = resT + resH;
+ double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
+ double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
+ H = H_h;
+
+ double y_head = as_double(uy & 0xfffffffff8000000L);
+ double y_tail = y - y_head;
+
+ double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
+ v = fma(y_head, H, temp);
+ vt = fma(y_head, H, -v) + temp;
+ }
+
+ // Now calculate exp of (v,vt)
+
+ double expv;
+ {
+ const double max_exp_arg = 709.782712893384;
+ const double min_exp_arg = -745.1332191019411;
+ const double sixtyfour_by_lnof2 = 92.33248261689366;
+ const double lnof2_by_64_head = 0.010830424260348081;
+ const double lnof2_by_64_tail = -4.359010638708991e-10;
+
+ double temp = v * sixtyfour_by_lnof2;
+ int n = (int)temp;
+ double dn = (double)n;
+ int j = n & 0x0000003f;
+ int m = n >> 6;
+
+ double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+ double f1 = tv.s0;
+ double f2 = tv.s1;
+ double f = f1 + f2;
+
+ double r1 = fma(dn, -lnof2_by_64_head, v);
+ double r2 = dn * lnof2_by_64_tail;
+ double r = (r1 + r2) + vt;
+
+ double q = fma(
+ r,
+ fma(r,
+ fma(r,
+ fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+ 4.16666666662260795726e-02),
+ 1.66666666665260878863e-01),
+ 5.00000000000000008883e-01);
+ q = fma(r * r, q, r);
+
+ expv = fma(f, q, f2) + f1;
+ expv = ldexp(expv, m);
+
+ expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
+ expv = v < min_exp_arg ? 0.0 : expv;
+ }
+
+ // See whether y is an integer.
+ // inty = 0 means not an integer.
+ // inty = 1 means odd integer.
+ // inty = 2 means even integer.
+
+ int inty;
+ {
+ int yexp = (int)(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
+ inty = yexp < 1 ? 0 : 2;
+ inty = yexp > 53 ? 2 : inty;
+ long mask = (1L << (53 - yexp)) - 1L;
+ int inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1 : 2;
+ inty1 = (ay & mask) != 0 ? 0 : inty1;
+ inty = !(yexp < 1) & !(yexp > 53) ? inty1 : inty;
+ }
+
+ expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
+
+ long ret = as_long(expv);
+
+ // Now all the edge cases
+ ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64
+ : ret;
+ ret = ax < 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? 0L : ret;
+ ret = ax == 0x3ff0000000000000L & ay < PINFBITPATT_DP64 ? 0x3ff0000000000000L
+ : ret;
+ ret = ax == 0x3ff0000000000000L & ay == PINFBITPATT_DP64 ? QNANBITPATT_DP64
+ : ret;
+ ret = ax > 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? 0L : ret;
+ ret = ax > 0x3ff0000000000000L & uy == PINFBITPATT_DP64 ? PINFBITPATT_DP64
+ : ret;
+ ret = ux < PINFBITPATT_DP64 & ay == 0L ? 0x3ff0000000000000L : ret;
+ ret = ((ax == PINFBITPATT_DP64) & !ypos) ? 0L : ret;
+ ret = ((ax == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret;
+ ret = ((ax == PINFBITPATT_DP64) & (uy == PINFBITPATT_DP64)) ? PINFBITPATT_DP64
+ : ret;
+ ret = ((ax == PINFBITPATT_DP64) & (ay == 0L)) ? QNANBITPATT_DP64 : ret;
+ ret = ((ax == 0L) & !ypos) ? PINFBITPATT_DP64 : ret;
+ ret = ((ax == 0L) & ypos) ? 0L : ret;
+ ret = ((ax == 0L) & (ay == 0L)) ? QNANBITPATT_DP64 : ret;
+ ret = ((ax != 0L) & !xpos) ? QNANBITPATT_DP64 : ret;
+ ret = ax > PINFBITPATT_DP64 ? ux : ret;
+ ret = ay > PINFBITPATT_DP64 ? uy : ret;
+
+ return as_double(ret);
}
_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_powr, double, double)
#endif
diff --git a/libclc/generic/lib/math/clc_rootn.cl b/libclc/generic/lib/math/clc_rootn.cl
index eee9c9fcaa2d429..70ae02ac2370c90 100644
--- a/libclc/generic/lib/math/clc_rootn.cl
+++ b/libclc/generic/lib/math/clc_rootn.cl
@@ -23,6 +23,7 @@
#include <clc/clc.h>
#include <clc/clcmacro.h>
#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_mad.h>
#include "config.h"
#include "math.h"
@@ -64,308 +65,320 @@
// At the end of exp, do
// ((((expT * poly) + expT) + expH*poly) + expH)
-_CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny)
-{
- float y = MATH_RECIP((float)ny);
-
- int ix = as_int(x);
- int ax = ix & EXSIGNBIT_SP32;
- int xpos = ix == ax;
-
- int iy = as_int(y);
- int ay = iy & EXSIGNBIT_SP32;
- int ypos = iy == ay;
-
- // Extra precise log calculation
- // First handle case that x is close to 1
- float r = 1.0f - as_float(ax);
- int near1 = __clc_fabs(r) < 0x1.0p-4f;
- float r2 = r*r;
-
- // Coefficients are just 1/3, 1/4, 1/5 and 1/6
- float poly = mad(r,
- mad(r,
- mad(r,
- mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
- 0x1.99999ap-3f),
- 0x1.000000p-2f),
- 0x1.555556p-2f);
-
- poly *= r2*r;
-
- float lth_near1 = -r2 * 0.5f;
- float ltt_near1 = -poly;
- float lt_near1 = lth_near1 + ltt_near1;
- float lh_near1 = -r;
- float l_near1 = lh_near1 + lt_near1;
-
- // Computations for x not near 1
- int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
- float mf = (float)m;
- int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
- float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
- int c = m == -127;
- int ixn = c ? ixs : ax;
- float mfn = c ? mfs : mf;
-
- int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
- // F - Y
- float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (ixn & MANTBITS_SP32));
-
- indx = indx >> 16;
- float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
- float rh = f * tv.s0;
- float rt = f * tv.s1;
- r = rh + rt;
-
- poly = mad(r, mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * (r*r);
- poly += (rh - r) + rt;
-
- const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
- const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
- tv = USE_TABLE(loge_tbl, indx);
- float lth = -r;
- float ltt = mad(mfn, LOG2_TAIL, -poly) + tv.s1;
- float lt = lth + ltt;
- float lh = mad(mfn, LOG2_HEAD, tv.s0);
- float l = lh + lt;
-
- // Select near 1 or not
- lth = near1 ? lth_near1 : lth;
- ltt = near1 ? ltt_near1 : ltt;
- lt = near1 ? lt_near1 : lt;
- lh = near1 ? lh_near1 : lh;
- l = near1 ? l_near1 : l;
-
- float gh = as_float(as_int(l) & 0xfffff000);
- float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
- float yh = as_float(iy & 0xfffff000);
-
- float fny = (float)ny;
- float fnyh = as_float(as_int(fny) & 0xfffff000);
- float fnyt = (float)(ny - (int)fnyh);
- float yt = MATH_DIVIDE(mad(-fnyt, yh, mad(-fnyh, yh, 1.0f)), fny);
-
- float ylogx_s = mad(gt, yh, mad(gh, yt, yt*gt));
- float ylogx = mad(yh, gh, ylogx_s);
- float ylogx_t = mad(yh, gh, -ylogx) + ylogx_s;
-
- // Extra precise exp of ylogx
- const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657
- int n = convert_int(ylogx * R_64_BY_LOG2);
- float nf = (float) n;
-
- int j = n & 0x3f;
- m = n >> 6;
- int m2 = m << EXPSHIFTBITS_SP32;
-
- const float R_LOG2_BY_64_LD = 0x1.620000p-7f; // log2/64 lead: 0.0108032227
- const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388
- r = mad(nf, -R_LOG2_BY_64_TL, mad(nf, -R_LOG2_BY_64_LD, ylogx)) + ylogx_t;
-
- // Truncated Taylor series for e^r
- poly = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
-
- tv = USE_TABLE(exp_tbl_ep, j);
-
- float expylogx = mad(tv.s0, poly, mad(tv.s1, poly, tv.s1)) + tv.s0;
- float sexpylogx = __clc_fp32_subnormals_supported() ? expylogx * as_float(0x1 << (m + 149)) : 0.0f;
-
- float texpylogx = as_float(as_int(expylogx) + m2);
- expylogx = m < -125 ? sexpylogx : texpylogx;
-
- // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
- expylogx = ((ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) ? as_float(PINFBITPATT_SP32) : expylogx;
-
- // Result is 0 if ylogx < -149*log2
- expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
- // Classify y:
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- int inty = 2 - (ny & 1);
-
- float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
- expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
- int ret = as_int(expylogx);
-
- // Corner case handling
- ret = (!xpos & (inty == 2)) ? QNANBITPATT_SP32 : ret;
- int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
- ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
- ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret;
- ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret;
- int xzero = xpos ? 0 : 0x80000000;
- ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
- ret = ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret;
- ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret;
- ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
- ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
- ret = ax > PINFBITPATT_SP32 ? ix : ret;
- ret = ny == 0 ? QNANBITPATT_SP32 : ret;
-
- return as_float(ret);
+_CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny) {
+ float y = MATH_RECIP((float)ny);
+
+ int ix = as_int(x);
+ int ax = ix & EXSIGNBIT_SP32;
+ int xpos = ix == ax;
+
+ int iy = as_int(y);
+ int ay = iy & EXSIGNBIT_SP32;
+ int ypos = iy == ay;
+
+ // Extra precise log calculation
+ // First handle case that x is close to 1
+ float r = 1.0f - as_float(ax);
+ int near1 = __clc_fabs(r) < 0x1.0p-4f;
+ float r2 = r * r;
+
+ // Coefficients are just 1/3, 1/4, 1/5 and 1/6
+ float poly = __clc_mad(
+ r,
+ __clc_mad(r,
+ __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
+ 0x1.99999ap-3f),
+ 0x1.000000p-2f),
+ 0x1.555556p-2f);
+
+ poly *= r2 * r;
+
+ float lth_near1 = -r2 * 0.5f;
+ float ltt_near1 = -poly;
+ float lt_near1 = lth_near1 + ltt_near1;
+ float lh_near1 = -r;
+ float l_near1 = lh_near1 + lt_near1;
+
+ // Computations for x not near 1
+ int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+ float mf = (float)m;
+ int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f);
+ float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253);
+ int c = m == -127;
+ int ixn = c ? ixs : ax;
+ float mfn = c ? mfs : mf;
+
+ int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
+
+ // F - Y
+ float f = as_float(0x3f000000 | indx) -
+ as_float(0x3f000000 | (ixn & MANTBITS_SP32));
+
+ indx = indx >> 16;
+ float2 tv = USE_TABLE(log_inv_tbl_ep, indx);
+ float rh = f * tv.s0;
+ float rt = f * tv.s1;
+ r = rh + rt;
+
+ poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
+ (r * r);
+ poly += (rh - r) + rt;
+
+ const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
+ const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
+ tv = USE_TABLE(loge_tbl, indx);
+ float lth = -r;
+ float ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + tv.s1;
+ float lt = lth + ltt;
+ float lh = __clc_mad(mfn, LOG2_HEAD, tv.s0);
+ float l = lh + lt;
+
+ // Select near 1 or not
+ lth = near1 ? lth_near1 : lth;
+ ltt = near1 ? ltt_near1 : ltt;
+ lt = near1 ? lt_near1 : lt;
+ lh = near1 ? lh_near1 : lh;
+ l = near1 ? l_near1 : l;
+
+ float gh = as_float(as_int(l) & 0xfffff000);
+ float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
+
+ float yh = as_float(iy & 0xfffff000);
+
+ float fny = (float)ny;
+ float fnyh = as_float(as_int(fny) & 0xfffff000);
+ float fnyt = (float)(ny - (int)fnyh);
+ float yt = MATH_DIVIDE(__clc_mad(-fnyt, yh, __clc_mad(-fnyh, yh, 1.0f)), fny);
+
+ float ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
+ float ylogx = __clc_mad(yh, gh, ylogx_s);
+ float ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
+
+ // Extra precise exp of ylogx
+ const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657
+ int n = convert_int(ylogx * R_64_BY_LOG2);
+ float nf = (float)n;
+
+ int j = n & 0x3f;
+ m = n >> 6;
+ int m2 = m << EXPSHIFTBITS_SP32;
+
+ // log2/64 lead: 0.0108032227
+ const float R_LOG2_BY_64_LD = 0x1.620000p-7f;
+ // log2/64 tail: 0.0000272020388
+ const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
+ r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
+ ylogx_t;
+
+ // Truncated Taylor series for e^r
+ poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
+ 0x1.000000p-1f),
+ r * r, r);
+
+ tv = USE_TABLE(exp_tbl_ep, j);
+
+ float expylogx =
+ __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0;
+ float sexpylogx = __clc_fp32_subnormals_supported()
+ ? expylogx * as_float(0x1 << (m + 149))
+ : 0.0f;
+
+ float texpylogx = as_float(as_int(expylogx) + m2);
+ expylogx = m < -125 ? sexpylogx : texpylogx;
+
+ // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
+ expylogx = ((ylogx > 0x1.62e430p+6f) |
+ (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f))
+ ? as_float(PINFBITPATT_SP32)
+ : expylogx;
+
+ // Result is 0 if ylogx < -149*log2
+ expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
+
+ // Classify y:
+ // inty = 0 means not an integer.
+ // inty = 1 means odd integer.
+ // inty = 2 means even integer.
+
+ int inty = 2 - (ny & 1);
+
+ float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32));
+ expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
+ int ret = as_int(expylogx);
+
+ // Corner case handling
+ ret = (!xpos & (inty == 2)) ? QNANBITPATT_SP32 : ret;
+ int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32;
+ ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
+ ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret;
+ ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret;
+ int xzero = xpos ? 0 : 0x80000000;
+ ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
+ ret =
+ ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret;
+ ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret;
+ ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret;
+ ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret;
+ ret = ax > PINFBITPATT_SP32 ? ix : ret;
+ ret = ny == 0 ? QNANBITPATT_SP32 : ret;
+
+ return as_float(ret);
}
_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_rootn, float, int)
#ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny)
-{
- const double real_log2_tail = 5.76999904754328540596e-08;
- const double real_log2_lead = 6.93147122859954833984e-01;
-
- double dny = (double)ny;
- double y = 1.0 / dny;
-
- long ux = as_long(x);
- long ax = ux & (~SIGNBIT_DP64);
- int xpos = ax == ux;
-
- long uy = as_long(y);
- long ay = uy & (~SIGNBIT_DP64);
- int ypos = ay == uy;
-
- // Extended precision log
- double v, vt;
- {
- int exp = (int)(ax >> 52) - 1023;
- int mask_exp_1023 = exp == -1023;
- double xexp = (double) exp;
- long mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
- long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
- exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
- double xexp1 = (double) exp;
- long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
- xexp = mask_exp_1023 ? xexp1 : xexp;
- mantissa = mask_exp_1023 ? mantissa1 : mantissa;
-
- long rax = (mantissa & 0x000ff00000000000) + ((mantissa & 0x0000080000000000) << 1);
- int index = rax >> 44;
-
- double F = as_double(rax | 0x3FE0000000000000L);
- double Y = as_double(mantissa | 0x3FE0000000000000L);
- double f = F - Y;
- double2 tv = USE_TABLE(log_f_inv_tbl, index);
- double log_h = tv.s0;
- double log_t = tv.s1;
- double f_inv = (log_h + log_t) * f;
- double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
- double r2 = fma(-F, r1, f) * (log_h + log_t);
- double r = r1 + r2;
-
- double poly = fma(r,
- fma(r,
- fma(r,
- fma(r, 1.0/7.0, 1.0/6.0),
- 1.0/5.0),
- 1.0/4.0),
- 1.0/3.0);
- poly = poly * r * r * r;
-
- double hr1r1 = 0.5*r1*r1;
- double poly0h = r1 + hr1r1;
- double poly0t = r1 - poly0h + hr1r1;
- poly = fma(r1, r2, fma(0.5*r2, r2, poly)) + r2 + poly0t;
-
- tv = USE_TABLE(powlog_tbl, index);
- log_h = tv.s0;
- log_t = tv.s1;
-
- double resT_t = fma(xexp, real_log2_tail, + log_t) - poly;
- double resT = resT_t - poly0h;
- double resH = fma(xexp, real_log2_lead, log_h);
- double resT_h = poly0h;
-
- double H = resT + resH;
- double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
- double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
- H = H_h;
-
- double y_head = as_double(uy & 0xfffffffff8000000L);
- double y_tail = y - y_head;
-
- double fnyh = as_double(as_long(dny) & 0xfffffffffff00000);
- double fnyt = (double)(ny - (int)fnyh);
- y_tail = fma(-fnyt, y_head, fma(-fnyh, y_head, 1.0))/ dny;
-
- double temp = fma(y_tail, H, fma(y_head, T, y_tail*T));
- v = fma(y_head, H, temp);
- vt = fma(y_head, H, -v) + temp;
- }
-
- // Now calculate exp of (v,vt)
-
- double expv;
- {
- const double max_exp_arg = 709.782712893384;
- const double min_exp_arg = -745.1332191019411;
- const double sixtyfour_by_lnof2 = 92.33248261689366;
- const double lnof2_by_64_head = 0.010830424260348081;
- const double lnof2_by_64_tail = -4.359010638708991e-10;
-
- double temp = v * sixtyfour_by_lnof2;
- int n = (int)temp;
- double dn = (double)n;
- int j = n & 0x0000003f;
- int m = n >> 6;
-
- double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
- double f1 = tv.s0;
- double f2 = tv.s1;
- double f = f1 + f2;
-
- double r1 = fma(dn, -lnof2_by_64_head, v);
- double r2 = dn * lnof2_by_64_tail;
- double r = (r1 + r2) + vt;
-
- double q = fma(r,
- fma(r,
- fma(r,
- fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
- 4.16666666662260795726e-02),
- 1.66666666665260878863e-01),
- 5.00000000000000008883e-01);
- q = fma(r*r, q, r);
-
- expv = fma(f, q, f2) + f1;
- expv = ldexp(expv, m);
-
- expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
- expv = v < min_exp_arg ? 0.0 : expv;
- }
-
- // See whether y is an integer.
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- int inty = 2 - (ny & 1);
-
- expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
-
- long ret = as_long(expv);
-
- // Now all the edge cases
- ret = (!xpos & (inty == 2)) ? QNANBITPATT_DP64 : ret;
- long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
- ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
- ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret;
- ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret;
- long xzero = xpos ? 0L : 0x8000000000000000L;
- ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
- ret = ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret;
- ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L : ret;
- ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret;
- ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret;
- ret = ax > PINFBITPATT_DP64 ? ux : ret;
- ret = ny == 0 ? QNANBITPATT_DP64 : ret;
- return as_double(ret);
+_CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) {
+ const double real_log2_tail = 5.76999904754328540596e-08;
+ const double real_log2_lead = 6.93147122859954833984e-01;
+
+ double dny = (double)ny;
+ double y = 1.0 / dny;
+
+ long ux = as_long(x);
+ long ax = ux & (~SIGNBIT_DP64);
+ int xpos = ax == ux;
+
+ long uy = as_long(y);
+ long ay = uy & (~SIGNBIT_DP64);
+ int ypos = ay == uy;
+
+ // Extended precision log
+ double v, vt;
+ {
+ int exp = (int)(ax >> 52) - 1023;
+ int mask_exp_1023 = exp == -1023;
+ double xexp = (double)exp;
+ long mantissa = ax & 0x000FFFFFFFFFFFFFL;
+
+ long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0);
+ exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
+ double xexp1 = (double)exp;
+ long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
+
+ xexp = mask_exp_1023 ? xexp1 : xexp;
+ mantissa = mask_exp_1023 ? mantissa1 : mantissa;
+
+ long rax = (mantissa & 0x000ff00000000000) +
+ ((mantissa & 0x0000080000000000) << 1);
+ int index = rax >> 44;
+
+ double F = as_double(rax | 0x3FE0000000000000L);
+ double Y = as_double(mantissa | 0x3FE0000000000000L);
+ double f = F - Y;
+ double2 tv = USE_TABLE(log_f_inv_tbl, index);
+ double log_h = tv.s0;
+ double log_t = tv.s1;
+ double f_inv = (log_h + log_t) * f;
+ double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L);
+ double r2 = fma(-F, r1, f) * (log_h + log_t);
+ double r = r1 + r2;
+
+ double poly = fma(
+ r, fma(r, fma(r, fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), 1.0 / 4.0),
+ 1.0 / 3.0);
+ poly = poly * r * r * r;
+
+ double hr1r1 = 0.5 * r1 * r1;
+ double poly0h = r1 + hr1r1;
+ double poly0t = r1 - poly0h + hr1r1;
+ poly = fma(r1, r2, fma(0.5 * r2, r2, poly)) + r2 + poly0t;
+
+ tv = USE_TABLE(powlog_tbl, index);
+ log_h = tv.s0;
+ log_t = tv.s1;
+
+ double resT_t = fma(xexp, real_log2_tail, +log_t) - poly;
+ double resT = resT_t - poly0h;
+ double resH = fma(xexp, real_log2_lead, log_h);
+ double resT_h = poly0h;
+
+ double H = resT + resH;
+ double H_h = as_double(as_long(H) & 0xfffffffff8000000L);
+ double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
+ H = H_h;
+
+ double y_head = as_double(uy & 0xfffffffff8000000L);
+ double y_tail = y - y_head;
+
+ double fnyh = as_double(as_long(dny) & 0xfffffffffff00000);
+ double fnyt = (double)(ny - (int)fnyh);
+ y_tail = fma(-fnyt, y_head, fma(-fnyh, y_head, 1.0)) / dny;
+
+ double temp = fma(y_tail, H, fma(y_head, T, y_tail * T));
+ v = fma(y_head, H, temp);
+ vt = fma(y_head, H, -v) + temp;
+ }
+
+ // Now calculate exp of (v,vt)
+
+ double expv;
+ {
+ const double max_exp_arg = 709.782712893384;
+ const double min_exp_arg = -745.1332191019411;
+ const double sixtyfour_by_lnof2 = 92.33248261689366;
+ const double lnof2_by_64_head = 0.010830424260348081;
+ const double lnof2_by_64_tail = -4.359010638708991e-10;
+
+ double temp = v * sixtyfour_by_lnof2;
+ int n = (int)temp;
+ double dn = (double)n;
+ int j = n & 0x0000003f;
+ int m = n >> 6;
+
+ double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+ double f1 = tv.s0;
+ double f2 = tv.s1;
+ double f = f1 + f2;
+
+ double r1 = fma(dn, -lnof2_by_64_head, v);
+ double r2 = dn * lnof2_by_64_tail;
+ double r = (r1 + r2) + vt;
+
+ double q = fma(
+ r,
+ fma(r,
+ fma(r,
+ fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+ 4.16666666662260795726e-02),
+ 1.66666666665260878863e-01),
+ 5.00000000000000008883e-01);
+ q = fma(r * r, q, r);
+
+ expv = fma(f, q, f2) + f1;
+ expv = ldexp(expv, m);
+
+ expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv;
+ expv = v < min_exp_arg ? 0.0 : expv;
+ }
+
+ // See whether y is an integer.
+ // inty = 0 means not an integer.
+ // inty = 1 means odd integer.
+ // inty = 2 means even integer.
+
+ int inty = 2 - (ny & 1);
+
+ expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
+
+ long ret = as_long(expv);
+
+ // Now all the edge cases
+ ret = (!xpos & (inty == 2)) ? QNANBITPATT_DP64 : ret;
+ long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64;
+ ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
+ ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret;
+ ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret;
+ long xzero = xpos ? 0L : 0x8000000000000000L;
+ ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
+ ret =
+ ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret;
+ ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L
+ : ret;
+ ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret;
+ ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret;
+ ret = ax > PINFBITPATT_DP64 ? ux : ret;
+ ret = ny == 0 ? QNANBITPATT_DP64 : ret;
+ return as_double(ret);
}
_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_rootn, double, int)
#endif
diff --git a/libclc/generic/lib/math/mad.cl b/libclc/generic/lib/math/mad.cl
index 86bc70d94bea1c8..94012ab3df25a99 100644
--- a/libclc/generic/lib/math/mad.cl
+++ b/libclc/generic/lib/math/mad.cl
@@ -1,4 +1,19 @@
#include <clc/clc.h>
+#include <clc/clcmacro.h>
+#include <clc/math/clc_mad.h>
-#define __CLC_BODY <mad.inc>
-#include <clc/math/gentype.inc>
+_CLC_DEFINE_TERNARY_BUILTIN(float, mad, __clc_mad, float, float, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEFINE_TERNARY_BUILTIN(double, mad, __clc_mad, double, double, double)
+
+#endif
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_TERNARY_BUILTIN(half, mad, __clc_mad, half, half, half)
+
+#endif
diff --git a/libclc/generic/lib/math/mad.inc b/libclc/generic/lib/math/mad.inc
deleted file mode 100644
index d32c7839d1b970f..000000000000000
--- a/libclc/generic/lib/math/mad.inc
+++ /dev/null
@@ -1,3 +0,0 @@
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mad(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c) {
- return a * b + c;
-}
diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl
index 0adecf6978bcab3..e291e81ed980de6 100644
--- a/libclc/generic/lib/math/sincos_helpers.cl
+++ b/libclc/generic/lib/math/sincos_helpers.cl
@@ -21,307 +21,319 @@
*/
#include <clc/clc.h>
+#include <clc/math/clc_mad.h>
+#include <clc/math/clc_trunc.h>
#include <clc/shared/clc_max.h>
#include "math.h"
-#include "tables.h"
#include "sincos_helpers.h"
+#include "tables.h"
-#define bitalign(hi, lo, shift) \
- ((hi) << (32 - (shift))) | ((lo) >> (shift));
+#define bitalign(hi, lo, shift) ((hi) << (32 - (shift))) | ((lo) >> (shift));
-#define bytealign(src0, src1, src2) \
- ((uint) (((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3)*8)))
+#define bytealign(src0, src1, src2) \
+ ((uint)(((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3) * 8)))
_CLC_DEF float __clc_sinf_piby4(float x, float y) {
- // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
- // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
- // = x * f(w)
- // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
- // We use a minimax approximation of (f(w) - 1) / w
- // because this produces an expansion in even powers of x.
-
- const float c1 = -0.1666666666e0f;
- const float c2 = 0.8333331876e-2f;
- const float c3 = -0.198400874e-3f;
- const float c4 = 0.272500015e-5f;
- const float c5 = -2.5050759689e-08f; // 0xb2d72f34
- const float c6 = 1.5896910177e-10f; // 0x2f2ec9d3
-
- float z = x * x;
- float v = z * x;
- float r = mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2);
- float ret = x - mad(v, -c1, mad(z, mad(y, 0.5f, -v*r), -y));
-
- return ret;
+ // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+ // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+ // = x * f(w)
+ // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+ // We use a minimax approximation of (f(w) - 1) / w
+ // because this produces an expansion in even powers of x.
+
+ const float c1 = -0.1666666666e0f;
+ const float c2 = 0.8333331876e-2f;
+ const float c3 = -0.198400874e-3f;
+ const float c4 = 0.272500015e-5f;
+ const float c5 = -2.5050759689e-08f; // 0xb2d72f34
+ const float c6 = 1.5896910177e-10f; // 0x2f2ec9d3
+
+ float z = x * x;
+ float v = z * x;
+ float r = __clc_mad(
+ z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3), c2);
+ float ret =
+ x - __clc_mad(v, -c1, __clc_mad(z, __clc_mad(y, 0.5f, -v * r), -y));
+
+ return ret;
}
_CLC_DEF float __clc_cosf_piby4(float x, float y) {
- // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
- // = f(w)
- // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
- // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
- // because this produces an expansion in even powers of x.
-
- const float c1 = 0.416666666e-1f;
- const float c2 = -0.138888876e-2f;
- const float c3 = 0.248006008e-4f;
- const float c4 = -0.2730101334e-6f;
- const float c5 = 2.0875723372e-09f; // 0x310f74f6
- const float c6 = -1.1359647598e-11f; // 0xad47d74e
-
- float z = x * x;
- float r = z * mad(z, mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2), c1);
-
- // if |x| < 0.3
- float qx = 0.0f;
-
- int ix = as_int(x) & EXSIGNBIT_SP32;
-
- // 0.78125 > |x| >= 0.3
- float xby4 = as_float(ix - 0x01000000);
- qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx;
-
- // x > 0.78125
- qx = ix > 0x3f480000 ? 0.28125f : qx;
-
- float hz = mad(z, 0.5f, -qx);
- float a = 1.0f - qx;
- float ret = a - (hz - mad(z, r, -x*y));
- return ret;
+ // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+ // = f(w)
+ // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+ // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+ // because this produces an expansion in even powers of x.
+
+ const float c1 = 0.416666666e-1f;
+ const float c2 = -0.138888876e-2f;
+ const float c3 = 0.248006008e-4f;
+ const float c4 = -0.2730101334e-6f;
+ const float c5 = 2.0875723372e-09f; // 0x310f74f6
+ const float c6 = -1.1359647598e-11f; // 0xad47d74e
+
+ float z = x * x;
+ float r =
+ z *
+ __clc_mad(
+ z,
+ __clc_mad(z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3),
+ c2),
+ c1);
+
+ // if |x| < 0.3
+ float qx = 0.0f;
+
+ int ix = as_int(x) & EXSIGNBIT_SP32;
+
+ // 0.78125 > |x| >= 0.3
+ float xby4 = as_float(ix - 0x01000000);
+ qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx;
+
+ // x > 0.78125
+ qx = ix > 0x3f480000 ? 0.28125f : qx;
+
+ float hz = __clc_mad(z, 0.5f, -qx);
+ float a = 1.0f - qx;
+ float ret = a - (hz - __clc_mad(z, r, -x * y));
+ return ret;
}
-_CLC_DEF float __clc_tanf_piby4(float x, int regn)
-{
- // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4].
- float r = x * x;
+_CLC_DEF float __clc_tanf_piby4(float x, int regn) {
+ // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4].
+ float r = x * x;
- float a = mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f);
+ float a =
+ __clc_mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f);
- float b = mad(r,
- mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f),
- 1.15588821434688393452299f);
+ float b = __clc_mad(
+ r,
+ __clc_mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f),
+ 1.15588821434688393452299f);
- float t = mad(x*r, native_divide(a, b), x);
- float tr = -MATH_RECIP(t);
+ float t = __clc_mad(x * r, native_divide(a, b), x);
+ float tr = -MATH_RECIP(t);
- return regn & 1 ? tr : t;
+ return regn & 1 ? tr : t;
}
-_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, float bt)
-{
- if (HAVE_HW_FMA32()) {
- float ph = a * b;
- *hi = ph;
- *lo = fma(a, b, -ph);
- } else {
- float ah = as_float(as_uint(a) & 0xfffff000U);
- float at = a - ah;
- float ph = a * b;
- float pt = mad(at, bt, mad(at, bh, mad(ah, bt, mad(ah, bh, -ph))));
- *hi = ph;
- *lo = pt;
- }
+_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh,
+ float bt) {
+ if (HAVE_HW_FMA32()) {
+ float ph = a * b;
+ *hi = ph;
+ *lo = fma(a, b, -ph);
+ } else {
+ float ah = as_float(as_uint(a) & 0xfffff000U);
+ float at = a - ah;
+ float ph = a * b;
+ float pt = __clc_mad(
+ at, bt, __clc_mad(at, bh, __clc_mad(ah, bt, __clc_mad(ah, bh, -ph))));
+ *hi = ph;
+ *lo = pt;
+ }
}
-_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x)
-{
- // 72 bits of pi/2
- const float fpiby2_1 = (float) 0xC90FDA / 0x1.0p+23f;
- const float fpiby2_1_h = (float) 0xC90 / 0x1.0p+11f;
- const float fpiby2_1_t = (float) 0xFDA / 0x1.0p+23f;
+_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) {
+ // 72 bits of pi/2
+ const float fpiby2_1 = (float)0xC90FDA / 0x1.0p+23f;
+ const float fpiby2_1_h = (float)0xC90 / 0x1.0p+11f;
+ const float fpiby2_1_t = (float)0xFDA / 0x1.0p+23f;
- const float fpiby2_2 = (float) 0xA22168 / 0x1.0p+47f;
- const float fpiby2_2_h = (float) 0xA22 / 0x1.0p+35f;
- const float fpiby2_2_t = (float) 0x168 / 0x1.0p+47f;
+ const float fpiby2_2 = (float)0xA22168 / 0x1.0p+47f;
+ const float fpiby2_2_h = (float)0xA22 / 0x1.0p+35f;
+ const float fpiby2_2_t = (float)0x168 / 0x1.0p+47f;
- const float fpiby2_3 = (float) 0xC234C4 / 0x1.0p+71f;
- const float fpiby2_3_h = (float) 0xC23 / 0x1.0p+59f;
- const float fpiby2_3_t = (float) 0x4C4 / 0x1.0p+71f;
+ const float fpiby2_3 = (float)0xC234C4 / 0x1.0p+71f;
+ const float fpiby2_3_h = (float)0xC23 / 0x1.0p+59f;
+ const float fpiby2_3_t = (float)0x4C4 / 0x1.0p+71f;
- const float twobypi = 0x1.45f306p-1f;
+ const float twobypi = 0x1.45f306p-1f;
- float fnpi2 = trunc(mad(x, twobypi, 0.5f));
+ float fnpi2 = __clc_trunc(__clc_mad(x, twobypi, 0.5f));
- // subtract n * pi/2 from x
- float rhead, rtail;
- __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
- float v = x - rhead;
- float rem = v + (((x - v) - rhead) - rtail);
+ // subtract n * pi/2 from x
+ float rhead, rtail;
+ __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
+ float v = x - rhead;
+ float rem = v + (((x - v) - rhead) - rtail);
- float rhead2, rtail2;
- __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
- v = rem - rhead2;
- rem = v + (((rem - v) - rhead2) - rtail2);
+ float rhead2, rtail2;
+ __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
+ v = rem - rhead2;
+ rem = v + (((rem - v) - rhead2) - rtail2);
- float rhead3, rtail3;
- __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
- v = rem - rhead3;
+ float rhead3, rtail3;
+ __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
+ v = rem - rhead3;
- *hi = v + ((rem - v) - rhead3);
- *lo = -rtail3;
- return fnpi2;
+ *hi = v + ((rem - v) - rhead3);
+ *lo = -rtail3;
+ return fnpi2;
}
-_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x)
-{
- float fnpi2 = __clc_removePi2S(r, rr, x);
- return (int)fnpi2 & 0x3;
+_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) {
+ float fnpi2 = __clc_removePi2S(r, rr, x);
+ return (int)fnpi2 & 0x3;
}
-#define FULL_MUL(A, B, HI, LO) \
- LO = A * B; \
- HI = mul_hi(A, B)
-
-#define FULL_MAD(A, B, C, HI, LO) \
- LO = ((A) * (B) + (C)); \
- HI = mul_hi(A, B); \
- HI += LO < C
-
-_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x)
-{
- int xe = (int)(as_uint(x) >> 23) - 127;
- uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU);
-
- // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB
- const uint b6 = 0xA2F9836EU;
- const uint b5 = 0x4E441529U;
- const uint b4 = 0xFC2757D1U;
- const uint b3 = 0xF534DDC0U;
- const uint b2 = 0xDB629599U;
- const uint b1 = 0x3C439041U;
- const uint b0 = 0xFE5163ABU;
-
- uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
-
- FULL_MUL(xm, b0, c0, p0);
- FULL_MAD(xm, b1, c0, c1, p1);
- FULL_MAD(xm, b2, c1, c0, p2);
- FULL_MAD(xm, b3, c0, c1, p3);
- FULL_MAD(xm, b4, c1, c0, p4);
- FULL_MAD(xm, b5, c0, c1, p5);
- FULL_MAD(xm, b6, c1, p7, p6);
-
- uint fbits = 224 + 23 - xe;
-
- // shift amount to get 2 lsb of integer part at top 2 bits
- // min: 25 (xe=18) max: 134 (xe=127)
- uint shift = 256U - 2 - fbits;
-
- // Shift by up to 134/32 = 4 words
- int c = shift > 31;
- p7 = c ? p6 : p7;
- p6 = c ? p5 : p6;
- p5 = c ? p4 : p5;
- p4 = c ? p3 : p4;
- p3 = c ? p2 : p3;
- p2 = c ? p1 : p2;
- p1 = c ? p0 : p1;
- shift -= (-c) & 32;
-
- c = shift > 31;
- p7 = c ? p6 : p7;
- p6 = c ? p5 : p6;
- p5 = c ? p4 : p5;
- p4 = c ? p3 : p4;
- p3 = c ? p2 : p3;
- p2 = c ? p1 : p2;
- shift -= (-c) & 32;
-
- c = shift > 31;
- p7 = c ? p6 : p7;
- p6 = c ? p5 : p6;
- p5 = c ? p4 : p5;
- p4 = c ? p3 : p4;
- p3 = c ? p2 : p3;
- shift -= (-c) & 32;
-
- c = shift > 31;
- p7 = c ? p6 : p7;
- p6 = c ? p5 : p6;
- p5 = c ? p4 : p5;
- p4 = c ? p3 : p4;
- shift -= (-c) & 32;
-
- // bitalign cannot handle a shift of 32
- c = shift > 0;
- shift = 32 - shift;
- uint t7 = bitalign(p7, p6, shift);
- uint t6 = bitalign(p6, p5, shift);
- uint t5 = bitalign(p5, p4, shift);
- p7 = c ? t7 : p7;
- p6 = c ? t6 : p6;
- p5 = c ? t5 : p5;
-
- // Get 2 lsb of int part and msb of fraction
- int i = p7 >> 29;
-
- // Scoot up 2 more bits so only fraction remains
- p7 = bitalign(p7, p6, 30);
- p6 = bitalign(p6, p5, 30);
- p5 = bitalign(p5, p4, 30);
-
- // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
- uint flip = i & 1 ? 0xffffffffU : 0U;
- uint sign = i & 1 ? 0x80000000U : 0U;
- p7 = p7 ^ flip;
- p6 = p6 ^ flip;
- p5 = p5 ^ flip;
-
- // Find exponent and shift away leading zeroes and hidden bit
- xe = clz(p7) + 1;
- shift = 32 - xe;
- p7 = bitalign(p7, p6, shift);
- p6 = bitalign(p6, p5, shift);
-
- // Most significant part of fraction
- float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9));
-
- // Shift out bits we captured on q1
- p7 = bitalign(p7, p6, 32-23);
-
- // Get 24 more bits of fraction in another float, there are not long strings of zeroes here
- int xxe = clz(p7) + 1;
- p7 = bitalign(p7, p6, 32-xxe);
- float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9));
-
- // At this point, the fraction q1 + q0 is correct to at least 48 bits
- // Now we need to multiply the fraction by pi/2
- // This loses us about 4 bits
- // pi/2 = C90 FDA A22 168 C23 4C4
-
- const float pio2h = (float)0xc90fda / 0x1.0p+23f;
- const float pio2hh = (float)0xc90 / 0x1.0p+11f;
- const float pio2ht = (float)0xfda / 0x1.0p+23f;
- const float pio2t = (float)0xa22168 / 0x1.0p+47f;
-
- float rh, rt;
-
- if (HAVE_HW_FMA32()) {
- rh = q1 * pio2h;
- rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh)));
- } else {
- float q1h = as_float(as_uint(q1) & 0xfffff000);
- float q1t = q1 - q1h;
- rh = q1 * pio2h;
- rt = mad(q1t, pio2ht, mad(q1t, pio2hh, mad(q1h, pio2ht, mad(q1h, pio2hh, -rh))));
- rt = mad(q0, pio2h, mad(q1, pio2t, rt));
- }
-
- float t = rh + rt;
- rt = rt - (t - rh);
-
- *r = t;
- *rr = rt;
- return ((i >> 1) + (i & 1)) & 0x3;
+#define FULL_MUL(A, B, HI, LO) \
+ LO = A * B; \
+ HI = mul_hi(A, B)
+
+#define FULL_MAD(A, B, C, HI, LO) \
+ LO = ((A) * (B) + (C)); \
+ HI = mul_hi(A, B); \
+ HI += LO < C
+
+_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) {
+ int xe = (int)(as_uint(x) >> 23) - 127;
+ uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU);
+
+ // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041
+ // FE5163AB
+ const uint b6 = 0xA2F9836EU;
+ const uint b5 = 0x4E441529U;
+ const uint b4 = 0xFC2757D1U;
+ const uint b3 = 0xF534DDC0U;
+ const uint b2 = 0xDB629599U;
+ const uint b1 = 0x3C439041U;
+ const uint b0 = 0xFE5163ABU;
+
+ uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
+
+ FULL_MUL(xm, b0, c0, p0);
+ FULL_MAD(xm, b1, c0, c1, p1);
+ FULL_MAD(xm, b2, c1, c0, p2);
+ FULL_MAD(xm, b3, c0, c1, p3);
+ FULL_MAD(xm, b4, c1, c0, p4);
+ FULL_MAD(xm, b5, c0, c1, p5);
+ FULL_MAD(xm, b6, c1, p7, p6);
+
+ uint fbits = 224 + 23 - xe;
+
+ // shift amount to get 2 lsb of integer part at top 2 bits
+ // min: 25 (xe=18) max: 134 (xe=127)
+ uint shift = 256U - 2 - fbits;
+
+ // Shift by up to 134/32 = 4 words
+ int c = shift > 31;
+ p7 = c ? p6 : p7;
+ p6 = c ? p5 : p6;
+ p5 = c ? p4 : p5;
+ p4 = c ? p3 : p4;
+ p3 = c ? p2 : p3;
+ p2 = c ? p1 : p2;
+ p1 = c ? p0 : p1;
+ shift -= (-c) & 32;
+
+ c = shift > 31;
+ p7 = c ? p6 : p7;
+ p6 = c ? p5 : p6;
+ p5 = c ? p4 : p5;
+ p4 = c ? p3 : p4;
+ p3 = c ? p2 : p3;
+ p2 = c ? p1 : p2;
+ shift -= (-c) & 32;
+
+ c = shift > 31;
+ p7 = c ? p6 : p7;
+ p6 = c ? p5 : p6;
+ p5 = c ? p4 : p5;
+ p4 = c ? p3 : p4;
+ p3 = c ? p2 : p3;
+ shift -= (-c) & 32;
+
+ c = shift > 31;
+ p7 = c ? p6 : p7;
+ p6 = c ? p5 : p6;
+ p5 = c ? p4 : p5;
+ p4 = c ? p3 : p4;
+ shift -= (-c) & 32;
+
+ // bitalign cannot handle a shift of 32
+ c = shift > 0;
+ shift = 32 - shift;
+ uint t7 = bitalign(p7, p6, shift);
+ uint t6 = bitalign(p6, p5, shift);
+ uint t5 = bitalign(p5, p4, shift);
+ p7 = c ? t7 : p7;
+ p6 = c ? t6 : p6;
+ p5 = c ? t5 : p5;
+
+ // Get 2 lsb of int part and msb of fraction
+ int i = p7 >> 29;
+
+ // Scoot up 2 more bits so only fraction remains
+ p7 = bitalign(p7, p6, 30);
+ p6 = bitalign(p6, p5, 30);
+ p5 = bitalign(p5, p4, 30);
+
+ // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
+ uint flip = i & 1 ? 0xffffffffU : 0U;
+ uint sign = i & 1 ? 0x80000000U : 0U;
+ p7 = p7 ^ flip;
+ p6 = p6 ^ flip;
+ p5 = p5 ^ flip;
+
+ // Find exponent and shift away leading zeroes and hidden bit
+ xe = clz(p7) + 1;
+ shift = 32 - xe;
+ p7 = bitalign(p7, p6, shift);
+ p6 = bitalign(p6, p5, shift);
+
+ // Most significant part of fraction
+ float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9));
+
+ // Shift out bits we captured on q1
+ p7 = bitalign(p7, p6, 32 - 23);
+
+ // Get 24 more bits of fraction in another float, there are not long strings
+ // of zeroes here
+ int xxe = clz(p7) + 1;
+ p7 = bitalign(p7, p6, 32 - xxe);
+ float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9));
+
+ // At this point, the fraction q1 + q0 is correct to at least 48 bits
+ // Now we need to multiply the fraction by pi/2
+ // This loses us about 4 bits
+ // pi/2 = C90 FDA A22 168 C23 4C4
+
+ const float pio2h = (float)0xc90fda / 0x1.0p+23f;
+ const float pio2hh = (float)0xc90 / 0x1.0p+11f;
+ const float pio2ht = (float)0xfda / 0x1.0p+23f;
+ const float pio2t = (float)0xa22168 / 0x1.0p+47f;
+
+ float rh, rt;
+
+ if (HAVE_HW_FMA32()) {
+ rh = q1 * pio2h;
+ rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh)));
+ } else {
+ float q1h = as_float(as_uint(q1) & 0xfffff000);
+ float q1t = q1 - q1h;
+ rh = q1 * pio2h;
+ rt = __clc_mad(
+ q1t, pio2ht,
+ __clc_mad(q1t, pio2hh,
+ __clc_mad(q1h, pio2ht, __clc_mad(q1h, pio2hh, -rh))));
+ rt = __clc_mad(q0, pio2h, __clc_mad(q1, pio2t, rt));
+ }
+
+ float t = rh + rt;
+ rt = rt - (t - rh);
+
+ *r = t;
+ *rr = rt;
+ return ((i >> 1) + (i & 1)) & 0x3;
}
-_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x)
-{
- if (x < 0x1.0p+23f)
- return __clc_argReductionSmallS(r, rr, x);
- else
- return __clc_argReductionLargeS(r, rr, x);
+_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) {
+ if (x < 0x1.0p+23f)
+ return __clc_argReductionSmallS(r, rr, x);
+ else
+ return __clc_argReductionLargeS(r, rr, x);
}
#ifdef cl_khr_fp64
@@ -329,39 +341,44 @@ _CLC_DEF int __clc_argReductionS(float *r, float *rr, float x)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
// Reduction for medium sized arguments
-_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn) {
- // How many pi/2 is x a multiple of?
- const double two_by_pi = 0x1.45f306dc9c883p-1;
- double dnpi2 = trunc(fma(x, two_by_pi, 0.5));
-
- const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
- const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
- const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
-
- // Compute product of npi2 with 159 bits of 2/pi
- double p_hh = piby2_h * dnpi2;
- double p_ht = fma(piby2_h, dnpi2, -p_hh);
- double p_mh = piby2_m * dnpi2;
- double p_mt = fma(piby2_m, dnpi2, -p_mh);
- double p_th = piby2_t * dnpi2;
- double p_tt = fma(piby2_t, dnpi2, -p_th);
-
- // Reduce to 159 bits
- double ph = p_hh;
- double pm = p_ht + p_mh;
- double t = p_mh - (pm - p_ht);
- double pt = p_th + t + p_mt + p_tt;
- t = ph + pm; pm = pm - (t - ph); ph = t;
- t = pm + pt; pt = pt - (t - pm); pm = t;
-
- // Subtract from x
- t = x + ph;
- double qh = t + pm;
- double qt = pm - (qh - t) + pt;
-
- *r = qh;
- *rr = qt;
- *regn = (int)(long)dnpi2 & 0x3;
+_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr,
+ int *regn) {
+ // How many pi/2 is x a multiple of?
+ const double two_by_pi = 0x1.45f306dc9c883p-1;
+ double dnpi2 = __clc_trunc(fma(x, two_by_pi, 0.5));
+
+ const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
+ const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
+ const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
+
+ // Compute product of npi2 with 159 bits of 2/pi
+ double p_hh = piby2_h * dnpi2;
+ double p_ht = fma(piby2_h, dnpi2, -p_hh);
+ double p_mh = piby2_m * dnpi2;
+ double p_mt = fma(piby2_m, dnpi2, -p_mh);
+ double p_th = piby2_t * dnpi2;
+ double p_tt = fma(piby2_t, dnpi2, -p_th);
+
+ // Reduce to 159 bits
+ double ph = p_hh;
+ double pm = p_ht + p_mh;
+ double t = p_mh - (pm - p_ht);
+ double pt = p_th + t + p_mt + p_tt;
+ t = ph + pm;
+ pm = pm - (t - ph);
+ ph = t;
+ t = pm + pt;
+ pt = pt - (t - pm);
+ pm = t;
+
+ // Subtract from x
+ t = x + ph;
+ double qh = t + pm;
+ double qt = pm - (qh - t) + pt;
+
+ *r = qh;
+ *rr = qt;
+ *regn = (int)(long)dnpi2 & 0x3;
}
// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
@@ -369,195 +386,208 @@ _CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int
// Return value "regn" tells how many lots of pi/2 were subtracted
// from x to put it in the range [-pi/4,pi/4], mod 4.
-_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn) {
-
- long ux = as_long(x);
- int e = (int)(ux >> 52) - 1023;
- int i = __clc_max(23, (e >> 3) + 17);
- int j = 150 - i;
- int j16 = j & ~0xf;
- double fract_temp;
-
- // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary
- uint4 q0 = USE_TABLE(pibits_tbl, j16);
- uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
- uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
-
- int k = (j >> 2) & 0x3;
- int4 c = (int4)k == (int4)(0, 1, 2, 3);
-
- uint u0, u1, u2, u3, u4, u5, u6;
-
- u0 = c.s1 ? q0.s1 : q0.s0;
- u0 = c.s2 ? q0.s2 : u0;
- u0 = c.s3 ? q0.s3 : u0;
-
- u1 = c.s1 ? q0.s2 : q0.s1;
- u1 = c.s2 ? q0.s3 : u1;
- u1 = c.s3 ? q1.s0 : u1;
-
- u2 = c.s1 ? q0.s3 : q0.s2;
- u2 = c.s2 ? q1.s0 : u2;
- u2 = c.s3 ? q1.s1 : u2;
-
- u3 = c.s1 ? q1.s0 : q0.s3;
- u3 = c.s2 ? q1.s1 : u3;
- u3 = c.s3 ? q1.s2 : u3;
-
- u4 = c.s1 ? q1.s1 : q1.s0;
- u4 = c.s2 ? q1.s2 : u4;
- u4 = c.s3 ? q1.s3 : u4;
-
- u5 = c.s1 ? q1.s2 : q1.s1;
- u5 = c.s2 ? q1.s3 : u5;
- u5 = c.s3 ? q2.s0 : u5;
-
- u6 = c.s1 ? q1.s3 : q1.s2;
- u6 = c.s2 ? q2.s0 : u6;
- u6 = c.s3 ? q2.s1 : u6;
-
- uint v0 = bytealign(u1, u0, j);
- uint v1 = bytealign(u2, u1, j);
- uint v2 = bytealign(u3, u2, j);
- uint v3 = bytealign(u4, u3, j);
- uint v4 = bytealign(u5, u4, j);
- uint v5 = bytealign(u6, u5, j);
-
- // Place those 192 bits in 4 48-bit doubles along with correct exponent
- // If i > 1018 we would get subnormals so we scale p up and x down to get the same product
- i = 2 + 8*i;
- x *= i > 1018 ? 0x1.0p-136 : 1.0;
- i -= i > 1018 ? 136 : 0;
-
- uint ua = (uint)(1023 + 52 - i) << 20;
- double a = as_double((uint2)(0, ua));
- double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
- ua += 0x03000000U;
- a = as_double((uint2)(0, ua));
- double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
- ua += 0x03000000U;
- a = as_double((uint2)(0, ua));
- double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
- ua += 0x03000000U;
- a = as_double((uint2)(0, ua));
- double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
-
- // Exact multiply
- double f0h = p0 * x;
- double f0l = fma(p0, x, -f0h);
- double f1h = p1 * x;
- double f1l = fma(p1, x, -f1h);
- double f2h = p2 * x;
- double f2l = fma(p2, x, -f2h);
- double f3h = p3 * x;
- double f3l = fma(p3, x, -f3h);
-
- // Accumulate product into 4 doubles
- double s, t;
-
- double f3 = f3h + f2h;
- t = f2h - (f3 - f3h);
- s = f3l + t;
- t = t - (s - f3l);
-
- double f2 = s + f1h;
- t = f1h - (f2 - s) + t;
- s = f2l + t;
- t = t - (s - f2l);
-
- double f1 = s + f0h;
- t = f0h - (f1 - s) + t;
- s = f1l + t;
-
- double f0 = s + f0l;
-
- // Strip off unwanted large integer bits
- f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
- f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
-
- // Compute least significant integer bits
- t = f3 + f2;
- double di = t - fract(t, &fract_temp);
- i = (float)di;
-
- // Shift out remaining integer part
- f3 -= di;
- s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
- s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
- f1 += f0;
-
- // Subtract 1 if fraction is >= 0.5, and update regn
- int g = f3 >= 0.5;
- i += g;
- f3 -= (float)g;
-
- // Shift up bits
- s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1;
-
- // Multiply precise fraction by pi/2 to get radians
- const double p2h = 7074237752028440.0 / 0x1.0p+52;
- const double p2t = 4967757600021510.0 / 0x1.0p+106;
-
- double rhi = f3 * p2h;
- double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
-
- *r = rhi + rlo;
- *rr = rlo - (*r - rhi);
- *regn = i & 0x3;
+_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr,
+ int *regn) {
+
+ long ux = as_long(x);
+ int e = (int)(ux >> 52) - 1023;
+ int i = __clc_max(23, (e >> 3) + 17);
+ int j = 150 - i;
+ int j16 = j & ~0xf;
+ double fract_temp;
+
+ // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary
+ // byte boundary
+ uint4 q0 = USE_TABLE(pibits_tbl, j16);
+ uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
+ uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
+
+ int k = (j >> 2) & 0x3;
+ int4 c = (int4)k == (int4)(0, 1, 2, 3);
+
+ uint u0, u1, u2, u3, u4, u5, u6;
+
+ u0 = c.s1 ? q0.s1 : q0.s0;
+ u0 = c.s2 ? q0.s2 : u0;
+ u0 = c.s3 ? q0.s3 : u0;
+
+ u1 = c.s1 ? q0.s2 : q0.s1;
+ u1 = c.s2 ? q0.s3 : u1;
+ u1 = c.s3 ? q1.s0 : u1;
+
+ u2 = c.s1 ? q0.s3 : q0.s2;
+ u2 = c.s2 ? q1.s0 : u2;
+ u2 = c.s3 ? q1.s1 : u2;
+
+ u3 = c.s1 ? q1.s0 : q0.s3;
+ u3 = c.s2 ? q1.s1 : u3;
+ u3 = c.s3 ? q1.s2 : u3;
+
+ u4 = c.s1 ? q1.s1 : q1.s0;
+ u4 = c.s2 ? q1.s2 : u4;
+ u4 = c.s3 ? q1.s3 : u4;
+
+ u5 = c.s1 ? q1.s2 : q1.s1;
+ u5 = c.s2 ? q1.s3 : u5;
+ u5 = c.s3 ? q2.s0 : u5;
+
+ u6 = c.s1 ? q1.s3 : q1.s2;
+ u6 = c.s2 ? q2.s0 : u6;
+ u6 = c.s3 ? q2.s1 : u6;
+
+ uint v0 = bytealign(u1, u0, j);
+ uint v1 = bytealign(u2, u1, j);
+ uint v2 = bytealign(u3, u2, j);
+ uint v3 = bytealign(u4, u3, j);
+ uint v4 = bytealign(u5, u4, j);
+ uint v5 = bytealign(u6, u5, j);
+
+ // Place those 192 bits in 4 48-bit doubles along with correct exponent
+ // If i > 1018 we would get subnormals so we scale p up and x down to get the
+ // same product
+ i = 2 + 8 * i;
+ x *= i > 1018 ? 0x1.0p-136 : 1.0;
+ i -= i > 1018 ? 136 : 0;
+
+ uint ua = (uint)(1023 + 52 - i) << 20;
+ double a = as_double((uint2)(0, ua));
+ double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
+ ua += 0x03000000U;
+ a = as_double((uint2)(0, ua));
+ double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
+ ua += 0x03000000U;
+ a = as_double((uint2)(0, ua));
+ double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
+ ua += 0x03000000U;
+ a = as_double((uint2)(0, ua));
+ double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
+
+ // Exact multiply
+ double f0h = p0 * x;
+ double f0l = fma(p0, x, -f0h);
+ double f1h = p1 * x;
+ double f1l = fma(p1, x, -f1h);
+ double f2h = p2 * x;
+ double f2l = fma(p2, x, -f2h);
+ double f3h = p3 * x;
+ double f3l = fma(p3, x, -f3h);
+
+ // Accumulate product into 4 doubles
+ double s, t;
+
+ double f3 = f3h + f2h;
+ t = f2h - (f3 - f3h);
+ s = f3l + t;
+ t = t - (s - f3l);
+
+ double f2 = s + f1h;
+ t = f1h - (f2 - s) + t;
+ s = f2l + t;
+ t = t - (s - f2l);
+
+ double f1 = s + f0h;
+ t = f0h - (f1 - s) + t;
+ s = f1l + t;
+
+ double f0 = s + f0l;
+
+ // Strip off unwanted large integer bits
+ f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
+ f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
+
+ // Compute least significant integer bits
+ t = f3 + f2;
+ double di = t - fract(t, &fract_temp);
+ i = (float)di;
+
+ // Shift out remaining integer part
+ f3 -= di;
+ s = f3 + f2;
+ t = f2 - (s - f3);
+ f3 = s;
+ f2 = t;
+ s = f2 + f1;
+ t = f1 - (s - f2);
+ f2 = s;
+ f1 = t;
+ f1 += f0;
+
+ // Subtract 1 if fraction is >= 0.5, and update regn
+ int g = f3 >= 0.5;
+ i += g;
+ f3 -= (float)g;
+
+ // Shift up bits
+ s = f3 + f2;
+ t = f2 - (s - f3);
+ f3 = s;
+ f2 = t + f1;
+
+ // Multiply precise fraction by pi/2 to get radians
+ const double p2h = 7074237752028440.0 / 0x1.0p+52;
+ const double p2t = 4967757600021510.0 / 0x1.0p+106;
+
+ double rhi = f3 * p2h;
+ double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
+
+ *r = rhi + rlo;
+ *rr = rlo - (*r - rhi);
+ *regn = i & 0x3;
}
-
_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
- // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
- // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
- // = x * f(w)
- // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
- // We use a minimax approximation of (f(w) - 1) / w
- // because this produces an expansion in even powers of x.
- // If xx (the tail of x) is non-zero, we add a correction
- // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
- // is an approximation to cos(x)*sin(xx) valid because
- // xx is tiny relative to x.
-
- // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
- // = f(w)
- // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
- // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
- // because this produces an expansion in even powers of x.
- // If xx (the tail of x) is non-zero, we subtract a correction
- // term g(x,xx) = x*xx to the result, where g(x,xx)
- // is an approximation to sin(x)*sin(xx) valid because
- // xx is tiny relative to x.
-
- const double sc1 = -0.166666666666666646259241729;
- const double sc2 = 0.833333333333095043065222816e-2;
- const double sc3 = -0.19841269836761125688538679e-3;
- const double sc4 = 0.275573161037288022676895908448e-5;
- const double sc5 = -0.25051132068021699772257377197e-7;
- const double sc6 = 0.159181443044859136852668200e-9;
-
- const double cc1 = 0.41666666666666665390037e-1;
- const double cc2 = -0.13888888888887398280412e-2;
- const double cc3 = 0.248015872987670414957399e-4;
- const double cc4 = -0.275573172723441909470836e-6;
- const double cc5 = 0.208761463822329611076335e-8;
- const double cc6 = -0.113826398067944859590880e-10;
-
- double x2 = x * x;
- double x3 = x2 * x;
- double r = 0.5 * x2;
- double t = 1.0 - r;
-
- double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
-
- double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
- x2*x2, fma(x, xx, (1.0 - t) - r));
-
- double2 ret;
- ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
- ret.hi = cp;
-
- return ret;
+ // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+ // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+ // = x * f(w)
+ // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+ // We use a minimax approximation of (f(w) - 1) / w
+ // because this produces an expansion in even powers of x.
+ // If xx (the tail of x) is non-zero, we add a correction
+ // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
+ // is an approximation to cos(x)*sin(xx) valid because
+ // xx is tiny relative to x.
+
+ // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+ // = f(w)
+ // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+ // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+ // because this produces an expansion in even powers of x.
+ // If xx (the tail of x) is non-zero, we subtract a correction
+ // term g(x,xx) = x*xx to the result, where g(x,xx)
+ // is an approximation to sin(x)*sin(xx) valid because
+ // xx is tiny relative to x.
+
+ const double sc1 = -0.166666666666666646259241729;
+ const double sc2 = 0.833333333333095043065222816e-2;
+ const double sc3 = -0.19841269836761125688538679e-3;
+ const double sc4 = 0.275573161037288022676895908448e-5;
+ const double sc5 = -0.25051132068021699772257377197e-7;
+ const double sc6 = 0.159181443044859136852668200e-9;
+
+ const double cc1 = 0.41666666666666665390037e-1;
+ const double cc2 = -0.13888888888887398280412e-2;
+ const double cc3 = 0.248015872987670414957399e-4;
+ const double cc4 = -0.275573172723441909470836e-6;
+ const double cc5 = 0.208761463822329611076335e-8;
+ const double cc6 = -0.113826398067944859590880e-10;
+
+ double x2 = x * x;
+ double x3 = x2 * x;
+ double r = 0.5 * x2;
+ double t = 1.0 - r;
+
+ double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
+
+ double cp =
+ t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2),
+ x2, cc1),
+ x2 * x2, fma(x, xx, (1.0 - t) - r));
+
+ double2 ret;
+ ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5 * xx), x2, -xx));
+ ret.hi = cp;
+
+ return ret;
}
#endif
diff --git a/libclc/generic/lib/math/sincospiF_piby4.h b/libclc/generic/lib/math/sincospiF_piby4.h
index 90ecb1d7a6360e5..d7c01c1bb13d363 100644
--- a/libclc/generic/lib/math/sincospiF_piby4.h
+++ b/libclc/generic/lib/math/sincospiF_piby4.h
@@ -19,38 +19,41 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
+#include <clc/math/clc_mad.h>
// Evaluate single precisions in and cos of value in interval [-pi/4, pi/4]
-_CLC_INLINE float2
-__libclc__sincosf_piby4(float x)
-{
- // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
- // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
- // = x * f(w)
- // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
- // We use a minimax approximation of (f(w) - 1) / w
- // because this produces an expansion in even powers of x.
+_CLC_INLINE float2 __libclc__sincosf_piby4(float x) {
+ // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+ // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+ // = x * f(w)
+ // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+ // We use a minimax approximation of (f(w) - 1) / w
+ // because this produces an expansion in even powers of x.
- // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
- // = f(w)
- // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
- // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
- // because this produces an expansion in even powers of x.
+ // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+ // = f(w)
+ // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+ // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+ // because this produces an expansion in even powers of x.
- const float sc1 = -0.166666666638608441788607926e0F;
- const float sc2 = 0.833333187633086262120839299e-2F;
- const float sc3 = -0.198400874359527693921333720e-3F;
- const float sc4 = 0.272500015145584081596826911e-5F;
+ const float sc1 = -0.166666666638608441788607926e0F;
+ const float sc2 = 0.833333187633086262120839299e-2F;
+ const float sc3 = -0.198400874359527693921333720e-3F;
+ const float sc4 = 0.272500015145584081596826911e-5F;
- const float cc1 = 0.41666666664325175238031e-1F;
- const float cc2 = -0.13888887673175665567647e-2F;
- const float cc3 = 0.24800600878112441958053e-4F;
- const float cc4 = -0.27301013343179832472841e-6F;
+ const float cc1 = 0.41666666664325175238031e-1F;
+ const float cc2 = -0.13888887673175665567647e-2F;
+ const float cc3 = 0.24800600878112441958053e-4F;
+ const float cc4 = -0.27301013343179832472841e-6F;
- float x2 = x * x;
+ float x2 = x * x;
- float2 ret;
- ret.x = mad(x*x2, mad(x2, mad(x2, mad(x2, sc4, sc3), sc2), sc1), x);
- ret.y = mad(x2*x2, mad(x2, mad(x2, mad(x2, cc4, cc3), cc2), cc1), mad(x2, -0.5f, 1.0f));
- return ret;
+ float2 ret;
+ ret.x = __clc_mad(
+ x * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, sc4, sc3), sc2), sc1),
+ x);
+ ret.y = __clc_mad(
+ x2 * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, cc4, cc3), cc2), cc1),
+ __clc_mad(x2, -0.5f, 1.0f));
+ return ret;
}
More information about the cfe-commits
mailing list