[Libclc-dev] [PATCH 09/15] Implement cos for double types
Tom Stellard
thomas.stellard at amd.com
Tue Apr 7 11:05:40 PDT 2015
This implementation was ported from the AMD builtin library
and has been tested with piglit, OpenCV, and the ocl conformance tests.
---
generic/lib/math/cos.cl | 24 ++--
generic/lib/math/sincos_helpers.cl | 241 +++++++++++++++++++++++++++++++++++++
generic/lib/math/sincos_helpers.h | 10 ++
generic/lib/math/tables.cl | 20 +++
generic/lib/math/tables.h | 1 +
5 files changed, 289 insertions(+), 7 deletions(-)
diff --git a/generic/lib/math/cos.cl b/generic/lib/math/cos.cl
index cbf7d59..157447f 100644
--- a/generic/lib/math/cos.cl
+++ b/generic/lib/math/cos.cl
@@ -52,14 +52,24 @@ _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cos, float);
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-#define __CLC_FUNCTION __clc_cos_intrinsic
-#define __CLC_INTRINSIC "llvm.cos"
-#include <clc/math/unary_intrin.inc>
-#undef __CLC_FUNCTION
-#undef __CLC_INTRINSIC
-
_CLC_OVERLOAD _CLC_DEF double cos(double x) {
- return __clc_cos_intrinsic(x);
+ x = fabs(x);
+
+ double r, rr;
+ int regn;
+
+ if (x < 0x1.0p+47)
+ __clc_remainder_piby2_medium(x, &r, &rr, ®n);
+ else
+ __clc_remainder_piby2_large(x, &r, &rr, ®n);
+
+ double2 sc = __clc_sincos_piby4(r, rr);
+ sc.lo = -sc.lo;
+
+ int2 c = as_int2(regn & 1 ? sc.lo : sc.hi);
+ c.hi ^= (regn > 1) << 31;
+
+ return isnan(x) | isinf(x) ? as_double(QNANBITPATT_DP64) : as_double(c);
}
_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cos, double);
diff --git a/generic/lib/math/sincos_helpers.cl b/generic/lib/math/sincos_helpers.cl
index 8619b34..251b7f9 100644
--- a/generic/lib/math/sincos_helpers.cl
+++ b/generic/lib/math/sincos_helpers.cl
@@ -23,11 +23,15 @@
#include <clc/clc.h>
#include "math.h"
+#include "tables.h"
#include "sincos_helpers.h"
#define bitalign(hi, lo, shift) \
((hi) << (32 - (shift))) | ((lo) >> (shift));
+#define bytealign(src0, src1, src2) \
+ ((uint) (((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3)*8)))
+
_CLC_DEF float __clc_sinf_piby4(float x, float y) {
// Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
// = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
@@ -302,3 +306,240 @@ _CLC_DEF int __clc_argReductionS(float *r, float *rr, float x)
return __clc_argReductionLargeS(r, rr, x);
}
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Reduction for medium sized arguments
+_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn) {
+ // How many pi/2 is x a multiple of?
+ const double two_by_pi = 0x1.45f306dc9c883p-1;
+ double dnpi2 = trunc(fma(x, two_by_pi, 0.5));
+
+ const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
+ const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
+ const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
+
+ // Compute product of npi2 with 159 bits of 2/pi
+ double p_hh = piby2_h * dnpi2;
+ double p_ht = fma(piby2_h, dnpi2, -p_hh);
+ double p_mh = piby2_m * dnpi2;
+ double p_mt = fma(piby2_m, dnpi2, -p_mh);
+ double p_th = piby2_t * dnpi2;
+ double p_tt = fma(piby2_t, dnpi2, -p_th);
+
+ // Reduce to 159 bits
+ double ph = p_hh;
+ double pm = p_ht + p_mh;
+ double t = p_mh - (pm - p_ht);
+ double pt = p_th + t + p_mt + p_tt;
+ t = ph + pm; pm = pm - (t - ph); ph = t;
+ t = pm + pt; pt = pt - (t - pm); pm = t;
+
+ // Subtract from x
+ t = x + ph;
+ double qh = t + pm;
+ double qt = pm - (qh - t) + pt;
+
+ *r = qh;
+ *rr = qt;
+ *regn = (int)(long)dnpi2 & 0x3;
+}
+
+// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
+// extra precision, and return the result in r, rr.
+// Return value "regn" tells how many lots of pi/2 were subtracted
+// from x to put it in the range [-pi/4,pi/4], mod 4.
+
+_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn) {
+
+ long ux = as_long(x);
+ int e = (int)(ux >> 52) - 1023;
+ int i = max(23, (e >> 3) + 17);
+ int j = 150 - i;
+ int j16 = j & ~0xf;
+ double fract_temp;
+
+ // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary
+ uint4 q0 = USE_TABLE(pibits_tbl, j16);
+ uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
+ uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
+
+ int k = (j >> 2) & 0x3;
+ int4 c = (int4)k == (int4)(0, 1, 2, 3);
+
+ uint u0, u1, u2, u3, u4, u5, u6;
+
+ u0 = c.s1 ? q0.s1 : q0.s0;
+ u0 = c.s2 ? q0.s2 : u0;
+ u0 = c.s3 ? q0.s3 : u0;
+
+ u1 = c.s1 ? q0.s2 : q0.s1;
+ u1 = c.s2 ? q0.s3 : u1;
+ u1 = c.s3 ? q1.s0 : u1;
+
+ u2 = c.s1 ? q0.s3 : q0.s2;
+ u2 = c.s2 ? q1.s0 : u2;
+ u2 = c.s3 ? q1.s1 : u2;
+
+ u3 = c.s1 ? q1.s0 : q0.s3;
+ u3 = c.s2 ? q1.s1 : u3;
+ u3 = c.s3 ? q1.s2 : u3;
+
+ u4 = c.s1 ? q1.s1 : q1.s0;
+ u4 = c.s2 ? q1.s2 : u4;
+ u4 = c.s3 ? q1.s3 : u4;
+
+ u5 = c.s1 ? q1.s2 : q1.s1;
+ u5 = c.s2 ? q1.s3 : u5;
+ u5 = c.s3 ? q2.s0 : u5;
+
+ u6 = c.s1 ? q1.s3 : q1.s2;
+ u6 = c.s2 ? q2.s0 : u6;
+ u6 = c.s3 ? q2.s1 : u6;
+
+ uint v0 = bytealign(u1, u0, j);
+ uint v1 = bytealign(u2, u1, j);
+ uint v2 = bytealign(u3, u2, j);
+ uint v3 = bytealign(u4, u3, j);
+ uint v4 = bytealign(u5, u4, j);
+ uint v5 = bytealign(u6, u5, j);
+
+ // Place those 192 bits in 4 48-bit doubles along with correct exponent
+ // If i > 1018 we would get subnormals so we scale p up and x down to get the same product
+ i = 2 + 8*i;
+ x *= i > 1018 ? 0x1.0p-136 : 1.0;
+ i -= i > 1018 ? 136 : 0;
+
+ uint ua = (uint)(1023 + 52 - i) << 20;
+ double a = as_double((uint2)(0, ua));
+ double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
+ ua += 0x03000000U;
+ a = as_double((uint2)(0, ua));
+ double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
+ ua += 0x03000000U;
+ a = as_double((uint2)(0, ua));
+ double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
+ ua += 0x03000000U;
+ a = as_double((uint2)(0, ua));
+ double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
+
+ // Exact multiply
+ double f0h = p0 * x;
+ double f0l = fma(p0, x, -f0h);
+ double f1h = p1 * x;
+ double f1l = fma(p1, x, -f1h);
+ double f2h = p2 * x;
+ double f2l = fma(p2, x, -f2h);
+ double f3h = p3 * x;
+ double f3l = fma(p3, x, -f3h);
+
+ // Accumulate product into 4 doubles
+ double s, t;
+
+ double f3 = f3h + f2h;
+ t = f2h - (f3 - f3h);
+ s = f3l + t;
+ t = t - (s - f3l);
+
+ double f2 = s + f1h;
+ t = f1h - (f2 - s) + t;
+ s = f2l + t;
+ t = t - (s - f2l);
+
+ double f1 = s + f0h;
+ t = f0h - (f1 - s) + t;
+ s = f1l + t;
+
+ double f0 = s + f0l;
+
+ // Strip off unwanted large integer bits
+ f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
+ f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
+
+ // Compute least significant integer bits
+ t = f3 + f2;
+ double di = t - fract(t, &fract_temp);
+ i = (float)di;
+
+ // Shift out remaining integer part
+ f3 -= di;
+ s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
+ s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
+ f1 += f0;
+
+ // Subtract 1 if fraction is >= 0.5, and update regn
+ int g = f3 >= 0.5;
+ i += g;
+ f3 -= (float)g;
+
+ // Shift up bits
+ s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1;
+
+ // Multiply precise fraction by pi/2 to get radians
+ const double p2h = 7074237752028440.0 / 0x1.0p+52;
+ const double p2t = 4967757600021510.0 / 0x1.0p+106;
+
+ double rhi = f3 * p2h;
+ double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
+
+ *r = rhi + rlo;
+ *rr = rlo - (*r - rhi);
+ *regn = i & 0x3;
+}
+
+
+_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
+ // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+ // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+ // = x * f(w)
+ // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+ // We use a minimax approximation of (f(w) - 1) / w
+ // because this produces an expansion in even powers of x.
+ // If xx (the tail of x) is non-zero, we add a correction
+ // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
+ // is an approximation to cos(x)*sin(xx) valid because
+ // xx is tiny relative to x.
+
+ // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+ // = f(w)
+ // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+ // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+ // because this produces an expansion in even powers of x.
+ // If xx (the tail of x) is non-zero, we subtract a correction
+ // term g(x,xx) = x*xx to the result, where g(x,xx)
+ // is an approximation to sin(x)*sin(xx) valid because
+ // xx is tiny relative to x.
+
+ const double sc1 = -0.166666666666666646259241729;
+ const double sc2 = 0.833333333333095043065222816e-2;
+ const double sc3 = -0.19841269836761125688538679e-3;
+ const double sc4 = 0.275573161037288022676895908448e-5;
+ const double sc5 = -0.25051132068021699772257377197e-7;
+ const double sc6 = 0.159181443044859136852668200e-9;
+
+ const double cc1 = 0.41666666666666665390037e-1;
+ const double cc2 = -0.13888888888887398280412e-2;
+ const double cc3 = 0.248015872987670414957399e-4;
+ const double cc4 = -0.275573172723441909470836e-6;
+ const double cc5 = 0.208761463822329611076335e-8;
+ const double cc6 = -0.113826398067944859590880e-10;
+
+ double x2 = x * x;
+ double x3 = x2 * x;
+ double r = 0.5 * x2;
+ double t = 1.0 - r;
+
+ double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
+
+ double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
+ x2*x2, fma(x, xx, (1.0 - t) - r));
+
+ double2 ret;
+ ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
+ ret.hi = cp;
+
+ return ret;
+}
+
+#endif
diff --git a/generic/lib/math/sincos_helpers.h b/generic/lib/math/sincos_helpers.h
index f936d66..2565d44 100644
--- a/generic/lib/math/sincos_helpers.h
+++ b/generic/lib/math/sincos_helpers.h
@@ -23,3 +23,13 @@
_CLC_DECL float __clc_sinf_piby4(float x, float y);
_CLC_DECL float __clc_cosf_piby4(float x, float y);
_CLC_DECL int __clc_argReductionS(float *r, float *rr, float x);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DECL void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn);
+_CLC_DECL void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn);
+_CLC_DECL double2 __clc_sincos_piby4(double x, double xx);
+
+#endif
diff --git a/generic/lib/math/tables.cl b/generic/lib/math/tables.cl
index f22a6af..090e64a 100644
--- a/generic/lib/math/tables.cl
+++ b/generic/lib/math/tables.cl
@@ -288,9 +288,29 @@ DECLARE_TABLE(float, LOG_INV_TBL, 129) = {
0x1.000000p+0f,
};
+
+DECLARE_TABLE(uchar, PIBITS_TBL, ) = {
+ 224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175,
+ 169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31,
+ 235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38,
+ 44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188,
+ 188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247,
+ 46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249,
+ 125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8,
+ 162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39,
+ 168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21,
+ 239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109,
+ 3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13,
+ 230, 139, 2, 0, 0, 0, 0, 0, 0, 0
+};
+
TABLE_FUNCTION(float2, LOGE_TBL, loge_tbl);
TABLE_FUNCTION(float, LOG_INV_TBL, log_inv_tbl);
+uint4 TABLE_MANGLE(pibits_tbl)(size_t idx) {
+ return *(__constant uint4 *)(PIBITS_TBL + idx);
+}
+
#ifdef cl_khr_fp64
DECLARE_TABLE(double2, LN_TBL, 65) = {
diff --git a/generic/lib/math/tables.h b/generic/lib/math/tables.h
index 1e82901..d09adf1 100644
--- a/generic/lib/math/tables.h
+++ b/generic/lib/math/tables.h
@@ -40,6 +40,7 @@
TABLE_FUNCTION_DECL(float2, loge_tbl);
TABLE_FUNCTION_DECL(float, log_inv_tbl);
+TABLE_FUNCTION_DECL(uint4, pibits_tbl);
#ifdef cl_khr_fp64
--
2.0.4
More information about the Libclc-dev
mailing list