[Libclc-dev] [PATCH 10/11] Implement cos for double types

Thu Mar 12 13:11:47 PDT 2015

This implementation was ported from the AMD builtin library
and has been tested with piglit, OpenCV, and the ocl conformance tests.
---
 generic/lib/math/cos.cl            |  24 ++--
 generic/lib/math/sincos_helpers.cl | 241 +++++++++++++++++++++++++++++++++++++
 generic/lib/math/sincos_helpers.h  |  10 ++
 generic/lib/math/tables.cl         |  20 +++
 generic/lib/math/tables.h          |   1 +
 5 files changed, 289 insertions(+), 7 deletions(-)

diff --git a/generic/lib/math/cos.cl b/generic/lib/math/cos.cl
index cbf7d59..157447f 100644
--- a/generic/lib/math/cos.cl
+++ b/generic/lib/math/cos.cl
@@ -52,14 +52,24 @@ _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cos, float);
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
-#define __CLC_FUNCTION __clc_cos_intrinsic
-#define __CLC_INTRINSIC "llvm.cos"
-#include <clc/math/unary_intrin.inc>
-#undef __CLC_FUNCTION
-#undef __CLC_INTRINSIC
-
 _CLC_OVERLOAD _CLC_DEF double cos(double x) {
-    return __clc_cos_intrinsic(x);
+    x = fabs(x);
+
+    double r, rr;
+    int regn;
+
+    if (x < 0x1.0p+47)
+        __clc_remainder_piby2_medium(x, &r, &rr, &regn);
+    else
+        __clc_remainder_piby2_large(x, &r, &rr, &regn);
+
+    double2 sc = __clc_sincos_piby4(r, rr);
+    sc.lo = -sc.lo;
+
+    int2 c = as_int2(regn & 1 ? sc.lo : sc.hi);
+    c.hi ^= (regn > 1) << 31;
+
+    return isnan(x) | isinf(x) ? as_double(QNANBITPATT_DP64) : as_double(c);
 }
 
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cos, double);
diff --git a/generic/lib/math/sincos_helpers.cl b/generic/lib/math/sincos_helpers.cl
index 8619b34..251b7f9 100644
--- a/generic/lib/math/sincos_helpers.cl
+++ b/generic/lib/math/sincos_helpers.cl
@@ -23,11 +23,15 @@
 #include <clc/clc.h>
 
 #include "math.h"
+#include "tables.h"
 #include "sincos_helpers.h"
 
 #define bitalign(hi, lo, shift) \
   ((hi) << (32 - (shift))) | ((lo) >> (shift));
 
+#define bytealign(src0, src1, src2) \
+  ((uint) (((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3)*8)))
+
 _CLC_DEF float __clc_sinf_piby4(float x, float y) {
     // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
     // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
@@ -302,3 +306,240 @@ _CLC_DEF int __clc_argReductionS(float *r, float *rr, float x)
         return __clc_argReductionLargeS(r, rr, x);
 }
 
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Reduction for medium sized arguments
+_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn) {
+    // How many pi/2 is x a multiple of?
+    const double two_by_pi = 0x1.45f306dc9c883p-1;
+    double dnpi2 = trunc(fma(x, two_by_pi, 0.5));
+
+    const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
+    const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
+    const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
+
+    // Compute product of npi2 with 159 bits of 2/pi
+    double p_hh = piby2_h * dnpi2;
+    double p_ht = fma(piby2_h, dnpi2, -p_hh);
+    double p_mh = piby2_m * dnpi2;
+    double p_mt = fma(piby2_m, dnpi2, -p_mh);
+    double p_th = piby2_t * dnpi2;
+    double p_tt = fma(piby2_t, dnpi2, -p_th);
+
+    // Reduce to 159 bits
+    double ph = p_hh;
+    double pm = p_ht + p_mh;
+    double t = p_mh - (pm - p_ht);
+    double pt = p_th + t + p_mt + p_tt;
+    t = ph + pm; pm = pm - (t - ph); ph = t;
+    t = pm + pt; pt = pt - (t - pm); pm = t;
+
+    // Subtract from x
+    t = x + ph;
+    double qh = t + pm;
+    double qt = pm - (qh - t) + pt;
+
+    *r = qh;
+    *rr = qt;
+    *regn = (int)(long)dnpi2 & 0x3;
+}
+
+// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
+// extra precision, and return the result in r, rr.
+// Return value "regn" tells how many lots of pi/2 were subtracted
+// from x to put it in the range [-pi/4,pi/4], mod 4.
+
+_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn) {
+
+    long ux = as_long(x);
+    int e = (int)(ux >> 52) -  1023;
+    int i = max(23, (e >> 3) + 17);
+    int j = 150 - i;
+    int j16 = j & ~0xf;
+    double fract_temp;
+
+    // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary
+    uint4 q0 = USE_TABLE(pibits_tbl, j16);
+    uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
+    uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
+
+    int k = (j >> 2) & 0x3;
+    int4 c = (int4)k == (int4)(0, 1, 2, 3);
+
+    uint u0, u1, u2, u3, u4, u5, u6;
+
+    u0 = c.s1 ? q0.s1 : q0.s0;
+    u0 = c.s2 ? q0.s2 : u0;
+    u0 = c.s3 ? q0.s3 : u0;
+
+    u1 = c.s1 ? q0.s2 : q0.s1;
+    u1 = c.s2 ? q0.s3 : u1;
+    u1 = c.s3 ? q1.s0 : u1;
+
+    u2 = c.s1 ? q0.s3 : q0.s2;
+    u2 = c.s2 ? q1.s0 : u2;
+    u2 = c.s3 ? q1.s1 : u2;
+
+    u3 = c.s1 ? q1.s0 : q0.s3;
+    u3 = c.s2 ? q1.s1 : u3;
+    u3 = c.s3 ? q1.s2 : u3;
+
+    u4 = c.s1 ? q1.s1 : q1.s0;
+    u4 = c.s2 ? q1.s2 : u4;
+    u4 = c.s3 ? q1.s3 : u4;
+
+    u5 = c.s1 ? q1.s2 : q1.s1;
+    u5 = c.s2 ? q1.s3 : u5;
+    u5 = c.s3 ? q2.s0 : u5;
+
+    u6 = c.s1 ? q1.s3 : q1.s2;
+    u6 = c.s2 ? q2.s0 : u6;
+    u6 = c.s3 ? q2.s1 : u6;
+
+    uint v0 = bytealign(u1, u0, j);
+    uint v1 = bytealign(u2, u1, j);
+    uint v2 = bytealign(u3, u2, j);
+    uint v3 = bytealign(u4, u3, j);
+    uint v4 = bytealign(u5, u4, j);
+    uint v5 = bytealign(u6, u5, j);
+
+    // Place those 192 bits in 4 48-bit doubles along with correct exponent
+    // If i > 1018 we would get subnormals so we scale p up and x down to get the same product
+    i = 2 + 8*i;
+    x *= i > 1018 ? 0x1.0p-136 : 1.0;
+    i -= i > 1018 ? 136 : 0;
+
+    uint ua = (uint)(1023 + 52 - i) << 20;
+    double a = as_double((uint2)(0, ua));
+    double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
+    ua += 0x03000000U;
+    a = as_double((uint2)(0, ua));
+    double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
+    ua += 0x03000000U;
+    a = as_double((uint2)(0, ua));
+    double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
+    ua += 0x03000000U;
+    a = as_double((uint2)(0, ua));
+    double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
+
+    // Exact multiply
+    double f0h = p0 * x;
+    double f0l = fma(p0, x, -f0h);
+    double f1h = p1 * x;
+    double f1l = fma(p1, x, -f1h);
+    double f2h = p2 * x;
+    double f2l = fma(p2, x, -f2h);
+    double f3h = p3 * x;
+    double f3l = fma(p3, x, -f3h);
+
+    // Accumulate product into 4 doubles
+    double s, t;
+
+    double f3 = f3h + f2h;
+    t = f2h - (f3 - f3h);
+    s = f3l + t;
+    t = t - (s - f3l);
+
+    double f2 = s + f1h;
+    t = f1h - (f2 - s) + t;
+    s = f2l + t;
+    t = t - (s - f2l);
+
+    double f1 = s + f0h;
+    t = f0h - (f1 - s) + t;
+    s = f1l + t;
+
+    double f0 = s + f0l;
+
+    // Strip off unwanted large integer bits
+    f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
+    f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
+
+    // Compute least significant integer bits
+    t = f3 + f2;
+    double di = t - fract(t, &fract_temp);
+    i = (float)di;
+
+    // Shift out remaining integer part
+    f3 -= di;
+    s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
+    s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
+    f1 += f0;
+
+    // Subtract 1 if fraction is >= 0.5, and update regn
+    int g = f3 >= 0.5;
+    i += g;
+    f3 -= (float)g;
+
+    // Shift up bits
+    s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1;
+
+    // Multiply precise fraction by pi/2 to get radians
+    const double p2h = 7074237752028440.0 / 0x1.0p+52;
+    const double p2t = 4967757600021510.0 / 0x1.0p+106;
+
+    double rhi = f3 * p2h;
+    double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
+
+    *r = rhi + rlo;
+    *rr = rlo - (*r - rhi);
+    *regn = i & 0x3;
+}
+
+
+_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
+    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+    //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+    //                      = x * f(w)
+    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+    // We use a minimax approximation of (f(w) - 1) / w
+    // because this produces an expansion in even powers of x.
+    // If xx (the tail of x) is non-zero, we add a correction
+    // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
+    // is an approximation to cos(x)*sin(xx) valid because
+    // xx is tiny relative to x.
+
+    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+    //                      = f(w)
+    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+    // because this produces an expansion in even powers of x.
+    // If xx (the tail of x) is non-zero, we subtract a correction
+    // term g(x,xx) = x*xx to the result, where g(x,xx)
+    // is an approximation to sin(x)*sin(xx) valid because
+    // xx is tiny relative to x.
+
+    const double sc1 = -0.166666666666666646259241729;
+    const double sc2 =  0.833333333333095043065222816e-2;
+    const double sc3 = -0.19841269836761125688538679e-3;
+    const double sc4 =  0.275573161037288022676895908448e-5;
+    const double sc5 = -0.25051132068021699772257377197e-7;
+    const double sc6 =  0.159181443044859136852668200e-9;
+
+    const double cc1 =  0.41666666666666665390037e-1;
+    const double cc2 = -0.13888888888887398280412e-2;
+    const double cc3 =  0.248015872987670414957399e-4;
+    const double cc4 = -0.275573172723441909470836e-6;
+    const double cc5 =  0.208761463822329611076335e-8;
+    const double cc6 = -0.113826398067944859590880e-10;
+
+    double x2 = x * x;
+    double x3 = x2 * x;
+    double r = 0.5 * x2;
+    double t = 1.0 - r;
+
+    double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
+
+    double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
+                        x2*x2, fma(x, xx, (1.0 - t) - r));
+
+    double2 ret;
+    ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
+    ret.hi = cp;
+
+    return ret;
+}
+
+#endif
diff --git a/generic/lib/math/sincos_helpers.h b/generic/lib/math/sincos_helpers.h
index f936d66..2565d44 100644
--- a/generic/lib/math/sincos_helpers.h
+++ b/generic/lib/math/sincos_helpers.h
@@ -23,3 +23,13 @@
 _CLC_DECL float __clc_sinf_piby4(float x, float y);
 _CLC_DECL float __clc_cosf_piby4(float x, float y);
 _CLC_DECL int __clc_argReductionS(float *r, float *rr, float x);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DECL void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn);
+_CLC_DECL void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn);
+_CLC_DECL double2 __clc_sincos_piby4(double x, double xx);
+
+#endif
diff --git a/generic/lib/math/tables.cl b/generic/lib/math/tables.cl
index f22a6af..090e64a 100644
--- a/generic/lib/math/tables.cl
+++ b/generic/lib/math/tables.cl
@@ -288,9 +288,29 @@ DECLARE_TABLE(float, LOG_INV_TBL, 129) = {
     0x1.000000p+0f,
 };
 
+
+DECLARE_TABLE(uchar, PIBITS_TBL, ) = {
+    224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175,
+    169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31,
+    235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38,
+    44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188,
+    188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247,
+    46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249,
+    125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8,
+    162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39,
+    168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21,
+    239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109,
+    3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13,
+    230, 139, 2, 0, 0, 0, 0, 0, 0, 0
+};
+
 TABLE_FUNCTION(float2, LOGE_TBL, loge_tbl);
 TABLE_FUNCTION(float, LOG_INV_TBL, log_inv_tbl);
 
+uint4 TABLE_MANGLE(pibits_tbl)(size_t idx) {
+    return *(__constant uint4 *)(PIBITS_TBL + idx);
+}
+
 #ifdef cl_khr_fp64
 
 DECLARE_TABLE(double2, LN_TBL, 65) = {
diff --git a/generic/lib/math/tables.h b/generic/lib/math/tables.h
index 1e82901..d09adf1 100644
--- a/generic/lib/math/tables.h
+++ b/generic/lib/math/tables.h
@@ -40,6 +40,7 @@
 
 TABLE_FUNCTION_DECL(float2, loge_tbl);
 TABLE_FUNCTION_DECL(float, log_inv_tbl);
+TABLE_FUNCTION_DECL(uint4,  pibits_tbl);
 
 #ifdef cl_khr_fp64
 
-- 
2.0.4