[Libclc-dev] [PATCH] Implement log1p builtin
    Tom Stellard 
    thomas.stellard at amd.com
       
    Tue Sep  9 18:26:48 PDT 2014
    
    
  
---
 generic/include/clc/clc.h          |   1 +
 generic/include/clc/math/log1p.h   |  24 +++
 generic/include/clc/math/log1p.inc |  23 +++
 generic/lib/SOURCES                |   2 +
 generic/lib/math/log1p.cl          | 178 ++++++++++++++++++
 generic/lib/math/math.h            |  26 +++
 generic/lib/math/tables.cl         | 366 +++++++++++++++++++++++++++++++++++++
 generic/lib/math/tables.h          |  50 +++++
 8 files changed, 670 insertions(+)
 create mode 100644 generic/include/clc/math/log1p.h
 create mode 100644 generic/include/clc/math/log1p.inc
 create mode 100644 generic/lib/math/log1p.cl
 create mode 100644 generic/lib/math/tables.cl
 create mode 100644 generic/lib/math/tables.h
diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
index 490893b..5948a2a 100644
--- a/generic/include/clc/clc.h
+++ b/generic/include/clc/clc.h
@@ -48,6 +48,7 @@
 #include <clc/math/fmin.h>
 #include <clc/math/hypot.h>
 #include <clc/math/log.h>
+#include <clc/math/log1p.h>
 #include <clc/math/log2.h>
 #include <clc/math/mad.h>
 #include <clc/math/mix.h>
diff --git a/generic/include/clc/math/log1p.h b/generic/include/clc/math/log1p.h
new file mode 100644
index 0000000..4d716dd
--- /dev/null
+++ b/generic/include/clc/math/log1p.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define __CLC_BODY <clc/math/log1p.inc>
+#include <clc/math/gentype.inc>
diff --git a/generic/include/clc/math/log1p.inc b/generic/include/clc/math/log1p.inc
new file mode 100644
index 0000000..4cbfbf3
--- /dev/null
+++ b/generic/include/clc/math/log1p.inc
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE log1p(__CLC_GENTYPE a);
diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
index 8eaaa61..65e30a8 100644
--- a/generic/lib/SOURCES
+++ b/generic/lib/SOURCES
@@ -39,8 +39,10 @@ math/exp10.cl
 math/fmax.cl
 math/fmin.cl
 math/hypot.cl
+math/log1p.cl
 math/mad.cl
 math/mix.cl
+math/tables.cl
 math/clc_nextafter.cl
 math/nextafter.cl
 math/pown.cl
diff --git a/generic/lib/math/log1p.cl b/generic/lib/math/log1p.cl
new file mode 100644
index 0000000..de385f7
--- /dev/null
+++ b/generic/lib/math/log1p.cl
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "tables.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float log1p(float x)
+{
+    float w = x;
+    uint ux = as_uint(x);
+    uint ax = ux & EXSIGNBIT_SP32;
+
+    // |x| < 2^-4
+    float u2 = MATH_DIVIDE(x, 2.0f + x);
+    float u = u2 + u2;
+    float v = u * u;
+    // 2/(5 * 2^5), 2/(3 * 2^3)
+    float zsmall = mad(-u2, x, mad(v, 0x1.99999ap-7f, 0x1.555556p-4f) * v * u) + x;
+
+    // |x| >= 2^-4
+    //x = x + 1.0f;
+    ux = as_uint(x + 1.0f);
+
+    int m = (int)((ux >> EXPSHIFTBITS_SP32) & 0xff) - EXPBIAS_SP32;
+    float mf = (float)m;
+    uint indx = (ux & 0x007f0000) + ((ux & 0x00008000) << 1);
+    float F = as_float(indx | 0x3f000000);
+
+    // x > 2^24
+    float fg24 = F - as_float(0x3f000000 | (ux & MANTBITS_SP32));
+
+    // x <= 2^24
+    uint xhi = ux & 0xffff8000;
+    float xh = as_float(xhi);
+    float xt = (1.0f - xh) + w;
+    uint xnm = ((~(xhi & 0x7f800000)) - 0x00800000) & 0x7f800000;
+    xt = xt * as_float(xnm) * 0.5f;
+    float fl24 = F - as_float(0x3f000000 | (xhi & MANTBITS_SP32)) - xt;
+
+    float f = mf > 24.0f ? fg24 : fl24;
+
+    indx = indx >> 16;
+    float r = f * USE_TABLE(log_inv_tbl, indx);
+
+    // 1/3, 1/2
+    float poly = mad(mad(r, 0x1.555556p-2f, 0x1.0p-1f), r*r, r);
+
+    const float LOG2_HEAD = 0x1.62e000p-1f;   // 0.693115234
+    const float LOG2_TAIL = 0x1.0bfbe8p-15f;  // 0.0000319461833
+
+    float2 tv = USE_TABLE(loge_tbl, indx);
+    float z1 = mad(mf, LOG2_HEAD, tv.s0);
+    float z2 = mad(mf, LOG2_TAIL, -poly) + tv.s1;
+    float z = z1 + z2;
+
+    z = ax < 0x3d800000U ? zsmall : z;
+
+
+
+    // Edge cases
+    z = ax >= PINFBITPATT_SP32 ? w : z;
+    z = w  < -1.0f ? as_float(QNANBITPATT_SP32) : z;
+    z = w == -1.0f ? as_float(NINFBITPATT_SP32) : z;
+        //fix subnormals
+        z = ax  < 0x33800000 ? x : z;
+
+    return z;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log1p, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double log1p(double x)
+{
+    // Computes natural log(1+x). Algorithm based on:
+    // Ping-Tak Peter Tang
+    // "Table-driven implementation of the logarithm function in IEEE
+    // floating-point arithmetic"
+    // ACM Transactions on Mathematical Software (TOMS)
+    // Volume 16, Issue 4 (December 1990)
+    // Note that we use a lookup table of size 64 rather than 128,
+    // and compensate by having extra terms in the minimax polynomial
+    // for the kernel approximation.
+
+    // Process Inside the threshold now
+    ulong ux = as_ulong(1.0 + x);
+    int xexp = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64;
+    double f = as_double(ONEEXPBITS_DP64 | (ux & MANTBITS_DP64));
+
+    int j = as_int2(ux).hi >> 13;
+    j = ((0x80 | (j & 0x7e)) >> 1) + (j & 0x1);
+    double f1 = (double)j * 0x1.0p-6;
+    j -= 64;
+
+    double f2temp = f - f1;
+    double m2 = as_double(convert_ulong(0x3ff - xexp) << EXPSHIFTBITS_DP64);
+    double f2l = fma(m2, x, m2 - f1);
+    double f2g = fma(m2, x, -f1) + m2;
+    double f2 = xexp <= MANTLENGTH_DP64-1 ? f2l : f2g;
+    f2 = (xexp <= -2) | (xexp >= MANTLENGTH_DP64+8) ? f2temp : f2;
+
+    double2 tv = USE_TABLE(ln_tbl, j);
+    double z1 = tv.s0;
+    double q = tv.s1;
+
+    double u = MATH_DIVIDE(f2, fma(0.5, f2, f1));
+    double v = u * u;
+
+    double poly = v * fma(v,
+                          fma(v, 2.23219810758559851206e-03, 1.24999999978138668903e-02),
+                          8.33333333333333593622e-02);
+
+    // log2_lead and log2_tail sum to an extra-precise version of log(2)
+    const double log2_lead = 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
+    const double log2_tail = 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */
+
+    double z2 = q + fma(u, poly, u);
+    double dxexp = (double)xexp;
+    double r1 = fma(dxexp, log2_lead, z1);
+    double r2 = fma(dxexp, log2_tail, z2);
+    double result1 = r1 + r2;
+
+    // Process Outside the threshold now
+    double r = x;
+    u = r / (2.0 + r);
+    double correction = r * u;
+    u = u + u;
+    v = u * u;
+    r1 = r;
+
+    poly = fma(v,
+               fma(v,
+                   fma(v, 4.34887777707614552256e-04, 2.23213998791944806202e-03),
+                   1.25000000037717509602e-02),
+               8.33333333333317923934e-02);
+
+    r2 = fma(u*v, poly, -correction);
+
+    // The values exp(-1/16)-1 and exp(1/16)-1
+    const double log1p_thresh1 = -0x1.f0540438fd5c3p-5;
+    const double log1p_thresh2 =  0x1.082b577d34ed8p-4;
+    double result2 = r1 + r2;
+    result2 = x < log1p_thresh1 | x > log1p_thresh2 ? result1 : result2;
+
+    result2 = isinf(x) ? x : result2;
+    result2 = x < -1.0 ? as_double(QNANBITPATT_DP64) : result2;
+    result2 = x == -1.0 ? as_double(NINFBITPATT_DP64) : result2;
+    return result2;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log1p, double);
+
+#endif // cl_khr_fp64
diff --git a/generic/lib/math/math.h b/generic/lib/math/math.h
index 53ed38a..f46c7ea 100644
--- a/generic/lib/math/math.h
+++ b/generic/lib/math/math.h
@@ -61,4 +61,30 @@
 #define MANTLENGTH_SP32   24
 #define BASEDIGITS_SP32   7
 
+#ifdef cl_khr_fp64
+
+#define SIGNBIT_DP64      0x8000000000000000L
+#define EXSIGNBIT_DP64    0x7fffffffffffffffL
+#define EXPBITS_DP64      0x7ff0000000000000L
+#define MANTBITS_DP64     0x000fffffffffffffL
+#define ONEEXPBITS_DP64   0x3ff0000000000000L
+#define TWOEXPBITS_DP64   0x4000000000000000L
+#define HALFEXPBITS_DP64  0x3fe0000000000000L
+#define IMPBIT_DP64       0x0010000000000000L
+#define QNANBITPATT_DP64  0x7ff8000000000000L
+#define INDEFBITPATT_DP64 0xfff8000000000000L
+#define PINFBITPATT_DP64  0x7ff0000000000000L
+#define NINFBITPATT_DP64  0xfff0000000000000L
+#define EXPBIAS_DP64      1023
+#define EXPSHIFTBITS_DP64 52
+#define BIASEDEMIN_DP64   1
+#define EMIN_DP64         -1022
+#define BIASEDEMAX_DP64   2046 /* 0x7fe */
+#define EMAX_DP64         1023 /* 0x3ff */
+#define LAMBDA_DP64       1.0e300
+#define MANTLENGTH_DP64   53
+#define BASEDIGITS_DP64   15
+
+#endif // cl_khr_fp64
+
 #define ALIGNED(x)	__attribute__((aligned(x)))
diff --git a/generic/lib/math/tables.cl b/generic/lib/math/tables.cl
new file mode 100644
index 0000000..b5345a2
--- /dev/null
+++ b/generic/lib/math/tables.cl
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "tables.h"
+
+DECLARE_TABLE(float2, LOGE_TBL, 129) = {
+    (float2)(0x0.000000p+0f, 0x0.000000p+0f),
+    (float2)(0x1.fe0000p-8f, 0x1.535882p-23f),
+    (float2)(0x1.fc0000p-7f, 0x1.5161f8p-20f),
+    (float2)(0x1.7b8000p-6f, 0x1.1b07d4p-18f),
+    (float2)(0x1.f82000p-6f, 0x1.361cf0p-19f),
+    (float2)(0x1.39e000p-5f, 0x1.0f73fcp-18f),
+    (float2)(0x1.774000p-5f, 0x1.63d8cap-19f),
+    (float2)(0x1.b42000p-5f, 0x1.bae232p-18f),
+    (float2)(0x1.f0a000p-5f, 0x1.86008ap-20f),
+    (float2)(0x1.164000p-4f, 0x1.36eea2p-16f),
+    (float2)(0x1.340000p-4f, 0x1.d7961ap-16f),
+    (float2)(0x1.51a000p-4f, 0x1.073f06p-16f),
+    (float2)(0x1.6f0000p-4f, 0x1.a515cap-17f),
+    (float2)(0x1.8c2000p-4f, 0x1.45d630p-16f),
+    (float2)(0x1.a92000p-4f, 0x1.b4e92ap-18f),
+    (float2)(0x1.c5e000p-4f, 0x1.523d6ep-18f),
+    (float2)(0x1.e26000p-4f, 0x1.076e2ap-16f),
+    (float2)(0x1.fec000p-4f, 0x1.2263b6p-17f),
+    (float2)(0x1.0d6000p-3f, 0x1.7e7cd0p-15f),
+    (float2)(0x1.1b6000p-3f, 0x1.2ad52ep-15f),
+    (float2)(0x1.294000p-3f, 0x1.52f81ep-15f),
+    (float2)(0x1.370000p-3f, 0x1.fc201ep-15f),
+    (float2)(0x1.44c000p-3f, 0x1.2b6ccap-15f),
+    (float2)(0x1.526000p-3f, 0x1.cbc742p-16f),
+    (float2)(0x1.5fe000p-3f, 0x1.3070a6p-15f),
+    (float2)(0x1.6d6000p-3f, 0x1.fce33ap-20f),
+    (float2)(0x1.7aa000p-3f, 0x1.890210p-15f),
+    (float2)(0x1.87e000p-3f, 0x1.a06520p-15f),
+    (float2)(0x1.952000p-3f, 0x1.6a73d0p-17f),
+    (float2)(0x1.a22000p-3f, 0x1.bc1fe2p-15f),
+    (float2)(0x1.af2000p-3f, 0x1.c94e80p-15f),
+    (float2)(0x1.bc2000p-3f, 0x1.0ce85ap-16f),
+    (float2)(0x1.c8e000p-3f, 0x1.f7c79ap-15f),
+    (float2)(0x1.d5c000p-3f, 0x1.0b5a7cp-18f),
+    (float2)(0x1.e26000p-3f, 0x1.076e2ap-15f),
+    (float2)(0x1.ef0000p-3f, 0x1.5b97b8p-16f),
+    (float2)(0x1.fb8000p-3f, 0x1.186d5ep-15f),
+    (float2)(0x1.040000p-2f, 0x1.2ca5a6p-17f),
+    (float2)(0x1.0a2000p-2f, 0x1.24e272p-14f),
+    (float2)(0x1.104000p-2f, 0x1.8bf9aep-14f),
+    (float2)(0x1.166000p-2f, 0x1.5cabaap-14f),
+    (float2)(0x1.1c8000p-2f, 0x1.3182d2p-15f),
+    (float2)(0x1.228000p-2f, 0x1.41fbcep-14f),
+    (float2)(0x1.288000p-2f, 0x1.5a13dep-14f),
+    (float2)(0x1.2e8000p-2f, 0x1.c575c2p-15f),
+    (float2)(0x1.346000p-2f, 0x1.dd9a98p-14f),
+    (float2)(0x1.3a6000p-2f, 0x1.3155a4p-16f),
+    (float2)(0x1.404000p-2f, 0x1.843434p-17f),
+    (float2)(0x1.460000p-2f, 0x1.8bc21cp-14f),
+    (float2)(0x1.4be000p-2f, 0x1.7e55dcp-16f),
+    (float2)(0x1.51a000p-2f, 0x1.5b0e5ap-15f),
+    (float2)(0x1.576000p-2f, 0x1.dc5d14p-16f),
+    (float2)(0x1.5d0000p-2f, 0x1.bdbf58p-14f),
+    (float2)(0x1.62c000p-2f, 0x1.05e572p-15f),
+    (float2)(0x1.686000p-2f, 0x1.903d36p-15f),
+    (float2)(0x1.6e0000p-2f, 0x1.1d5456p-15f),
+    (float2)(0x1.738000p-2f, 0x1.d7f6bap-14f),
+    (float2)(0x1.792000p-2f, 0x1.4abfbap-15f),
+    (float2)(0x1.7ea000p-2f, 0x1.f07704p-15f),
+    (float2)(0x1.842000p-2f, 0x1.a3b43cp-15f),
+    (float2)(0x1.89a000p-2f, 0x1.9c360ap-17f),
+    (float2)(0x1.8f0000p-2f, 0x1.1e8736p-14f),
+    (float2)(0x1.946000p-2f, 0x1.941c20p-14f),
+    (float2)(0x1.99c000p-2f, 0x1.958116p-14f),
+    (float2)(0x1.9f2000p-2f, 0x1.23ecbep-14f),
+    (float2)(0x1.a48000p-2f, 0x1.024396p-16f),
+    (float2)(0x1.a9c000p-2f, 0x1.d93534p-15f),
+    (float2)(0x1.af0000p-2f, 0x1.293246p-14f),
+    (float2)(0x1.b44000p-2f, 0x1.eef798p-15f),
+    (float2)(0x1.b98000p-2f, 0x1.625a4cp-16f),
+    (float2)(0x1.bea000p-2f, 0x1.4d9da6p-14f),
+    (float2)(0x1.c3c000p-2f, 0x1.d7a7ccp-14f),
+    (float2)(0x1.c8e000p-2f, 0x1.f7c79ap-14f),
+    (float2)(0x1.ce0000p-2f, 0x1.af0b84p-14f),
+    (float2)(0x1.d32000p-2f, 0x1.fcfc00p-15f),
+    (float2)(0x1.d82000p-2f, 0x1.e7258ap-14f),
+    (float2)(0x1.dd4000p-2f, 0x1.a81306p-16f),
+    (float2)(0x1.e24000p-2f, 0x1.1034f8p-15f),
+    (float2)(0x1.e74000p-2f, 0x1.09875ap-16f),
+    (float2)(0x1.ec2000p-2f, 0x1.99d246p-14f),
+    (float2)(0x1.f12000p-2f, 0x1.1ebf5ep-15f),
+    (float2)(0x1.f60000p-2f, 0x1.23fa70p-14f),
+    (float2)(0x1.fae000p-2f, 0x1.588f78p-14f),
+    (float2)(0x1.ffc000p-2f, 0x1.2e0856p-14f),
+    (float2)(0x1.024000p-1f, 0x1.52a5a4p-13f),
+    (float2)(0x1.04a000p-1f, 0x1.df9da8p-13f),
+    (float2)(0x1.072000p-1f, 0x1.f2e0e6p-16f),
+    (float2)(0x1.098000p-1f, 0x1.bd3d5cp-15f),
+    (float2)(0x1.0be000p-1f, 0x1.cb9094p-15f),
+    (float2)(0x1.0e4000p-1f, 0x1.261746p-15f),
+    (float2)(0x1.108000p-1f, 0x1.f39e2cp-13f),
+    (float2)(0x1.12e000p-1f, 0x1.719592p-13f),
+    (float2)(0x1.154000p-1f, 0x1.87a5e8p-14f),
+    (float2)(0x1.178000p-1f, 0x1.eabbd8p-13f),
+    (float2)(0x1.19e000p-1f, 0x1.cd68cep-14f),
+    (float2)(0x1.1c2000p-1f, 0x1.b81f70p-13f),
+    (float2)(0x1.1e8000p-1f, 0x1.7d79c0p-15f),
+    (float2)(0x1.20c000p-1f, 0x1.b9a324p-14f),
+    (float2)(0x1.230000p-1f, 0x1.30d7bep-13f),
+    (float2)(0x1.254000p-1f, 0x1.5bce98p-13f),
+    (float2)(0x1.278000p-1f, 0x1.5e1288p-13f),
+    (float2)(0x1.29c000p-1f, 0x1.37fec2p-13f),
+    (float2)(0x1.2c0000p-1f, 0x1.d3da88p-14f),
+    (float2)(0x1.2e4000p-1f, 0x1.d0db90p-15f),
+    (float2)(0x1.306000p-1f, 0x1.d7334ep-13f),
+    (float2)(0x1.32a000p-1f, 0x1.133912p-13f),
+    (float2)(0x1.34e000p-1f, 0x1.44ece6p-16f),
+    (float2)(0x1.370000p-1f, 0x1.17b546p-13f),
+    (float2)(0x1.392000p-1f, 0x1.e0d356p-13f),
+    (float2)(0x1.3b6000p-1f, 0x1.0893fep-14f),
+    (float2)(0x1.3d8000p-1f, 0x1.026a70p-13f),
+    (float2)(0x1.3fa000p-1f, 0x1.5b84d0p-13f),
+    (float2)(0x1.41c000p-1f, 0x1.8fe846p-13f),
+    (float2)(0x1.43e000p-1f, 0x1.9fe2f8p-13f),
+    (float2)(0x1.460000p-1f, 0x1.8bc21cp-13f),
+    (float2)(0x1.482000p-1f, 0x1.53d1eap-13f),
+    (float2)(0x1.4a4000p-1f, 0x1.f0bb60p-14f),
+    (float2)(0x1.4c6000p-1f, 0x1.e6bf32p-15f),
+    (float2)(0x1.4e6000p-1f, 0x1.d811b6p-13f),
+    (float2)(0x1.508000p-1f, 0x1.13cc00p-13f),
+    (float2)(0x1.52a000p-1f, 0x1.6932dep-16f),
+    (float2)(0x1.54a000p-1f, 0x1.246798p-13f),
+    (float2)(0x1.56a000p-1f, 0x1.f9d5b2p-13f),
+    (float2)(0x1.58c000p-1f, 0x1.5b6b9ap-14f),
+    (float2)(0x1.5ac000p-1f, 0x1.404c34p-13f),
+    (float2)(0x1.5cc000p-1f, 0x1.b1dc6cp-13f),
+    (float2)(0x1.5ee000p-1f, 0x1.54920ap-20f),
+    (float2)(0x1.60e000p-1f, 0x1.97a23cp-16f),
+    (float2)(0x1.62e000p-1f, 0x1.0bfbe8p-15f),
+};
+
+DECLARE_TABLE(float, LOG_INV_TBL, 129) = {
+    0x1.000000p+1f,
+    0x1.fc07f0p+0f,
+    0x1.f81f82p+0f,
+    0x1.f4465ap+0f,
+    0x1.f07c20p+0f,
+    0x1.ecc07cp+0f,
+    0x1.e9131ap+0f,
+    0x1.e573acp+0f,
+    0x1.e1e1e2p+0f,
+    0x1.de5d6ep+0f,
+    0x1.dae608p+0f,
+    0x1.d77b66p+0f,
+    0x1.d41d42p+0f,
+    0x1.d0cb58p+0f,
+    0x1.cd8568p+0f,
+    0x1.ca4b30p+0f,
+    0x1.c71c72p+0f,
+    0x1.c3f8f0p+0f,
+    0x1.c0e070p+0f,
+    0x1.bdd2b8p+0f,
+    0x1.bacf92p+0f,
+    0x1.b7d6c4p+0f,
+    0x1.b4e81cp+0f,
+    0x1.b20364p+0f,
+    0x1.af286cp+0f,
+    0x1.ac5702p+0f,
+    0x1.a98ef6p+0f,
+    0x1.a6d01ap+0f,
+    0x1.a41a42p+0f,
+    0x1.a16d40p+0f,
+    0x1.9ec8eap+0f,
+    0x1.9c2d14p+0f,
+    0x1.99999ap+0f,
+    0x1.970e50p+0f,
+    0x1.948b10p+0f,
+    0x1.920fb4p+0f,
+    0x1.8f9c18p+0f,
+    0x1.8d3018p+0f,
+    0x1.8acb90p+0f,
+    0x1.886e60p+0f,
+    0x1.861862p+0f,
+    0x1.83c978p+0f,
+    0x1.818182p+0f,
+    0x1.7f4060p+0f,
+    0x1.7d05f4p+0f,
+    0x1.7ad220p+0f,
+    0x1.78a4c8p+0f,
+    0x1.767dcep+0f,
+    0x1.745d18p+0f,
+    0x1.724288p+0f,
+    0x1.702e06p+0f,
+    0x1.6e1f76p+0f,
+    0x1.6c16c2p+0f,
+    0x1.6a13cep+0f,
+    0x1.681682p+0f,
+    0x1.661ec6p+0f,
+    0x1.642c86p+0f,
+    0x1.623fa8p+0f,
+    0x1.605816p+0f,
+    0x1.5e75bcp+0f,
+    0x1.5c9882p+0f,
+    0x1.5ac056p+0f,
+    0x1.58ed24p+0f,
+    0x1.571ed4p+0f,
+    0x1.555556p+0f,
+    0x1.539094p+0f,
+    0x1.51d07ep+0f,
+    0x1.501502p+0f,
+    0x1.4e5e0ap+0f,
+    0x1.4cab88p+0f,
+    0x1.4afd6ap+0f,
+    0x1.49539ep+0f,
+    0x1.47ae14p+0f,
+    0x1.460cbcp+0f,
+    0x1.446f86p+0f,
+    0x1.42d662p+0f,
+    0x1.414142p+0f,
+    0x1.3fb014p+0f,
+    0x1.3e22ccp+0f,
+    0x1.3c995ap+0f,
+    0x1.3b13b2p+0f,
+    0x1.3991c2p+0f,
+    0x1.381382p+0f,
+    0x1.3698e0p+0f,
+    0x1.3521d0p+0f,
+    0x1.33ae46p+0f,
+    0x1.323e34p+0f,
+    0x1.30d190p+0f,
+    0x1.2f684cp+0f,
+    0x1.2e025cp+0f,
+    0x1.2c9fb4p+0f,
+    0x1.2b404ap+0f,
+    0x1.29e412p+0f,
+    0x1.288b02p+0f,
+    0x1.27350cp+0f,
+    0x1.25e228p+0f,
+    0x1.24924ap+0f,
+    0x1.234568p+0f,
+    0x1.21fb78p+0f,
+    0x1.20b470p+0f,
+    0x1.1f7048p+0f,
+    0x1.1e2ef4p+0f,
+    0x1.1cf06ap+0f,
+    0x1.1bb4a4p+0f,
+    0x1.1a7b96p+0f,
+    0x1.194538p+0f,
+    0x1.181182p+0f,
+    0x1.16e068p+0f,
+    0x1.15b1e6p+0f,
+    0x1.1485f0p+0f,
+    0x1.135c82p+0f,
+    0x1.12358ep+0f,
+    0x1.111112p+0f,
+    0x1.0fef02p+0f,
+    0x1.0ecf56p+0f,
+    0x1.0db20ap+0f,
+    0x1.0c9714p+0f,
+    0x1.0b7e6ep+0f,
+    0x1.0a6810p+0f,
+    0x1.0953f4p+0f,
+    0x1.084210p+0f,
+    0x1.073260p+0f,
+    0x1.0624dep+0f,
+    0x1.051980p+0f,
+    0x1.041042p+0f,
+    0x1.03091cp+0f,
+    0x1.020408p+0f,
+    0x1.010102p+0f,
+    0x1.000000p+0f,
+};
+
+TABLE_FUNCTION(float2, LOGE_TBL, loge_tbl);
+TABLE_FUNCTION(float, LOG_INV_TBL, log_inv_tbl);
+
+#ifdef cl_khr_fp64
+
+DECLARE_TABLE(double2, LN_TBL, 65) = {
+    (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.fc0a800000000p-7, 0x1.61f807c79f3dbp-28),
+    (double2)(0x1.f829800000000p-6, 0x1.873c1980267c8p-25),
+    (double2)(0x1.7745800000000p-5, 0x1.ec65b9f88c69ep-26),
+    (double2)(0x1.f0a3000000000p-5, 0x1.8022c54cc2f99p-26),
+    (double2)(0x1.341d700000000p-4, 0x1.2c37a3a125330p-25),
+    (double2)(0x1.6f0d200000000p-4, 0x1.15cad69737c93p-25),
+    (double2)(0x1.a926d00000000p-4, 0x1.d256ab1b285e9p-27),
+    (double2)(0x1.e270700000000p-4, 0x1.b8abcb97a7aa2p-26),
+    (double2)(0x1.0d77e00000000p-3, 0x1.f34239659a5dcp-25),
+    (double2)(0x1.2955280000000p-3, 0x1.e07fd48d30177p-25),
+    (double2)(0x1.44d2b00000000p-3, 0x1.b32df4799f4f6p-25),
+    (double2)(0x1.5ff3000000000p-3, 0x1.c29e4f4f21cf8p-25),
+    (double2)(0x1.7ab8900000000p-3, 0x1.086c848df1b59p-30),
+    (double2)(0x1.9525a80000000p-3, 0x1.cf456b4764130p-27),
+    (double2)(0x1.af3c900000000p-3, 0x1.3a02ffcb63398p-25),
+    (double2)(0x1.c8ff780000000p-3, 0x1.1e6a6886b0976p-25),
+    (double2)(0x1.e270700000000p-3, 0x1.b8abcb97a7aa2p-25),
+    (double2)(0x1.fb91800000000p-3, 0x1.b578f8aa35552p-25),
+    (double2)(0x1.0a324c0000000p-2, 0x1.139c871afb9fcp-25),
+    (double2)(0x1.1675c80000000p-2, 0x1.5d5d30701ce64p-25),
+    (double2)(0x1.22941c0000000p-2, 0x1.de7bcb2d12142p-25),
+    (double2)(0x1.2e8e280000000p-2, 0x1.d708e984e1664p-25),
+    (double2)(0x1.3a64c40000000p-2, 0x1.56945e9c72f36p-26),
+    (double2)(0x1.4618bc0000000p-2, 0x1.0e2f613e85bdap-29),
+    (double2)(0x1.51aad80000000p-2, 0x1.cb7e0b42724f6p-28),
+    (double2)(0x1.5d1bd80000000p-2, 0x1.fac04e52846c7p-25),
+    (double2)(0x1.686c800000000p-2, 0x1.e9b14aec442bep-26),
+    (double2)(0x1.739d7c0000000p-2, 0x1.b5de8034e7126p-25),
+    (double2)(0x1.7eaf800000000p-2, 0x1.dc157e1b259d3p-25),
+    (double2)(0x1.89a3380000000p-2, 0x1.b05096ad69c62p-28),
+    (double2)(0x1.9479400000000p-2, 0x1.c2116faba4cddp-26),
+    (double2)(0x1.9f323c0000000p-2, 0x1.65fcc25f95b47p-25),
+    (double2)(0x1.a9cec80000000p-2, 0x1.a9a08498d4850p-26),
+    (double2)(0x1.b44f740000000p-2, 0x1.de647b1465f77p-25),
+    (double2)(0x1.beb4d80000000p-2, 0x1.da71b7bf7861dp-26),
+    (double2)(0x1.c8ff7c0000000p-2, 0x1.e6a6886b09760p-28),
+    (double2)(0x1.d32fe40000000p-2, 0x1.f0075eab0ef64p-25),
+    (double2)(0x1.dd46a00000000p-2, 0x1.3071282fb989bp-28),
+    (double2)(0x1.e744240000000p-2, 0x1.0eb43c3f1bed2p-25),
+    (double2)(0x1.f128f40000000p-2, 0x1.faf06ecb35c84p-26),
+    (double2)(0x1.faf5880000000p-2, 0x1.ef1e63db35f68p-27),
+    (double2)(0x1.02552a0000000p-1, 0x1.69743fb1a71a5p-27),
+    (double2)(0x1.0723e40000000p-1, 0x1.c1cdf404e5796p-25),
+    (double2)(0x1.0be72e0000000p-1, 0x1.094aa0ada625ep-27),
+    (double2)(0x1.109f380000000p-1, 0x1.e2d4c96fde3ecp-25),
+    (double2)(0x1.154c3c0000000p-1, 0x1.2f4d5e9a98f34p-25),
+    (double2)(0x1.19ee6a0000000p-1, 0x1.467c96ecc5cbep-25),
+    (double2)(0x1.1e85f40000000p-1, 0x1.e7040d03dec5ap-25),
+    (double2)(0x1.23130c0000000p-1, 0x1.7bebf4282de36p-25),
+    (double2)(0x1.2795e00000000p-1, 0x1.289b11aeb783fp-25),
+    (double2)(0x1.2c0e9e0000000p-1, 0x1.a891d1772f538p-26),
+    (double2)(0x1.307d720000000p-1, 0x1.34f10be1fb591p-25),
+    (double2)(0x1.34e2880000000p-1, 0x1.d9ce1d316eb93p-25),
+    (double2)(0x1.393e0c0000000p-1, 0x1.3562a19a9c442p-25),
+    (double2)(0x1.3d90260000000p-1, 0x1.4e2adf548084cp-26),
+    (double2)(0x1.41d8fe0000000p-1, 0x1.08ce55cc8c97ap-26),
+    (double2)(0x1.4618bc0000000p-1, 0x1.0e2f613e85bdap-28),
+    (double2)(0x1.4a4f840000000p-1, 0x1.db03ebb0227bfp-25),
+    (double2)(0x1.4e7d800000000p-1, 0x1.1b75bb09cb098p-25),
+    (double2)(0x1.52a2d20000000p-1, 0x1.96f16abb9df22p-27),
+    (double2)(0x1.56bf9c0000000p-1, 0x1.5b3f399411c62p-25),
+    (double2)(0x1.5ad4040000000p-1, 0x1.86b3e59f65355p-26),
+    (double2)(0x1.5ee02a0000000p-1, 0x1.2482ceae1ac12p-26),
+    (double2)(0x1.62e42e0000000p-1, 0x1.efa39ef35793cp-25),
+};
+
+TABLE_FUNCTION(double2, LN_TBL, ln_tbl);
+
+#endif // cl_khr_fp64
diff --git a/generic/lib/math/tables.h b/generic/lib/math/tables.h
new file mode 100644
index 0000000..9255440
--- /dev/null
+++ b/generic/lib/math/tables.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define TABLE_SPACE __constant
+
+#define TABLE_MANGLE(NAME) __clc_##NAME
+
+#define DECLARE_TABLE(TYPE,NAME,LENGTH) \
+    TABLE_SPACE TYPE NAME [ LENGTH ]
+
+#define TABLE_FUNCTION(TYPE,TABLE,NAME) \
+    TYPE TABLE_MANGLE(NAME)(size_t idx) { \
+        return TABLE[idx]; \
+    }
+
+#define TABLE_FUNCTION_DECL(TYPE, NAME) \
+    TYPE TABLE_MANGLE(NAME)(size_t idx);
+
+#define USE_TABLE(NAME, IDX) \
+    TABLE_MANGLE(NAME)(IDX)
+
+TABLE_FUNCTION_DECL(float2, loge_tbl);
+TABLE_FUNCTION_DECL(float, log_inv_tbl);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+TABLE_FUNCTION_DECL(double2, ln_tbl);
+
+#endif // cl_khr_fp64
-- 
1.8.5.5
    
    
More information about the Libclc-dev
mailing list