[libclc] libclc: Update f64 trig functions (PR #187455)
Matt Arsenault via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 19 01:11:57 PDT 2026
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/187455
>From 32f7ae21ddee43d581cc70a5448e83ff7e1e2e02 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 13 Mar 2026 19:06:29 +0100
Subject: [PATCH] libclc: Update f64 trig functions
Most of of this was originally ported from rocm
device libs in 2e6ff0c66e180998425776a27579559dc099732f. Merge
in more recent changes.
---
.../include/clc/math/clc_get_twobypi_bits.h | 25 ++
.../clc/math/clc_get_twobypi_bits_decl.inc | 10 +
.../clc/math/clc_sincos_helpers_fp64_decl.inc | 29 +-
libclc/clc/include/clc/math/tables.h | 2 +-
libclc/clc/lib/amdgpu/CMakeLists.txt | 1 +
.../lib/amdgpu/math/clc_get_twobypi_bits.cl | 21 +
libclc/clc/lib/generic/CMakeLists.txt | 1 +
libclc/clc/lib/generic/math/clc_cos.inc | 29 +-
.../lib/generic/math/clc_get_twobypi_bits.cl | 17 +
.../lib/generic/math/clc_get_twobypi_bits.inc | 40 ++
libclc/clc/lib/generic/math/clc_sin.inc | 32 +-
.../lib/generic/math/clc_sincos_helpers.cl | 11 +-
.../generic/math/clc_sincos_helpers_fp64.inc | 373 ++++++++----------
libclc/clc/lib/generic/math/clc_tables.cl | 103 +++--
libclc/clc/lib/generic/math/clc_tan.inc | 32 +-
15 files changed, 386 insertions(+), 340 deletions(-)
create mode 100644 libclc/clc/include/clc/math/clc_get_twobypi_bits.h
create mode 100644 libclc/clc/include/clc/math/clc_get_twobypi_bits_decl.inc
create mode 100644 libclc/clc/lib/amdgpu/math/clc_get_twobypi_bits.cl
create mode 100644 libclc/clc/lib/generic/math/clc_get_twobypi_bits.cl
create mode 100644 libclc/clc/lib/generic/math/clc_get_twobypi_bits.inc
diff --git a/libclc/clc/include/clc/math/clc_get_twobypi_bits.h b/libclc/clc/include/clc/math/clc_get_twobypi_bits.h
new file mode 100644
index 0000000000000..82aed9df5dbab
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_get_twobypi_bits.h
@@ -0,0 +1,25 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility function for trigonometric reductions to extract bits out of 2/pi
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_GET_TWOBYPI_BITS_H__
+#define __CLC_MATH_CLC_GET_TWOBYPI_BITS_H__
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_BODY <clc/math/clc_get_twobypi_bits_decl.inc>
+#define __CLC_FUNCTION __clc_get_twobypi_bits
+
+#include "clc/math/gentype.inc"
+
+#undef __CLC_FUNCTION
+#undef __CLC_DOUBLE_ONLY
+
+#endif // __CLC_MATH_CLC_GET_TWOBYPI_BITS_H__
diff --git a/libclc/clc/include/clc/math/clc_get_twobypi_bits_decl.inc b/libclc/clc/include/clc/math/clc_get_twobypi_bits_decl.inc
new file mode 100644
index 0000000000000..7b3b7bce17788
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_get_twobypi_bits_decl.inc
@@ -0,0 +1,10 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_get_twobypi_bits(__CLC_GENTYPE x, __CLC_INTN segment);
diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc
index 15934cab32751..bbae56c69c937 100644
--- a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc
@@ -6,6 +6,19 @@
//
//===----------------------------------------------------------------------===//
+typedef struct __CLC_XCONCAT(__clc_sincos_ret_, __CLC_GENTYPE) {
+ __CLC_GENTYPE sin, cos;
+} __CLC_XCONCAT(__clc_sincos_ret_, __CLC_GENTYPE);
+
+#define __CLC_SINCOS_RET_GENTYPE __CLC_XCONCAT(__clc_sincos_ret_, __CLC_GENTYPE)
+
+_CLC_DEF _CLC_OVERLOAD __CLC_SINCOS_RET_GENTYPE
+__clc_sincos_reduced_eval(__CLC_DOUBLEN x, __CLC_DOUBLEN y);
+
+_CLC_DEF _CLC_OVERLOAD __CLC_DOUBLEN __clc_tan_reduced_eval(__CLC_DOUBLEN x,
+ __CLC_DOUBLEN y,
+ __CLC_INTN is_odd);
+
_CLC_DECL _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x,
__CLC_DOUBLEN xx,
private __CLC_DOUBLEN *sinval,
@@ -15,12 +28,12 @@ _CLC_DECL _CLC_OVERLOAD void __clc_tan_piby4(__CLC_DOUBLEN x, __CLC_DOUBLEN xx,
private __CLC_DOUBLEN *leadval,
private __CLC_DOUBLEN *tailval);
-_CLC_DECL _CLC_OVERLOAD void
-__clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
- private __CLC_DOUBLEN *rr,
- private __CLC_INTN *regn);
+_CLC_DECL _CLC_OVERLOAD __CLC_INTN __clc_remainder_piby2_small(
+ __CLC_DOUBLEN x, private __CLC_DOUBLEN *r, private __CLC_DOUBLEN *rr);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_INTN __clc_remainder_piby2_large(
+ __CLC_DOUBLEN x, private __CLC_DOUBLEN *r, private __CLC_DOUBLEN *rr);
-_CLC_DECL _CLC_OVERLOAD void
-__clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
- private __CLC_DOUBLEN *rr,
- private __CLC_INTN *regn);
+_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionS(private __CLC_DOUBLEN *r,
+ private __CLC_DOUBLEN *rr,
+ __CLC_DOUBLEN x);
diff --git a/libclc/clc/include/clc/math/tables.h b/libclc/clc/include/clc/math/tables.h
index 12361a30357ac..66429aaf8344b 100644
--- a/libclc/clc/include/clc/math/tables.h
+++ b/libclc/clc/include/clc/math/tables.h
@@ -74,7 +74,7 @@ __CLC_TABLE_FUNCTION_DECL_VEC(float, cbrt_tbl_head);
__CLC_TABLE_FUNCTION_DECL_VEC(float, cbrt_tbl_tail);
__CLC_TABLE_FUNCTION_DECL_VEC(float, sinhcosh_tbl_head);
__CLC_TABLE_FUNCTION_DECL_VEC(float, sinhcosh_tbl_tail);
-__CLC_TABLE_FUNCTION_DECL_VEC(ulong, pibits_tbl);
+__CLC_TABLE_FUNCTION_DECL_VEC(uint, two_by_pi_bits_tbl);
#ifdef cl_khr_fp64
diff --git a/libclc/clc/lib/amdgpu/CMakeLists.txt b/libclc/clc/lib/amdgpu/CMakeLists.txt
index daccc00b841b3..a2a30c2941d6b 100644
--- a/libclc/clc/lib/amdgpu/CMakeLists.txt
+++ b/libclc/clc/lib/amdgpu/CMakeLists.txt
@@ -15,6 +15,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES
math/clc_half_recip.cl
math/clc_half_rsqrt.cl
math/clc_half_sqrt.cl
+ math/clc_get_twobypi_bits.cl
math/clc_ldexp.cl
math/clc_log2_fast.cl
math/clc_native_exp.cl
diff --git a/libclc/clc/lib/amdgpu/math/clc_get_twobypi_bits.cl b/libclc/clc/lib/amdgpu/math/clc_get_twobypi_bits.cl
new file mode 100644
index 0000000000000..5de3b357f58d4
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/math/clc_get_twobypi_bits.cl
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_get_twobypi_bits.h"
+
+_CLC_OVERLOAD _CLC_DEF double __clc_get_twobypi_bits(double x, int y) {
+ return __builtin_amdgcn_trig_preop(x, y);
+}
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_ARG2_SCALAR_TYPE int
+#define __CLC_FUNCTION __clc_get_twobypi_bits
+#define __CLC_IMPL_FUNCTION(x, y) __builtin_amdgcn_trig_preop(x, y)
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
diff --git a/libclc/clc/lib/generic/CMakeLists.txt b/libclc/clc/lib/generic/CMakeLists.txt
index f9eb15a0aafda..af6a556d33d1a 100644
--- a/libclc/clc/lib/generic/CMakeLists.txt
+++ b/libclc/clc/lib/generic/CMakeLists.txt
@@ -94,6 +94,7 @@ libclc_configure_source_list(CLC_GENERIC_SOURCES
math/clc_fract.cl
math/clc_frexp.cl
math/clc_frexp_exp.cl
+ math/clc_get_twobypi_bits.cl
math/clc_half_cos.cl
math/clc_half_divide.cl
math/clc_half_exp.cl
diff --git a/libclc/clc/lib/generic/math/clc_cos.inc b/libclc/clc/lib/generic/math/clc_cos.inc
index bd5a8679505e2..8f6d2391e50c0 100644
--- a/libclc/clc/lib/generic/math/clc_cos.inc
+++ b/libclc/clc/lib/generic/math/clc_cos.inc
@@ -30,30 +30,19 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cos(__CLC_GENTYPE x) {
#elif __CLC_FPSIZE == 64
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cos(__CLC_GENTYPE x) {
- x = __clc_select(x, __CLC_GENTYPE_NAN,
- __CLC_CONVERT_S_GENTYPE(__clc_isinf(x)));
+ x = __clc_select(x, __CLC_GENTYPE_NAN, __CLC_CONVERT_LONGN(__clc_isinf(x)));
- __CLC_GENTYPE absx = __clc_fabs(x);
+ __CLC_DOUBLEN absx = __clc_fabs(x);
- __CLC_BIT_INTN is_medium = absx < 0x1.0p+47;
+ __CLC_DOUBLEN reduced_lo, reduced_hi;
+ __CLC_INTN regn = __clc_argReductionS(&reduced_lo, &reduced_hi, absx);
- __CLC_INTN regn_m, regn_l;
- __CLC_GENTYPE r_m, r_l, rr_m, rr_l;
+ __CLC_SINCOS_RET_GENTYPE eval =
+ __clc_sincos_reduced_eval(reduced_hi, reduced_lo);
- __clc_remainder_piby2_medium(absx, &r_m, &rr_m, ®n_m);
- __clc_remainder_piby2_large(absx, &r_l, &rr_l, ®n_l);
-
- __CLC_GENTYPE r = is_medium ? r_m : r_l;
- __CLC_GENTYPE rr = is_medium ? rr_m : rr_l;
- __CLC_INTN regn = __CLC_CONVERT_INTN(is_medium) ? regn_m : regn_l;
-
- __CLC_GENTYPE sinval, cosval;
- __clc_sincos_piby4(r, rr, &sinval, &cosval);
- sinval = -sinval;
-
- __CLC_LONGN c =
- __CLC_AS_LONGN(__CLC_CONVERT_BIT_INTN((regn & 1) != 0) ? sinval : cosval);
- c ^= __CLC_CONVERT_BIT_INTN(regn > 1) << 63;
+ __CLC_ULONGN c = __CLC_AS_ULONGN(
+ __CLC_CONVERT_LONGN((regn & 1) != 0) ? -eval.sin : eval.cos);
+ c ^= __CLC_CONVERT_LONGN(regn > 1) ? SIGNBIT_DP64 : 0u;
return __CLC_AS_GENTYPE(c);
}
diff --git a/libclc/clc/lib/generic/math/clc_get_twobypi_bits.cl b/libclc/clc/lib/generic/math/clc_get_twobypi_bits.cl
new file mode 100644
index 0000000000000..5dfca5dac4d1c
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_get_twobypi_bits.cl
@@ -0,0 +1,17 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/clc_convert.h"
+#include "clc/integer/clc_clz.h"
+#include "clc/math/clc_get_twobypi_bits.h"
+#include "clc/math/tables.h"
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_FUNCTION __clc_get_twobypi_bits
+#define __CLC_BODY <clc_get_twobypi_bits.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/math/clc_get_twobypi_bits.inc b/libclc/clc/lib/generic/math/clc_get_twobypi_bits.inc
new file mode 100644
index 0000000000000..1c066ad11a58a
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_get_twobypi_bits.inc
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+_CLC_DEF _CLC_OVERLOAD __CLC_DOUBLEN __clc_get_twobypi_bits(__CLC_DOUBLEN x,
+ __CLC_INTN index) {
+ const __CLC_INTN e_clamp = 1077;
+ __CLC_INTN e = __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x) >> 52);
+ __CLC_INTN shift = e > e_clamp ? e - e_clamp : 0;
+ __CLC_INTN scale = e >= 0x7b0 ? 128 : 0;
+
+ __CLC_INTN start = shift + index * 53;
+
+ __CLC_INTN i = start >> 5;
+ __CLC_INTN b = start & 0x1f;
+
+ __CLC_UINTN w2 = __CLC_USE_TABLE(two_by_pi_bits_tbl, i);
+ __CLC_UINTN w1 = __CLC_USE_TABLE(two_by_pi_bits_tbl, i + 1);
+ __CLC_UINTN w0 = __CLC_USE_TABLE(two_by_pi_bits_tbl, i + 2);
+
+ __CLC_UINTN t = (w2 << b) | (w1 >> (32 - b));
+ w2 = b != 0 ? t : w2;
+
+ t = (w1 << b) | (w0 >> (32 - b));
+ w1 = b != 0 ? t : w1;
+ w1 &= 0xfffff800;
+
+ __CLC_INTN z = __CLC_CONVERT_INTN(__clc_clz(w2));
+ b = 11 - z;
+ w1 = (w1 >> b) | (w2 << (32 - b));
+ w2 >>= b;
+
+ return __CLC_AS_DOUBLEN(
+ (__CLC_CONVERT_ULONGN(1022 + scale - start - z) << 52) |
+ (__CLC_CONVERT_ULONGN(w2 & 0x000fffff) << 32) | __CLC_CONVERT_ULONGN(w1));
+}
diff --git a/libclc/clc/lib/generic/math/clc_sin.inc b/libclc/clc/lib/generic/math/clc_sin.inc
index be23f125a060d..3e839fdf43f17 100644
--- a/libclc/clc/lib/generic/math/clc_sin.inc
+++ b/libclc/clc/lib/generic/math/clc_sin.inc
@@ -32,31 +32,23 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sin(__CLC_GENTYPE x) {
#elif __CLC_FPSIZE == 64
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sin(__CLC_GENTYPE x) {
- x = __clc_select(x, __CLC_GENTYPE_NAN,
- __CLC_CONVERT_S_GENTYPE(__clc_isinf(x)));
+ x = __clc_select(x, __CLC_GENTYPE_NAN, __CLC_CONVERT_LONGN(__clc_isinf(x)));
- __CLC_GENTYPE absx = __clc_fabs(x);
+ __CLC_DOUBLEN absx = __clc_fabs(x);
- __CLC_BIT_INTN is_medium = absx < 0x1.0p+47;
+ __CLC_DOUBLEN reduced_lo, reduced_hi;
+ __CLC_INTN regn = __clc_argReductionS(&reduced_lo, &reduced_hi, absx);
- __CLC_INTN regn_m, regn_l;
- __CLC_GENTYPE r_m, r_l, rr_m, rr_l;
+ __CLC_SINCOS_RET_GENTYPE eval =
+ __clc_sincos_reduced_eval(reduced_hi, reduced_lo);
- __clc_remainder_piby2_medium(absx, &r_m, &rr_m, ®n_m);
- __clc_remainder_piby2_large(absx, &r_l, &rr_l, ®n_l);
+ __CLC_DOUBLEN s = __CLC_CONVERT_LONGN((regn & 1) == 0) ? eval.sin : eval.cos;
- __CLC_GENTYPE r = is_medium ? r_m : r_l;
- __CLC_GENTYPE rr = is_medium ? rr_m : rr_l;
- __CLC_INTN regn = __CLC_CONVERT_INTN(is_medium) ? regn_m : regn_l;
-
- __CLC_GENTYPE sinval, cosval;
- __clc_sincos_piby4(r, rr, &sinval, &cosval);
-
- __CLC_LONGN s =
- __CLC_AS_LONGN(__CLC_CONVERT_BIT_INTN((regn & 1) != 0) ? cosval : sinval);
-
- s ^= (__CLC_CONVERT_BIT_INTN(regn > 1) << 63) ^
- (__CLC_CONVERT_BIT_INTN(x < 0.0) << 63);
+ s = __CLC_AS_DOUBLEN(__CLC_AS_ULONGN(s) ^
+ (__CLC_CONVERT_LONGN(regn > 1)
+ ? (__CLC_ULONGN)SIGNBIT_DP64
+ : (__CLC_ULONGN)0) ^
+ (__CLC_AS_ULONGN(x) ^ __CLC_AS_ULONGN(absx)));
return __CLC_AS_GENTYPE(s);
}
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
index 60880c7fae298..8c899c08d57a3 100644
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
@@ -9,6 +9,7 @@
#include "clc/clc_convert.h"
#include "clc/integer/clc_clz.h"
#include "clc/internal/clc.h"
+#include "clc/math/clc_floor.h"
#include "clc/math/clc_fma.h"
#include "clc/math/clc_frexp.h"
#include "clc/math/clc_ldexp.h"
@@ -18,6 +19,8 @@
#include "clc/math/clc_sincos_helpers.h"
#include "clc/math/clc_trunc.h"
#include "clc/math/math.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_isnan.h"
#define bitalign(hi, lo, shift) __builtin_elementwise_fshr(hi, lo, shift)
@@ -30,14 +33,10 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#include "clc/math/clc_ep.h"
#include "clc/math/clc_fract.h"
+#include "clc/math/clc_get_twobypi_bits.h"
#include "clc/math/tables.h"
-#include "clc/shared/clc_max.h"
-
-#define bytealign(src0, src1, src2) \
- (__CLC_CONVERT_UINTN( \
- ((__CLC_CONVERT_LONGN((src0)) << 32) | __CLC_CONVERT_LONGN((src1))) >> \
- (((src2) & 3) * 8)))
#define __CLC_DOUBLE_ONLY
#define __CLC_BODY "clc_sincos_helpers_fp64.inc"
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
index ae97b7963f7b3..cdb947c30d49c 100644
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
@@ -8,6 +8,59 @@
#pragma OPENCL FP_CONTRACT OFF
+_CLC_DEF _CLC_OVERLOAD __CLC_SINCOS_RET_GENTYPE
+__clc_sincos_reduced_eval(__CLC_DOUBLEN x, __CLC_DOUBLEN y) {
+ const __CLC_DOUBLEN S0 = -0x1.5555555555555p-3;
+ const __CLC_DOUBLEN S1 = 0x1.1111111110bb3p-7;
+ const __CLC_DOUBLEN S2 = -0x1.a01a019e83e5cp-13;
+ const __CLC_DOUBLEN S3 = 0x1.71de3796cde01p-19;
+ const __CLC_DOUBLEN S4 = -0x1.ae600b42fdfa7p-26;
+ const __CLC_DOUBLEN S5 = 0x1.5e0b2f9a43bb8p-33;
+
+ const __CLC_DOUBLEN C0 = 0x1.5555555555555p-5;
+ const __CLC_DOUBLEN C1 = -0x1.6c16c16c16967p-10;
+ const __CLC_DOUBLEN C2 = 0x1.a01a019f4ec90p-16;
+ const __CLC_DOUBLEN C3 = -0x1.27e4fa17f65f6p-22;
+ const __CLC_DOUBLEN C4 = 0x1.1eeb69037ab78p-29;
+ const __CLC_DOUBLEN C5 = -0x1.907db46cc5e42p-37;
+
+ __CLC_DOUBLEN x2 = x * x;
+ __CLC_DOUBLEN x3 = x * x2;
+ __CLC_DOUBLEN r = 0.5 * x2;
+ __CLC_DOUBLEN t = 1.0 - r;
+ __CLC_DOUBLEN u = 1.0 - t;
+ __CLC_DOUBLEN v = u - r;
+
+ __CLC_DOUBLEN cxy = t + __clc_mad(x2 * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, C5, C4), C3), C2), C1), C0), __clc_mad(x, -y, v));
+ __CLC_DOUBLEN sxy = __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, S5, S4), S3), S2), S1);
+ sxy = x - __clc_mad(-x3, S0, __clc_mad(x2, __clc_mad(-x3, sxy, __CLC_FP_LIT(0.5) * y), -y));
+
+ __CLC_SINCOS_RET_GENTYPE ret;
+ ret.cos = cxy;
+ ret.sin = sxy;
+ return ret;
+}
+
+_CLC_DEF _CLC_OVERLOAD __CLC_DOUBLEN __clc_tan_reduced_eval(__CLC_DOUBLEN x,
+ __CLC_DOUBLEN xx,
+ __CLC_INTN is_odd) {
+ __CLC_DOUBLEN s = __clc_ep_sqr(__clc_ep_make_pair(x, xx)).hi;
+ __CLC_DOUBLEN p = s * __clc_mad(s, __clc_mad(s, __clc_mad(s, __clc_mad(s,
+ __clc_mad(s, __clc_mad(s, __clc_mad(s, __clc_mad(s,
+ __clc_mad(s, __clc_mad(s, __clc_mad(s, __clc_mad(s,
+ __clc_mad(s,
+ 0x1.5e089c751c08cp-16, -0x1.78809a9a29f71p-15),
+ 0x1.7746f90a8aaep-14), -0x1.bb44da6fbf144p-16),
+ 0x1.1e634a7943acfp-13), 0x1.d250fdeb68febp-13),
+ 0x1.37fd9b58c4d95p-11), 0x1.7d5af15120e2cp-10),
+ 0x1.d6d93e09491dfp-9), 0x1.226e12033784dp-7),
+ 0x1.664f49ac36ae2p-6), 0x1.ba1ba1b451c21p-5),
+ 0x1.11111111185b7p-3), 0x1.55555555554eep-2);
+ __CLC_EP_PAIR t = __clc_ep_fast_add(__clc_ep_make_pair(x, xx), __clc_ep_mul(x, p));
+ __CLC_EP_PAIR tr = __clc_ep_fast_recip(t);
+ return __CLC_CONVERT_LONGN(is_odd) ? -tr.hi : t.hi;
+}
+
_CLC_DEF _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x,
__CLC_DOUBLEN xx,
private __CLC_DOUBLEN *sinval,
@@ -49,8 +102,8 @@ _CLC_DEF _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x,
__CLC_DOUBLEN x2 = x * x;
__CLC_DOUBLEN x3 = x2 * x;
- __CLC_DOUBLEN r = (__CLC_DOUBLEN)0.5 * x2;
- __CLC_DOUBLEN t = (__CLC_DOUBLEN)1.0 - r;
+ __CLC_DOUBLEN r = __CLC_FP_LIT(0.5) * x2;
+ __CLC_DOUBLEN t = __CLC_FP_LIT(1.0) - r;
__CLC_DOUBLEN sp = __clc_fma(
__clc_fma(__clc_fma(__clc_fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
@@ -62,10 +115,11 @@ _CLC_DEF _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x,
x2, cc3),
x2, cc2),
x2, cc1),
- x2 * x2, __clc_fma(x, xx, (1.0 - t) - r));
+ x2 * x2, __clc_fma(x, xx, (__CLC_FP_LIT(1.0) - t) - r));
- *sinval =
- x - __clc_fma(-x3, sc1, __clc_fma(__clc_fma(-x3, sp, 0.5 * xx), x2, -xx));
+ *sinval = x - __clc_fma(-x3, sc1,
+ __clc_fma(__clc_fma(-x3, sp, __CLC_FP_LIT(0.5) * xx),
+ x2, -xx));
*cosval = cp;
}
@@ -131,229 +185,128 @@ _CLC_DEF _CLC_OVERLOAD void __clc_tan_piby4(__CLC_DOUBLEN x, __CLC_DOUBLEN xx,
*tailval = c ? tptr : tpr;
}
-// Reduction for medium sized arguments
-_CLC_DEF _CLC_OVERLOAD void
-__clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
- private __CLC_DOUBLEN *rr,
- private __CLC_INTN *regn) {
+// Reduction for small sized arguments
+_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_remainder_piby2_small(
+ __CLC_DOUBLEN x, private __CLC_DOUBLEN *rh, private __CLC_DOUBLEN *rt) {
// How many pi/2 is x a multiple of?
- const __CLC_DOUBLEN two_by_pi = 0x1.45f306dc9c883p-1;
- __CLC_DOUBLEN dnpi2 = __clc_trunc(__clc_fma(x, two_by_pi, 0.5));
+ const __CLC_DOUBLEN twobypi = 0x1.45f306dc9c883p-1;
+ const __CLC_DOUBLEN piby2_h = 0x1.921fb54442d18p+0;
+ const __CLC_DOUBLEN piby2_m = 0x1.1a62633145c00p-54;
+ const __CLC_DOUBLEN piby2_t = 0x1.b839a252049c0p-104;
- const __CLC_DOUBLEN piby2_h = -7074237752028440.0 / 0x1.0p+52;
- const __CLC_DOUBLEN piby2_m = -2483878800010755.0 / 0x1.0p+105;
- const __CLC_DOUBLEN piby2_t = -3956492004828932.0 / 0x1.0p+158;
+ __CLC_DOUBLEN dn_pi2 = __clc_rint(x * twobypi);
// Compute product of npi2 with 159 bits of 2/pi
- __CLC_DOUBLEN p_hh = piby2_h * dnpi2;
- __CLC_DOUBLEN p_ht = __clc_fma(piby2_h, dnpi2, -p_hh);
- __CLC_DOUBLEN p_mh = piby2_m * dnpi2;
- __CLC_DOUBLEN p_mt = __clc_fma(piby2_m, dnpi2, -p_mh);
- __CLC_DOUBLEN p_th = piby2_t * dnpi2;
- __CLC_DOUBLEN p_tt = __clc_fma(piby2_t, dnpi2, -p_th);
+ __CLC_DOUBLEN xt = __clc_fma(dn_pi2, -piby2_h, x);
+ __CLC_DOUBLEN yh = __clc_fma(dn_pi2, -piby2_m, xt);
+ __CLC_DOUBLEN ph = dn_pi2 * piby2_m;
+ __CLC_DOUBLEN pt = __clc_fma(dn_pi2, piby2_m, -ph);
// Reduce to 159 bits
- __CLC_DOUBLEN ph = p_hh;
- __CLC_DOUBLEN pm = p_ht + p_mh;
- __CLC_DOUBLEN t = p_mh - (pm - p_ht);
- __CLC_DOUBLEN pt = p_th + t + p_mt + p_tt;
- t = ph + pm;
- pm = pm - (t - ph);
- ph = t;
- t = pm + pt;
- pt = pt - (t - pm);
- pm = t;
-
- // Subtract from x
- t = x + ph;
- __CLC_DOUBLEN qh = t + pm;
- __CLC_DOUBLEN qt = pm - (qh - t) + pt;
-
- *r = qh;
- *rr = qt;
- *regn = __CLC_CONVERT_INTN(__CLC_CONVERT_LONGN(dnpi2)) & 0x3;
+ __CLC_DOUBLEN th = xt - ph;
+ __CLC_DOUBLEN tt = (xt - th) - ph;
+ __CLC_DOUBLEN yt = __clc_fma(dn_pi2, -piby2_t, ((th - yh) + tt) - pt);
+
+ __CLC_EP_PAIR result = __clc_ep_fast_add(yh, yt);
+ *rh = result.lo;
+ *rt = result.hi;
+
+ return __CLC_CONVERT_INTN(dn_pi2) & 0x3;
}
// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
// extra precision, and return the result in r, rr.
// Return value "regn" tells how many lots of pi/2 were subtracted
// from x to put it in the range [-pi/4,pi/4], mod 4.
-_CLC_DEF _CLC_OVERLOAD void
-__clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
- private __CLC_DOUBLEN *rr,
- private __CLC_INTN *regn) {
-
- __CLC_LONGN ux = __CLC_AS_LONGN(x);
- __CLC_INTN e = __CLC_CONVERT_INTN(ux >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
- __CLC_INTN i = __clc_max(23, (e >> 3) + 17);
- __CLC_INTN j = 150 - i;
- __CLC_INTN j16 = j & ~0xf;
- __CLC_DOUBLEN fract_temp;
-
- // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary
- // byte boundary
- __CLC_ULONGN q0 = __CLC_USE_TABLE(pibits_tbl, j16);
- __CLC_ULONGN q1 = __CLC_USE_TABLE(pibits_tbl, (j16 + 8));
- __CLC_ULONGN q2 = __CLC_USE_TABLE(pibits_tbl, (j16 + 16));
- __CLC_ULONGN q3 = __CLC_USE_TABLE(pibits_tbl, (j16 + 24));
- __CLC_ULONGN q4 = __CLC_USE_TABLE(pibits_tbl, (j16 + 32));
-
- __CLC_UINTN q0s0 = __CLC_CONVERT_UINTN(q0);
- __CLC_UINTN q0s1 = __CLC_CONVERT_UINTN(q0 >> 32);
-
- __CLC_UINTN q1s0 = __CLC_CONVERT_UINTN(q1);
- __CLC_UINTN q1s1 = __CLC_CONVERT_UINTN(q1 >> 32);
-
- __CLC_UINTN q2s0 = __CLC_CONVERT_UINTN(q2);
- __CLC_UINTN q2s1 = __CLC_CONVERT_UINTN(q2 >> 32);
-
- __CLC_UINTN q3s0 = __CLC_CONVERT_UINTN(q3);
- __CLC_UINTN q3s1 = __CLC_CONVERT_UINTN(q3 >> 32);
-
- __CLC_UINTN q4s0 = __CLC_CONVERT_UINTN(q4);
- __CLC_UINTN q4s1 = __CLC_CONVERT_UINTN(q4 >> 32);
-
- __CLC_INTN k = (j >> 2) & 0x3;
- __CLC_INTN c1 = k == 1;
- __CLC_INTN c2 = k == 2;
- __CLC_INTN c3 = k == 3;
-
- __CLC_UINTN u0, u1, u2, u3, u4, u5, u6;
-
- u0 = c1 ? q0s1 : q0s0;
- u0 = c2 ? q1s0 : u0;
- u0 = c3 ? q1s1 : u0;
-
- u1 = c1 ? q1s0 : q0s1;
- u1 = c2 ? q1s1 : u1;
- u1 = c3 ? q2s0 : u1;
-
- u2 = c1 ? q1s1 : q1s0;
- u2 = c2 ? q2s0 : u2;
- u2 = c3 ? q2s1 : u2;
-
- u3 = c1 ? q2s0 : q1s1;
- u3 = c2 ? q2s1 : u3;
- u3 = c3 ? q3s0 : u3;
-
- u4 = c1 ? q2s1 : q2s0;
- u4 = c2 ? q3s0 : u4;
- u4 = c3 ? q3s1 : u4;
-
- u5 = c1 ? q3s0 : q2s1;
- u5 = c2 ? q3s1 : u5;
- u5 = c3 ? q4s0 : u5;
-
- u6 = c1 ? q3s1 : q3s0;
- u6 = c2 ? q4s0 : u6;
- u6 = c3 ? q4s1 : u6;
-
- __CLC_UINTN v0 = bytealign(u1, u0, j);
- __CLC_UINTN v1 = bytealign(u2, u1, j);
- __CLC_UINTN v2 = bytealign(u3, u2, j);
- __CLC_UINTN v3 = bytealign(u4, u3, j);
- __CLC_UINTN v4 = bytealign(u5, u4, j);
- __CLC_UINTN v5 = bytealign(u6, u5, j);
-
- // Place those 192 bits in 4 48-bit doubles along with correct exponent
- // If i > 1018 we would get subnormals so we scale p up and x down to get the
- // same product
- i = 2 + 8 * i;
- x *= __CLC_CONVERT_BIT_INTN(i > 1018) ? 0x1.0p-136 : 1.0;
- i -= i > 1018 ? 136 : 0;
-
-#define doublen_lohi(x, y) \
- __CLC_AS_DOUBLEN(__CLC_CONVERT_ULONGN((x)) & 0xFFFFFFFF | \
- __CLC_CONVERT_ULONGN((y)) << 32)
-
- __CLC_UINTN ua = __CLC_CONVERT_UINTN(EXPBIAS_DP64 + EXPSHIFTBITS_DP64 - i)
- << 20;
- __CLC_DOUBLEN a = doublen_lohi((__CLC_ULONGN)0, ua);
- __CLC_DOUBLEN p0 = doublen_lohi(v0, ua | (v1 & 0xffffU)) - a;
- ua += 0x03000000U;
- a = doublen_lohi((__CLC_ULONGN)0, ua);
- __CLC_DOUBLEN p1 =
- doublen_lohi(((v2 << 16) | (v1 >> 16)), (ua | (v2 >> 16))) - a;
- ua += 0x03000000U;
- a = doublen_lohi((__CLC_ULONGN)0, ua);
- __CLC_DOUBLEN p2 = doublen_lohi(v3, (ua | (v4 & 0xffffU))) - a;
- ua += 0x03000000U;
- a = doublen_lohi((__CLC_ULONGN)0, ua);
- __CLC_DOUBLEN p3 =
- doublen_lohi(((v5 << 16) | (v4 >> 16)), (ua | (v5 >> 16))) - a;
-
-#undef doublen_lohi
+_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_remainder_piby2_large(
+ __CLC_DOUBLEN x, private __CLC_DOUBLEN *r, private __CLC_DOUBLEN *rr) {
+ // Scale x by relevant part of 2/pi
+ __CLC_DOUBLEN p0 = __clc_get_twobypi_bits(x, 2);
+ __CLC_DOUBLEN p1 = __clc_get_twobypi_bits(x, 1);
+ __CLC_DOUBLEN p2 = __clc_get_twobypi_bits(x, 0);
+
+ x = __clc_ldexp(x, __CLC_CONVERT_INTN(x >= 0x1.0p+945) ? -128 : 0);
// Exact multiply
- __CLC_DOUBLEN f0h = p0 * x;
- __CLC_DOUBLEN f0l = __clc_fma(p0, x, -f0h);
- __CLC_DOUBLEN f1h = p1 * x;
- __CLC_DOUBLEN f1l = __clc_fma(p1, x, -f1h);
- __CLC_DOUBLEN f2h = p2 * x;
- __CLC_DOUBLEN f2l = __clc_fma(p2, x, -f2h);
- __CLC_DOUBLEN f3h = p3 * x;
- __CLC_DOUBLEN f3l = __clc_fma(p3, x, -f3h);
-
- // Accumulate product into 4 doubles
- __CLC_DOUBLEN s, t;
-
- __CLC_DOUBLEN f3 = f3h + f2h;
- t = f2h - (f3 - f3h);
- s = f3l + t;
- t = t - (s - f3l);
-
- __CLC_DOUBLEN f2 = s + f1h;
- t = f1h - (f2 - s) + t;
- s = f2l + t;
- t = t - (s - f2l);
-
- __CLC_DOUBLEN f1 = s + f0h;
- t = f0h - (f1 - s) + t;
- s = f1l + t;
-
- __CLC_DOUBLEN f0 = s + f0l;
-
- // Strip off unwanted large integer bits
- f3 = 0x1.0p+10 * __clc_fract(f3 * 0x1.0p-10, &fract_temp);
- f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
+ __CLC_EP_PAIR rp0 = __clc_ep_mul(p0, x);
+ __CLC_EP_PAIR rp1 = __clc_ep_mul(p1, x);
+ __CLC_EP_PAIR rp2 = __clc_ep_mul(p2, x);
+
+ // Accumulate product into 3 doubles
+ __CLC_EP_PAIR v2 = __clc_ep_add(rp2.lo, rp1.hi);
+ __CLC_EP_PAIR v1 = __clc_ep_add(rp1.lo, rp0.hi);
+ __CLC_EP_PAIR w2 = __clc_ep_add(v2.lo, v1.hi);
+
+ __CLC_DOUBLEN e3 = rp2.hi;
+ __CLC_DOUBLEN e2 = v2.hi;
+ __CLC_DOUBLEN e1 = w2.hi;
+ __CLC_DOUBLEN e0 = w2.lo + v1.lo + rp0.lo;
+
+ __CLC_EP_PAIR e32 = __clc_ep_fast_add(e3, e2);
+ __CLC_EP_PAIR e21 = __clc_ep_fast_add(e32.lo, e1);
+ __CLC_EP_PAIR e10 = __clc_ep_fast_add(e21.lo, e0);
+
+ __CLC_DOUBLEN f2 = e32.hi;
+ __CLC_DOUBLEN f1 = e21.hi;
+ __CLC_DOUBLEN f0 = e10.hi;
+
+ __CLC_DOUBLEN f2_scale = __clc_ldexp(f2, -2);
// Compute least significant integer bits
- t = f3 + f2;
- __CLC_DOUBLEN di = t - __clc_fract(t, &fract_temp);
- i = __CLC_CONVERT_INTN(__CLC_CONVERT_FLOATN(di));
-
- // Shift out remaining integer part
- f3 -= di;
- s = f3 + f2;
- t = f2 - (s - f3);
- f3 = s;
- f2 = t;
- s = f2 + f1;
- t = f1 - (s - f2);
- f2 = s;
- f1 = t;
- f1 += f0;
-
- // Subtract 1 if fraction is >= 0.5, and update regn
- __CLC_INTN g = __CLC_CONVERT_INTN(f3 >= 0.5 ? 1L : 0L);
- i += g;
- f3 -= __CLC_CONVERT_DOUBLEN(__CLC_CONVERT_FLOATN(g));
-
- // Shift up bits
- s = f3 + f2;
- t = f2 - (s - f3);
- f3 = s;
- f2 = t + f1;
+ __CLC_DOUBLEN unused;
+ __CLC_DOUBLEN f2_frac = __clc_fract(f2_scale, &unused);
+
+ f2 = __clc_ldexp(f2_frac, 2);
+
+ f2 += (f2 + f1) < 0.0 ? 4.0 : 0.0;
+
+ __CLC_INTN i = __CLC_CONVERT_INTN(f2 + f1);
+ f2 -= __CLC_CONVERT_DOUBLEN(i);
+
+ __CLC_EP_PAIR f21 = __clc_ep_fast_add(f2, f1);
+ __CLC_EP_PAIR f10 = __clc_ep_fast_add(f21.lo, f0);
+ f2 = f21.hi;
+ f1 = f10.hi;
+ f0 = f10.lo;
+
+ __CLC_S_GENTYPE g = f2 >= __CLC_FP_LIT(0.5);
+ i += __CLC_CONVERT_INTN(g) ? 1 : 0;
+ f2 -= g ? __CLC_FP_LIT(1.0) : __CLC_FP_LIT(0.0);
+
+ __CLC_EP_PAIR rf = __clc_ep_fast_add(f2, f1);
+ f2 = rf.hi;
+ f1 = rf.lo;
// Multiply precise fraction by pi/2 to get radians
- const __CLC_DOUBLEN p2h = 7074237752028440.0 / 0x1.0p+52;
- const __CLC_DOUBLEN p2t = 4967757600021510.0 / 0x1.0p+106;
+ const __CLC_DOUBLEN p2h = 0x1.921fb54442d18p+0;
+ const __CLC_DOUBLEN p2t = 0x1.1a62633145c07p-54;
+
+ __CLC_DOUBLEN rh = f2 * p2h;
+ __CLC_DOUBLEN rt =
+ __clc_fma(f1, p2h, __clc_fma(f2, p2t, __clc_fma(f2, p2h, -rh)));
- __CLC_DOUBLEN rhi = f3 * p2h;
- __CLC_DOUBLEN rlo =
- __clc_fma(f2, p2h, __clc_fma(f3, p2t, __clc_fma(f3, p2h, -rhi)));
+ __CLC_EP_PAIR result = __clc_ep_fast_add(rh, rt);
+ *r = result.lo;
+ *rr = result.hi;
+
+ return i & 0x3;
+}
- *r = rhi + rlo;
- *rr = rlo - (*r - rhi);
- *regn = i & 0x3;
+_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionS(
+ private __CLC_DOUBLEN *r_lo, private __CLC_DOUBLEN *r_hi, __CLC_DOUBLEN x) {
+ __CLC_LONGN is_large = x >= (__CLC_DOUBLEN)0x1.0p+30;
+
+#ifdef __CLC_SCALAR1
+ if (is_large)
+ return __clc_remainder_piby2_large(x, r_lo, r_hi);
+ return __clc_remainder_piby2_small(x, r_lo, r_hi);
+#else
+ __CLC_DOUBLEN rlo_1, rlo_2;
+ __CLC_DOUBLEN rhi_1, rhi_2;
+ __CLC_INTN ret1 = __clc_remainder_piby2_small(x, &rlo_1, &rhi_1);
+ __CLC_INTN ret2 = __clc_remainder_piby2_large(x, &rlo_2, &rhi_2);
+ *r_lo = is_large ? rlo_2 : rlo_1;
+ *r_hi = is_large ? rhi_2 : rhi_1;
+ return __CLC_CONVERT_INTN(is_large) ? ret2 : ret1;
+#endif
}
diff --git a/libclc/clc/lib/generic/math/clc_tables.cl b/libclc/clc/lib/generic/math/clc_tables.cl
index 43fa7387431de..4e76724c877bf 100644
--- a/libclc/clc/lib/generic/math/clc_tables.cl
+++ b/libclc/clc/lib/generic/math/clc_tables.cl
@@ -1648,64 +1648,61 @@ __CLC_TABLE_FUNCTION_VEC(double, SINH_TBL_TAIL, sinh_tbl_tail);
__CLC_TABLE_FUNCTION_VEC(double, COSH_TBL_HEAD, cosh_tbl_head);
__CLC_TABLE_FUNCTION_VEC(double, COSH_TBL_TAIL, cosh_tbl_tail);
-__CLC_DECLARE_TABLE(uchar, PIBITS_TBL, ) = {
- 224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175, 169,
- 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31, 235, 158, 212,
- 181, 168, 127, 121, 154, 253, 24, 61, 221, 38, 44, 159, 60, 251, 217,
- 180, 125, 180, 41, 104, 45, 70, 188, 188, 63, 96, 22, 120, 255, 95,
- 226, 127, 236, 160, 228, 247, 46, 126, 17, 114, 210, 231, 76, 13, 230,
- 88, 71, 230, 4, 249, 125, 209, 154, 192, 113, 166, 19, 18, 237, 186,
- 212, 215, 8, 162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70,
- 39, 168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21,
- 239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109, 3,
- 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13, 230, 139,
- 2, 0, 0, 0, 0, 0, 0, 0};
-
-_CLC_DEF _CLC_OVERLOAD ulong __CLC_TABLE_MANGLE(pibits_tbl)(int idx) {
- return *(__constant ulong *)(PIBITS_TBL + idx);
+__CLC_DECLARE_TABLE(uint, TWO_BY_PI_BITS_TABLE, 37) = {
+ 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
+ 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
+ 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
+ 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
+ 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
+ 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
+ 0x56033046};
+
+_CLC_DEF _CLC_OVERLOAD uint __CLC_TABLE_MANGLE(two_by_pi_bits_tbl)(int idx) {
+ return *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx);
}
-_CLC_DEF _CLC_OVERLOAD ulong2 __CLC_TABLE_MANGLE(pibits_tbl)(int2 idx) {
- return (ulong2){*(__constant ulong *)(PIBITS_TBL + idx.s0),
- *(__constant ulong *)(PIBITS_TBL + idx.s1)};
+_CLC_DEF _CLC_OVERLOAD uint2 __CLC_TABLE_MANGLE(two_by_pi_bits_tbl)(int2 idx) {
+ return (uint2){*(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s0),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s1)};
}
-_CLC_DEF _CLC_OVERLOAD ulong3 __CLC_TABLE_MANGLE(pibits_tbl)(int3 idx) {
- return (ulong3){*(__constant ulong *)(PIBITS_TBL + idx.s0),
- *(__constant ulong *)(PIBITS_TBL + idx.s1),
- *(__constant ulong *)(PIBITS_TBL + idx.s2)};
+_CLC_DEF _CLC_OVERLOAD uint3 __CLC_TABLE_MANGLE(two_by_pi_bits_tbl)(int3 idx) {
+ return (uint3){*(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s0),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s1),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s2)};
}
-_CLC_DEF _CLC_OVERLOAD ulong4 __CLC_TABLE_MANGLE(pibits_tbl)(int4 idx) {
- return (ulong4){*(__constant ulong *)(PIBITS_TBL + idx.s0),
- *(__constant ulong *)(PIBITS_TBL + idx.s1),
- *(__constant ulong *)(PIBITS_TBL + idx.s2),
- *(__constant ulong *)(PIBITS_TBL + idx.s3)};
+_CLC_DEF _CLC_OVERLOAD uint4 __CLC_TABLE_MANGLE(two_by_pi_bits_tbl)(int4 idx) {
+ return (uint4){*(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s0),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s1),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s2),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s3)};
}
-_CLC_DEF _CLC_OVERLOAD ulong8 __CLC_TABLE_MANGLE(pibits_tbl)(int8 idx) {
- return (ulong8){*(__constant ulong *)(PIBITS_TBL + idx.s0),
- *(__constant ulong *)(PIBITS_TBL + idx.s1),
- *(__constant ulong *)(PIBITS_TBL + idx.s2),
- *(__constant ulong *)(PIBITS_TBL + idx.s3),
- *(__constant ulong *)(PIBITS_TBL + idx.s4),
- *(__constant ulong *)(PIBITS_TBL + idx.s5),
- *(__constant ulong *)(PIBITS_TBL + idx.s6),
- *(__constant ulong *)(PIBITS_TBL + idx.s7)};
+_CLC_DEF _CLC_OVERLOAD uint8 __CLC_TABLE_MANGLE(two_by_pi_bits_tbl)(int8 idx) {
+ return (uint8){*(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s0),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s1),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s2),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s3),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s4),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s5),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s6),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s7)};
}
-_CLC_DEF _CLC_OVERLOAD ulong16 __CLC_TABLE_MANGLE(pibits_tbl)(int16 idx) {
- return (ulong16){*(__constant ulong *)(PIBITS_TBL + idx.s0),
- *(__constant ulong *)(PIBITS_TBL + idx.s1),
- *(__constant ulong *)(PIBITS_TBL + idx.s2),
- *(__constant ulong *)(PIBITS_TBL + idx.s3),
- *(__constant ulong *)(PIBITS_TBL + idx.s4),
- *(__constant ulong *)(PIBITS_TBL + idx.s5),
- *(__constant ulong *)(PIBITS_TBL + idx.s6),
- *(__constant ulong *)(PIBITS_TBL + idx.s7),
- *(__constant ulong *)(PIBITS_TBL + idx.s8),
- *(__constant ulong *)(PIBITS_TBL + idx.s9),
- *(__constant ulong *)(PIBITS_TBL + idx.sA),
- *(__constant ulong *)(PIBITS_TBL + idx.sB),
- *(__constant ulong *)(PIBITS_TBL + idx.sC),
- *(__constant ulong *)(PIBITS_TBL + idx.sD),
- *(__constant ulong *)(PIBITS_TBL + idx.sE),
- *(__constant ulong *)(PIBITS_TBL + idx.sF)};
+_CLC_DEF _CLC_OVERLOAD uint16
+__CLC_TABLE_MANGLE(two_by_pi_bits_tbl)(int16 idx) {
+ return (uint16){*(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s0),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s1),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s2),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s3),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s4),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s5),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s6),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s7),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s8),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.s9),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.sA),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.sB),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.sC),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.sD),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.sE),
+ *(__constant uint *)(TWO_BY_PI_BITS_TABLE + idx.sF)};
}
#endif // cl_khr_fp64
diff --git a/libclc/clc/lib/generic/math/clc_tan.inc b/libclc/clc/lib/generic/math/clc_tan.inc
index 261e585d5d56d..e4180d1047651 100644
--- a/libclc/clc/lib/generic/math/clc_tan.inc
+++ b/libclc/clc/lib/generic/math/clc_tan.inc
@@ -42,32 +42,20 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tan(__CLC_GENTYPE x) {
#elif __CLC_FPSIZE == 64
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tan(__CLC_GENTYPE x) {
- x = __clc_select(x, __CLC_GENTYPE_NAN,
- __CLC_CONVERT_S_GENTYPE(__clc_isinf(x)));
-
- __CLC_GENTYPE y = __clc_fabs(x);
-
- __CLC_BIT_INTN is_medium = y < 0x1.0p+30;
-
- __CLC_INTN regn_m, regn_l;
- __CLC_GENTYPE r_m, r_l, rr_m, rr_l;
-
- __clc_remainder_piby2_medium(y, &r_m, &rr_m, ®n_m);
- __clc_remainder_piby2_large(y, &r_l, &rr_l, ®n_l);
+_CLC_DEF _CLC_OVERLOAD __CLC_DOUBLEN __clc_tan(__CLC_DOUBLEN x) {
+ x = __clc_select(x, __CLC_GENTYPE_NAN, __CLC_CONVERT_LONGN(__clc_isinf(x)));
- __CLC_GENTYPE r = is_medium ? r_m : r_l;
- __CLC_GENTYPE rr = is_medium ? rr_m : rr_l;
- __CLC_INTN regn = __CLC_CONVERT_INTN(is_medium) ? regn_m : regn_l;
+ __CLC_DOUBLEN absx = __clc_fabs(x);
- __CLC_GENTYPE lead, tail;
- __clc_tan_piby4(r, rr, &lead, &tail);
+ __CLC_DOUBLEN reduced_lo, reduced_hi;
+ __CLC_INTN regn = __clc_argReductionS(&reduced_lo, &reduced_hi, absx);
- __CLC_LONGN t =
- __CLC_AS_LONGN(__CLC_CONVERT_BIT_INTN((regn & 1) != 0) ? tail : lead);
- t ^= __CLC_CONVERT_BIT_INTN(x < 0.0) << 63;
+ __CLC_DOUBLEN t =
+ __clc_tan_reduced_eval(reduced_hi, reduced_lo, (regn & 1) != 0);
- return __CLC_AS_GENTYPE(t);
+ t = __CLC_AS_DOUBLEN(__CLC_AS_LONGN(t) ^
+ (__CLC_AS_LONGN(x) & (__CLC_LONGN)SIGNBIT_DP64));
+ return __CLC_AS_DOUBLEN(t);
}
#elif __CLC_FPSIZE == 16
More information about the cfe-commits
mailing list