[libclc] [libclc] Move sin, cos & sincos to CLC library (PR #139527)

Mon May 12 03:17:27 PDT 2025

https://github.com/frasercrmck updated https://github.com/llvm/llvm-project/pull/139527

>From 875c5fa90fff79456da734ae4be1e4590df6d127 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser at codeplay.com>
Date: Thu, 1 May 2025 17:24:58 +0100
Subject: [PATCH 1/3] [libclc] Move sin, cos & sincos to CLC library

This commit also vectorizes these builtins along the way.
---
 libclc/clc/include/clc/math/clc_cos.h         |  19 ++
 libclc/clc/include/clc/math/clc_sin.h         |  19 ++
 libclc/clc/include/clc/math/clc_sincos.h      |  19 ++
 .../clc/include/clc/math/clc_sincos_helpers.h |   7 +
 .../clc/math/clc_sincos_helpers_fp64.inc      |  17 ++
 libclc/clc/include/clc/math/tables.h          |   2 +-
 libclc/clc/lib/generic/SOURCES                |   3 +
 libclc/clc/lib/generic/math/clc_cos.cl        |  21 ++
 libclc/clc/lib/generic/math/clc_cos.inc       |  63 ++++
 libclc/clc/lib/generic/math/clc_sin.cl        |  25 ++
 libclc/clc/lib/generic/math/clc_sin.inc       |  68 +++++
 libclc/clc/lib/generic/math/clc_sincos.cl     |  14 +
 .../lib/generic/math/clc_sincos.inc}          |   8 +-
 .../lib/generic/math/clc_sincos_helpers.cl    |  24 ++
 .../generic/math/clc_sincos_helpers_fp64.inc  | 234 ++++++++++++++
 libclc/clc/lib/generic/math/clc_tables.cl     |  62 ++++
 libclc/clspv/lib/SOURCES                      |   2 -
 libclc/generic/include/clc/math/sincos.h      |   4 +-
 libclc/generic/include/clc/math/sincos.inc    |  14 -
 libclc/generic/lib/SOURCES                    |   2 -
 libclc/generic/lib/math/clc_tan.cl            |   1 -
 libclc/generic/lib/math/cos.cl                |  42 +--
 libclc/generic/lib/math/cos.inc               |  34 ---
 libclc/generic/lib/math/sin.cl                |  41 +--
 libclc/generic/lib/math/sin.inc               |  38 ---
 libclc/generic/lib/math/sincos.cl             |   4 +-
 libclc/generic/lib/math/sincos_helpers.cl     | 285 ------------------
 libclc/generic/lib/math/sincos_helpers.h      |  24 --
 libclc/generic/lib/math/tables.cl             |  30 --
 libclc/spirv/lib/SOURCES                      |   2 -
 30 files changed, 612 insertions(+), 516 deletions(-)
 create mode 100644 libclc/clc/include/clc/math/clc_cos.h
 create mode 100644 libclc/clc/include/clc/math/clc_sin.h
 create mode 100644 libclc/clc/include/clc/math/clc_sincos.h
 create mode 100644 libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc
 create mode 100644 libclc/clc/lib/generic/math/clc_cos.cl
 create mode 100644 libclc/clc/lib/generic/math/clc_cos.inc
 create mode 100644 libclc/clc/lib/generic/math/clc_sin.cl
 create mode 100644 libclc/clc/lib/generic/math/clc_sin.inc
 create mode 100644 libclc/clc/lib/generic/math/clc_sincos.cl
 rename libclc/{generic/lib/math/sincos.inc => clc/lib/generic/math/clc_sincos.inc} (62%)
 create mode 100644 libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
 delete mode 100644 libclc/generic/include/clc/math/sincos.inc
 delete mode 100644 libclc/generic/lib/math/cos.inc
 delete mode 100644 libclc/generic/lib/math/sin.inc
 delete mode 100644 libclc/generic/lib/math/sincos_helpers.cl
 delete mode 100644 libclc/generic/lib/math/sincos_helpers.h
 delete mode 100644 libclc/generic/lib/math/tables.cl

diff --git a/libclc/clc/include/clc/math/clc_cos.h b/libclc/clc/include/clc/math/clc_cos.h
new file mode 100644
index 0000000000000..44681608efc37
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_cos.h
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_COS_H__
+#define __CLC_MATH_CLC_COS_H__
+
+#define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION __clc_cos
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_COS_H__
diff --git a/libclc/clc/include/clc/math/clc_sin.h b/libclc/clc/include/clc/math/clc_sin.h
new file mode 100644
index 0000000000000..de4c722ca123f
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_sin.h
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_SIN_H__
+#define __CLC_MATH_CLC_SIN_H__
+
+#define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION __clc_sin
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_SIN_H__
diff --git a/libclc/clc/include/clc/math/clc_sincos.h b/libclc/clc/include/clc/math/clc_sincos.h
new file mode 100644
index 0000000000000..e26dc7c024c9c
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_sincos.h
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_SINCOS_H__
+#define __CLC_MATH_CLC_SINCOS_H__
+
+#define __CLC_BODY <clc/math/unary_decl_with_ptr.inc>
+#define __CLC_FUNCTION __clc_sincos
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_SINCOS_H__
diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers.h b/libclc/clc/include/clc/math/clc_sincos_helpers.h
index 8029d439f4c62..e3a2e1c0b605c 100644
--- a/libclc/clc/include/clc/math/clc_sincos_helpers.h
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers.h
@@ -16,4 +16,11 @@
 
 #undef __FLOAT_ONLY
 
+#define __DOUBLE_ONLY
+#define __CLC_BODY <clc/math/clc_sincos_helpers_fp64.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __DOUBLE_ONLY
+
 #endif // __CLC_MATH_CLC_SINCOS_HELPERS_H__
diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc
new file mode 100644
index 0000000000000..09c6e1c965f64
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc
@@ -0,0 +1,17 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+_CLC_DECL _CLC_OVERLOAD void
+__clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
+                             private __CLC_DOUBLEN *rr,
+                             private __CLC_INTN *regn);
+
+_CLC_DECL _CLC_OVERLOAD void
+__clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
+                            private __CLC_DOUBLEN *rr,
+                            private __CLC_INTN *regn);
diff --git a/libclc/clc/include/clc/math/tables.h b/libclc/clc/include/clc/math/tables.h
index fb172b0b8f221..0fec778b53679 100644
--- a/libclc/clc/include/clc/math/tables.h
+++ b/libclc/clc/include/clc/math/tables.h
@@ -61,7 +61,6 @@
 
 TABLE_FUNCTION_DECL(float2, log2_tbl);
 TABLE_FUNCTION_DECL(float2, log10_tbl);
-TABLE_FUNCTION_DECL(uint4, pibits_tbl);
 
 CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl_ep_head);
 CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl_ep_tail);
@@ -75,6 +74,7 @@ CLC_TABLE_FUNCTION_DECL(float, cbrt_tbl_head);
 CLC_TABLE_FUNCTION_DECL(float, cbrt_tbl_tail);
 CLC_TABLE_FUNCTION_DECL(float, sinhcosh_tbl_head);
 CLC_TABLE_FUNCTION_DECL(float, sinhcosh_tbl_tail);
+CLC_TABLE_FUNCTION_DECL(ulong, pibits_tbl);
 
 #ifdef cl_khr_fp64
 
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index 4240e7b08e7d1..4d66c749fc53e 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -32,6 +32,7 @@ math/clc_atanpi.cl
 math/clc_cbrt.cl
 math/clc_ceil.cl
 math/clc_copysign.cl
+math/clc_cos.cl
 math/clc_cosh.cl
 math/clc_cospi.cl
 math/clc_ep_log.cl
@@ -86,6 +87,8 @@ math/clc_rint.cl
 math/clc_rootn.cl
 math/clc_round.cl
 math/clc_rsqrt.cl
+math/clc_sin.cl
+math/clc_sincos.cl
 math/clc_sincos_helpers.cl
 math/clc_sinh.cl
 math/clc_sinpi.cl
diff --git a/libclc/clc/lib/generic/math/clc_cos.cl b/libclc/clc/lib/generic/math/clc_cos.cl
new file mode 100644
index 0000000000000..0c9dc287aa0b4
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_cos.cl
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/clc_convert.h>
+#include <clc/clcmacro.h>
+#include <clc/float/definitions.h>
+#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_sincos_helpers.h>
+#include <clc/math/clc_sincos_piby4.h>
+#include <clc/math/math.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
+#include <clc/relational/clc_select.h>
+
+#define __CLC_BODY <clc_cos.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/math/clc_cos.inc b/libclc/clc/lib/generic/math/clc_cos.inc
new file mode 100644
index 0000000000000..4b8108c086090
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_cos.inc
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __CLC_FPSIZE == 32
+
+_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN __clc_cos(__CLC_FLOATN x) {
+  __CLC_FLOATN absx = __clc_fabs(x);
+
+  __CLC_FLOATN r0, r1;
+  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
+
+  __CLC_FLOATN ss = -__clc_sinf_piby4(r0, r1);
+  __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
+
+  __CLC_FLOATN c = (regn & 1) != 0 ? ss : cc;
+  c = __CLC_AS_FLOATN(__CLC_AS_INTN(c) ^ ((regn > 1) << 31));
+
+  c = __clc_select(c, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x));
+
+  return c;
+}
+
+#elif __CLC_FPSIZE == 16
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cos(__CLC_GENTYPE x) {
+  return __CLC_CONVERT_GENTYPE(__clc_cos(__CLC_CONVERT_FLOATN(x)));
+}
+
+#elif __CLC_FPSIZE == 64
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cos(__CLC_GENTYPE x) {
+  __CLC_GENTYPE absx = __clc_fabs(x);
+
+  __CLC_BIT_INTN is_medium = absx < 0x1.0p+47;
+
+  __CLC_INTN regn_m, regn_l;
+  __CLC_GENTYPE r_m, r_l, rr_m, rr_l;
+
+  __clc_remainder_piby2_medium(absx, &r_m, &rr_m, &regn_m);
+  __clc_remainder_piby2_large(absx, &r_l, &rr_l, &regn_l);
+
+  __CLC_GENTYPE r = is_medium ? r_m : r_l;
+  __CLC_GENTYPE rr = is_medium ? rr_m : rr_l;
+  __CLC_INTN regn = __CLC_CONVERT_INTN(is_medium) ? regn_m : regn_l;
+
+  __CLC_GENTYPE sinval, cosval;
+  __clc_sincos_piby4(r, rr, &sinval, &cosval);
+  sinval = -sinval;
+
+  __CLC_LONGN c =
+      __CLC_AS_LONGN(__CLC_CONVERT_BIT_INTN((regn & 1) != 0) ? sinval : cosval);
+  c ^= __CLC_CONVERT_BIT_INTN(regn > 1) << 63;
+
+  return __clc_isnan(absx) | __clc_isinf(absx) ? __CLC_GENTYPE_NAN
+                                               : __CLC_AS_GENTYPE(c);
+}
+
+#endif
diff --git a/libclc/clc/lib/generic/math/clc_sin.cl b/libclc/clc/lib/generic/math/clc_sin.cl
new file mode 100644
index 0000000000000..0ff9739c6a846
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_sin.cl
@@ -0,0 +1,25 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/clc_convert.h>
+#include <clc/clcmacro.h>
+#include <clc/float/definitions.h>
+#include <clc/internal/clc.h>
+#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_sincos_helpers.h>
+#include <clc/math/clc_sincos_piby4.h>
+#include <clc/math/clc_trunc.h>
+#include <clc/math/math.h>
+#include <clc/math/tables.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
+#include <clc/relational/clc_select.h>
+#include <clc/shared/clc_max.h>
+
+#define __CLC_BODY <clc_sin.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/math/clc_sin.inc b/libclc/clc/lib/generic/math/clc_sin.inc
new file mode 100644
index 0000000000000..b4f72eb625eb0
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_sin.inc
@@ -0,0 +1,68 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __CLC_FPSIZE == 32
+
+_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN __clc_sin(__CLC_FLOATN x) {
+  __CLC_FLOATN absx = __clc_fabs(x);
+
+  __CLC_FLOATN r0, r1;
+  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
+
+  __CLC_FLOATN ss = __clc_sinf_piby4(r0, r1);
+  __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
+
+  __CLC_FLOATN s = (regn & 1) != 0 ? cc : ss;
+  s = __CLC_AS_FLOATN(__CLC_AS_INTN(s) ^ ((regn > 1) << 31) ^
+                      (__CLC_AS_INTN(x) ^ __CLC_AS_INTN(absx)));
+
+  s = __clc_select(s, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x));
+
+  // Subnormals
+  s = x == 0.0f ? x : s;
+
+  return s;
+}
+
+#elif __CLC_FPSIZE == 16
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sin(__CLC_GENTYPE x) {
+  return __CLC_CONVERT_GENTYPE(__clc_sin(__CLC_CONVERT_FLOATN(x)));
+}
+
+#elif __CLC_FPSIZE == 64
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sin(__CLC_GENTYPE x) {
+  __CLC_GENTYPE absx = __clc_fabs(x);
+
+  __CLC_BIT_INTN is_medium = absx < 0x1.0p+47;
+
+  __CLC_INTN regn_m, regn_l;
+  __CLC_GENTYPE r_m, r_l, rr_m, rr_l;
+
+  __clc_remainder_piby2_medium(absx, &r_m, &rr_m, &regn_m);
+  __clc_remainder_piby2_large(absx, &r_l, &rr_l, &regn_l);
+
+  __CLC_GENTYPE r = is_medium ? r_m : r_l;
+  __CLC_GENTYPE rr = is_medium ? rr_m : rr_l;
+  __CLC_INTN regn = __CLC_CONVERT_INTN(is_medium) ? regn_m : regn_l;
+
+  __CLC_GENTYPE sinval, cosval;
+  __clc_sincos_piby4(r, rr, &sinval, &cosval);
+
+  __CLC_LONGN s =
+      __CLC_AS_LONGN(__CLC_CONVERT_BIT_INTN((regn & 1) != 0) ? cosval : sinval);
+
+  s ^= (__CLC_CONVERT_BIT_INTN(regn > 1) << 63) ^
+       (__CLC_CONVERT_BIT_INTN(x < 0.0) << 63);
+
+  return __clc_isinf(x) | __clc_isnan(x) ? __CLC_GENTYPE_NAN
+                                         : __CLC_AS_GENTYPE(s);
+}
+
+#endif
diff --git a/libclc/clc/lib/generic/math/clc_sincos.cl b/libclc/clc/lib/generic/math/clc_sincos.cl
new file mode 100644
index 0000000000000..2209a41593a2d
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_sincos.cl
@@ -0,0 +1,14 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/internal/clc.h>
+#include <clc/math/clc_cos.h>
+#include <clc/math/clc_sin.h>
+
+#define __CLC_BODY <clc_sincos.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/sincos.inc b/libclc/clc/lib/generic/math/clc_sincos.inc
similarity index 62%
rename from libclc/generic/lib/math/sincos.inc
rename to libclc/clc/lib/generic/math/clc_sincos.inc
index b1d4f88c3d427..1e21b7549e448 100644
--- a/libclc/generic/lib/math/sincos.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos.inc
@@ -6,10 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define __CLC_DECLARE_SINCOS(ADDRSPACE, TYPE) \
-  _CLC_OVERLOAD _CLC_DEF TYPE sincos (TYPE x, ADDRSPACE TYPE * cosval) { \
-    *cosval = cos(x); \
-    return sin(x); \
+#define __CLC_DECLARE_SINCOS(ADDRSPACE, TYPE)                                  \
+  _CLC_OVERLOAD _CLC_DEF TYPE __clc_sincos(TYPE x, ADDRSPACE TYPE *cosval) {   \
+    *cosval = __clc_cos(x);                                                    \
+    return __clc_sin(x);                                                       \
   }
 
 __CLC_DECLARE_SINCOS(global, __CLC_GENTYPE)
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
index 24676d3c7711c..c1768e39243fa 100644
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
@@ -31,3 +31,27 @@
 #define __CLC_BODY <clc_sincos_helpers.inc>
 
 #include <clc/math/gentype.inc>
+
+#undef __FLOAT_ONLY
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+#include <clc/math/clc_fract.h>
+#include <clc/math/tables.h>
+#include <clc/shared/clc_max.h>
+
+#define bytealign(src0, src1, src2)                                            \
+  (__CLC_CONVERT_UINTN(                                                        \
+      ((__CLC_CONVERT_LONGN((src0)) << 32) | __CLC_CONVERT_LONGN((src1))) >>   \
+      (((src2) & 3) * 8)))
+
+#define __DOUBLE_ONLY
+#define __CLC_BODY <clc_sincos_helpers_fp64.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __DOUBLE_ONLY
+
+#endif
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
new file mode 100644
index 0000000000000..3a8edb378cae2
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
@@ -0,0 +1,234 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Reduction for medium sized arguments
+_CLC_DEF _CLC_OVERLOAD void
+__clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
+                             private __CLC_DOUBLEN *rr,
+                             private __CLC_INTN *regn) {
+  // How many pi/2 is x a multiple of?
+  const __CLC_DOUBLEN two_by_pi = 0x1.45f306dc9c883p-1;
+  __CLC_DOUBLEN dnpi2 = __clc_trunc(__clc_fma(x, two_by_pi, 0.5));
+
+  const __CLC_DOUBLEN piby2_h = -7074237752028440.0 / 0x1.0p+52;
+  const __CLC_DOUBLEN piby2_m = -2483878800010755.0 / 0x1.0p+105;
+  const __CLC_DOUBLEN piby2_t = -3956492004828932.0 / 0x1.0p+158;
+
+  // Compute product of dnpi2 with 159 bits of -pi/2
+  __CLC_DOUBLEN p_hh = piby2_h * dnpi2;
+  __CLC_DOUBLEN p_ht = __clc_fma(piby2_h, dnpi2, -p_hh);
+  __CLC_DOUBLEN p_mh = piby2_m * dnpi2;
+  __CLC_DOUBLEN p_mt = __clc_fma(piby2_m, dnpi2, -p_mh);
+  __CLC_DOUBLEN p_th = piby2_t * dnpi2;
+  __CLC_DOUBLEN p_tt = __clc_fma(piby2_t, dnpi2, -p_th);
+
+  // Reduce to 159 bits
+  __CLC_DOUBLEN ph = p_hh;
+  __CLC_DOUBLEN pm = p_ht + p_mh;
+  __CLC_DOUBLEN t = p_mh - (pm - p_ht);
+  __CLC_DOUBLEN pt = p_th + t + p_mt + p_tt;
+  t = ph + pm;
+  pm = pm - (t - ph);
+  ph = t;
+  t = pm + pt;
+  pt = pt - (t - pm);
+  pm = t;
+
+  // Subtract from x
+  t = x + ph;
+  __CLC_DOUBLEN qh = t + pm;
+  __CLC_DOUBLEN qt = pm - (qh - t) + pt;
+
+  *r = qh;
+  *rr = qt;
+  *regn = __CLC_CONVERT_INTN(__CLC_CONVERT_LONGN(dnpi2)) & 0x3;
+}
+
+// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
+// extra precision, and store the result through the pointers r and rr.
+// The output "regn" tells how many lots of pi/2 were subtracted
+// from x to put it in the range [-pi/4,pi/4], mod 4.
+_CLC_DEF _CLC_OVERLOAD void
+__clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
+                            private __CLC_DOUBLEN *rr,
+                            private __CLC_INTN *regn) {
+
+  __CLC_LONGN ux = __CLC_AS_LONGN(x);
+  __CLC_INTN e = __CLC_CONVERT_INTN(ux >> 52) - 1023;
+  __CLC_INTN i = __clc_max(23, (e >> 3) + 17);
+  __CLC_INTN j = 150 - i;
+  __CLC_INTN j16 = j & ~0xf;
+  __CLC_DOUBLEN fract_temp;
+
+  // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary
+  // byte boundary
+  __CLC_ULONGN q0 = USE_TABLE(pibits_tbl, j16);
+  __CLC_ULONGN q3 = USE_TABLE(pibits_tbl, (j16 + 8));
+  __CLC_ULONGN q1 = USE_TABLE(pibits_tbl, (j16 + 16));
+  __CLC_ULONGN q4 = USE_TABLE(pibits_tbl, (j16 + 24));
+  __CLC_ULONGN q2 = USE_TABLE(pibits_tbl, (j16 + 32));
+
+  __CLC_INTN k = (j >> 2) & 0x3;
+  __CLC_INTN c0 = k == 0;
+  __CLC_INTN c1 = k == 1;
+  __CLC_INTN c2 = k == 2;
+  __CLC_INTN c3 = k == 3;
+
+  __CLC_UINTN u0, u1, u2, u3, u4, u5, u6;
+
+  __CLC_UINTN q0s0 = __CLC_CONVERT_UINTN(q0);
+  __CLC_UINTN q0s1 = __CLC_CONVERT_UINTN(q0 >> 32);
+
+  __CLC_UINTN q1s0 = __CLC_CONVERT_UINTN(q1);
+  __CLC_UINTN q1s1 = __CLC_CONVERT_UINTN(q1 >> 32);
+
+  __CLC_UINTN q2s0 = __CLC_CONVERT_UINTN(q2);
+  __CLC_UINTN q2s1 = __CLC_CONVERT_UINTN(q2 >> 32);
+
+  __CLC_UINTN q3s0 = __CLC_CONVERT_UINTN(q3);
+  __CLC_UINTN q3s1 = __CLC_CONVERT_UINTN(q3 >> 32);
+
+  __CLC_UINTN q4s0 = __CLC_CONVERT_UINTN(q4);
+  __CLC_UINTN q4s1 = __CLC_CONVERT_UINTN(q4 >> 32);
+
+  u0 = c1 ? q0s1 : q0s0;
+  u0 = c2 ? q3s0 : u0;
+  u0 = c3 ? q3s1 : u0;
+
+  u1 = c1 ? q3s0 : q0s1;
+  u1 = c2 ? q3s1 : u1;
+  u1 = c3 ? q1s0 : u1;
+
+  u2 = c1 ? q3s1 : q3s0;
+  u2 = c2 ? q1s0 : u2;
+  u2 = c3 ? q1s1 : u2;
+
+  u3 = c1 ? q1s0 : q3s1;
+  u3 = c2 ? q1s1 : u3;
+  u3 = c3 ? q4s0 : u3;
+
+  u4 = c1 ? q1s1 : q1s0;
+  u4 = c2 ? q4s0 : u4;
+  u4 = c3 ? q4s1 : u4;
+
+  u5 = c1 ? q4s0 : q1s1;
+  u5 = c2 ? q4s1 : u5;
+  u5 = c3 ? q2s0 : u5;
+
+  u6 = c1 ? q4s1 : q4s0;
+  u6 = c2 ? q2s0 : u6;
+  u6 = c3 ? q2s1 : u6;
+
+  __CLC_UINTN v0 = bytealign(u1, u0, j);
+  __CLC_UINTN v1 = bytealign(u2, u1, j);
+  __CLC_UINTN v2 = bytealign(u3, u2, j);
+  __CLC_UINTN v3 = bytealign(u4, u3, j);
+  __CLC_UINTN v4 = bytealign(u5, u4, j);
+  __CLC_UINTN v5 = bytealign(u6, u5, j);
+
+  // Place those 192 bits in 4 48-bit doubles along with correct exponent
+  // If i > 1018 we would get subnormals so we scale p up and x down to get the
+  // same product
+  i = 2 + 8 * i;
+  x *= __CLC_CONVERT_BIT_INTN(i > 1018) ? 0x1.0p-136 : 1.0;
+  i -= i > 1018 ? 136 : 0;
+
+#define doublen_lohi(x, y)                                                     \
+  __CLC_AS_DOUBLEN(__CLC_CONVERT_ULONGN((x)) & 0xFFFFFFFF |                    \
+                   __CLC_CONVERT_ULONGN((y)) << 32)
+
+  __CLC_UINTN ua = __CLC_CONVERT_UINTN(1023 + 52 - i) << 20;
+  __CLC_DOUBLEN a = doublen_lohi((__CLC_ULONGN)0, ua);
+  __CLC_DOUBLEN p0 = doublen_lohi(v0, ua | (v1 & 0xffffU)) - a;
+  ua += 0x03000000U;
+  a = doublen_lohi((__CLC_ULONGN)0, ua);
+  __CLC_DOUBLEN p1 =
+      doublen_lohi(((v2 << 16) | (v1 >> 16)), (ua | (v2 >> 16))) - a;
+  ua += 0x03000000U;
+  a = doublen_lohi((__CLC_ULONGN)0, ua);
+  __CLC_DOUBLEN p2 = doublen_lohi(v3, (ua | (v4 & 0xffffU))) - a;
+  ua += 0x03000000U;
+  a = doublen_lohi((__CLC_ULONGN)0, ua);
+  __CLC_DOUBLEN p3 =
+      doublen_lohi(((v5 << 16) | (v4 >> 16)), (ua | (v5 >> 16))) - a;
+
+#undef doublen_lohi
+
+  // Exact multiply
+  __CLC_DOUBLEN f0h = p0 * x;
+  __CLC_DOUBLEN f0l = __clc_fma(p0, x, -f0h);
+  __CLC_DOUBLEN f1h = p1 * x;
+  __CLC_DOUBLEN f1l = __clc_fma(p1, x, -f1h);
+  __CLC_DOUBLEN f2h = p2 * x;
+  __CLC_DOUBLEN f2l = __clc_fma(p2, x, -f2h);
+  __CLC_DOUBLEN f3h = p3 * x;
+  __CLC_DOUBLEN f3l = __clc_fma(p3, x, -f3h);
+
+  // Accumulate product into 4 doubles
+  __CLC_DOUBLEN s, t;
+
+  __CLC_DOUBLEN f3 = f3h + f2h;
+  t = f2h - (f3 - f3h);
+  s = f3l + t;
+  t = t - (s - f3l);
+
+  __CLC_DOUBLEN f2 = s + f1h;
+  t = f1h - (f2 - s) + t;
+  s = f2l + t;
+  t = t - (s - f2l);
+
+  __CLC_DOUBLEN f1 = s + f0h;
+  t = f0h - (f1 - s) + t;
+  s = f1l + t;
+
+  __CLC_DOUBLEN f0 = s + f0l;
+
+  // Strip off unwanted large integer bits
+  f3 = 0x1.0p+10 * __clc_fract(f3 * 0x1.0p-10, &fract_temp);
+  f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
+
+  // Compute least significant integer bits
+  t = f3 + f2;
+  __CLC_DOUBLEN di = t - __clc_fract(t, &fract_temp);
+  i = __CLC_CONVERT_INTN(__CLC_CONVERT_FLOATN(di));
+
+  // Shift out remaining integer part
+  f3 -= di;
+  s = f3 + f2;
+  t = f2 - (s - f3);
+  f3 = s;
+  f2 = t;
+  s = f2 + f1;
+  t = f1 - (s - f2);
+  f2 = s;
+  f1 = t;
+  f1 += f0;
+
+  // Subtract 1 if fraction is >= 0.5, and update regn
+  __CLC_INTN g = __CLC_CONVERT_INTN(f3 >= 0.5 ? 1L : 0L);
+  i += g;
+  f3 -= __CLC_CONVERT_DOUBLEN(__CLC_CONVERT_FLOATN(g));
+
+  // Shift up bits
+  s = f3 + f2;
+  t = f2 - (s - f3);
+  f3 = s;
+  f2 = t + f1;
+
+  // Multiply precise fraction by pi/2 to get radians
+  const __CLC_DOUBLEN p2h = 7074237752028440.0 / 0x1.0p+52;
+  const __CLC_DOUBLEN p2t = 4967757600021510.0 / 0x1.0p+106;
+
+  __CLC_DOUBLEN rhi = f3 * p2h;
+  __CLC_DOUBLEN rlo =
+      __clc_fma(f2, p2h, __clc_fma(f3, p2t, __clc_fma(f3, p2h, -rhi)));
+
+  *r = rhi + rlo;
+  *rr = rlo - (*r - rhi);
+  *regn = i & 0x3;
+}
diff --git a/libclc/clc/lib/generic/math/clc_tables.cl b/libclc/clc/lib/generic/math/clc_tables.cl
index 7ee744589ae1d..05053aea7bc12 100644
--- a/libclc/clc/lib/generic/math/clc_tables.cl
+++ b/libclc/clc/lib/generic/math/clc_tables.cl
@@ -1648,4 +1648,66 @@ CLC_TABLE_FUNCTION(double, SINH_TBL_TAIL, sinh_tbl_tail);
 CLC_TABLE_FUNCTION(double, COSH_TBL_HEAD, cosh_tbl_head);
 CLC_TABLE_FUNCTION(double, COSH_TBL_TAIL, cosh_tbl_tail);
 
+DECLARE_TABLE(uchar, PIBITS_TBL, ) = {
+    224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175,
+    169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31,
+    235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38,
+    44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188,
+    188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247,
+    46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249,
+    125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8,
+    162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39,
+    168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21,
+    239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109,
+    3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13,
+    230, 139, 2, 0, 0, 0, 0, 0, 0, 0
+};
+
+_CLC_DEF _CLC_OVERLOAD ulong TABLE_MANGLE(pibits_tbl)(int idx) {
+  return *(__constant ulong *)(PIBITS_TBL + idx);
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 TABLE_MANGLE(pibits_tbl)(int2 idx) {
+  return (ulong2){*(__constant ulong *)(PIBITS_TBL + idx.s0),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s1)};
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 TABLE_MANGLE(pibits_tbl)(int3 idx) {
+  return (ulong3){*(__constant ulong *)(PIBITS_TBL + idx.s0),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s1),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s2)};
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 TABLE_MANGLE(pibits_tbl)(int4 idx) {
+  return (ulong4){*(__constant ulong *)(PIBITS_TBL + idx.s0),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s1),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s2),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s3)};
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 TABLE_MANGLE(pibits_tbl)(int8 idx) {
+  return (ulong8){*(__constant ulong *)(PIBITS_TBL + idx.s0),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s1),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s2),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s3),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s4),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s5),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s6),
+                  *(__constant ulong *)(PIBITS_TBL + idx.s7)};
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 TABLE_MANGLE(pibits_tbl)(int16 idx) {
+  return (ulong16){*(__constant ulong *)(PIBITS_TBL + idx.s0),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s1),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s2),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s3),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s4),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s5),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s6),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s7),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s8),
+                   *(__constant ulong *)(PIBITS_TBL + idx.s9),
+                   *(__constant ulong *)(PIBITS_TBL + idx.sA),
+                   *(__constant ulong *)(PIBITS_TBL + idx.sB),
+                   *(__constant ulong *)(PIBITS_TBL + idx.sC),
+                   *(__constant ulong *)(PIBITS_TBL + idx.sD),
+                   *(__constant ulong *)(PIBITS_TBL + idx.sE),
+                   *(__constant ulong *)(PIBITS_TBL + idx.sF)};
+}
+
 #endif // cl_khr_fp64
diff --git a/libclc/clspv/lib/SOURCES b/libclc/clspv/lib/SOURCES
index d2fea9d586287..437210d1922da 100644
--- a/libclc/clspv/lib/SOURCES
+++ b/libclc/clspv/lib/SOURCES
@@ -66,10 +66,8 @@ subnormal_config.cl
 ../../generic/lib/math/rootn.cl
 ../../generic/lib/math/sin.cl
 ../../generic/lib/math/sincos.cl
-../../generic/lib/math/sincos_helpers.cl
 ../../generic/lib/math/sinh.cl
 ../../generic/lib/math/sinpi.cl
-../../generic/lib/math/tables.cl
 ../../generic/lib/math/tan.cl
 ../../generic/lib/math/tanh.cl
 ../../generic/lib/math/tanpi.cl
diff --git a/libclc/generic/include/clc/math/sincos.h b/libclc/generic/include/clc/math/sincos.h
index fbd29c441e319..7426c23e88af9 100644
--- a/libclc/generic/include/clc/math/sincos.h
+++ b/libclc/generic/include/clc/math/sincos.h
@@ -6,5 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define __CLC_BODY <clc/math/sincos.inc>
+#define __CLC_BODY <clc/math/unary_decl_with_ptr.inc>
+#define __CLC_FUNCTION sincos
 #include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
diff --git a/libclc/generic/include/clc/math/sincos.inc b/libclc/generic/include/clc/math/sincos.inc
deleted file mode 100644
index d6ec2fe94704e..0000000000000
--- a/libclc/generic/include/clc/math/sincos.inc
+++ /dev/null
@@ -1,14 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE sincos(__CLC_GENTYPE x,
-                                             global __CLC_GENTYPE *cosval);
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE sincos(__CLC_GENTYPE x,
-                                             local __CLC_GENTYPE *cosval);
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE sincos(__CLC_GENTYPE x,
-                                             private __CLC_GENTYPE *cosval);
diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES
index 781aaf16a5244..8c7565e3dd231 100644
--- a/libclc/generic/lib/SOURCES
+++ b/libclc/generic/lib/SOURCES
@@ -131,7 +131,6 @@ math/native_rsqrt.cl
 math/native_sin.cl
 math/native_sqrt.cl
 math/native_tan.cl
-math/tables.cl
 math/nextafter.cl
 math/pow.cl
 math/pown.cl
@@ -144,7 +143,6 @@ math/round.cl
 math/rsqrt.cl
 math/sin.cl
 math/sincos.cl
-math/sincos_helpers.cl
 math/sinh.cl
 math/sinpi.cl
 math/sqrt.cl
diff --git a/libclc/generic/lib/math/clc_tan.cl b/libclc/generic/lib/math/clc_tan.cl
index 7e28e9ffed3b6..ce51c1031fa45 100644
--- a/libclc/generic/lib/math/clc_tan.cl
+++ b/libclc/generic/lib/math/clc_tan.cl
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "sincos_helpers.h"
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
diff --git a/libclc/generic/lib/math/cos.cl b/libclc/generic/lib/math/cos.cl
index 00ffa371ad8fc..5b97c6a6f379a 100644
--- a/libclc/generic/lib/math/cos.cl
+++ b/libclc/generic/lib/math/cos.cl
@@ -6,45 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "sincos_helpers.h"
 #include <clc/clc.h>
-#include <clc/clc_convert.h>
-#include <clc/clcmacro.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_sincos_helpers.h>
-#include <clc/math/math.h>
-#include <clc/relational/clc_isinf.h>
-#include <clc/relational/clc_isnan.h>
-#include <clc/relational/clc_select.h>
+#include <clc/math/clc_cos.h>
 
-// FP32 and FP16 versions.
-#define __CLC_BODY <cos.inc>
+#define FUNCTION cos
+#define __CLC_BODY <clc/shared/unary_def.inc>
 #include <clc/math/gentype.inc>
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-_CLC_OVERLOAD _CLC_DEF double cos(double x) {
-    x = fabs(x);
-
-    double r, rr;
-    int regn;
-
-    if (x < 0x1.0p+47)
-        __clc_remainder_piby2_medium(x, &r, &rr, &regn);
-    else
-        __clc_remainder_piby2_large(x, &r, &rr, &regn);
-
-    double2 sc = __clc_sincos_piby4(r, rr);
-    sc.lo = -sc.lo;
-
-    int2 c = as_int2(regn & 1 ? sc.lo : sc.hi);
-    c.hi ^= (regn > 1) << 31;
-
-    return isnan(x) | isinf(x) ? as_double(QNANBITPATT_DP64) : as_double(c);
-}
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cos, double);
-
-#endif
diff --git a/libclc/generic/lib/math/cos.inc b/libclc/generic/lib/math/cos.inc
deleted file mode 100644
index 1db96710457dd..0000000000000
--- a/libclc/generic/lib/math/cos.inc
+++ /dev/null
@@ -1,34 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN cos(__CLC_FLOATN x) {
-  __CLC_FLOATN absx = __clc_fabs(x);
-
-  __CLC_FLOATN r0, r1;
-  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
-
-  __CLC_FLOATN ss = -__clc_sinf_piby4(r0, r1);
-  __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
-
-  __CLC_FLOATN c = (regn & 1) != 0 ? ss : cc;
-  c = __CLC_AS_FLOATN(__CLC_AS_INTN(c) ^ ((regn > 1) << 31));
-
-  c = __clc_select(c, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x));
-
-  return c;
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE cos(__CLC_GENTYPE x) {
-  return __CLC_CONVERT_GENTYPE(cos(__CLC_CONVERT_FLOATN(x)));
-}
-
-#endif
diff --git a/libclc/generic/lib/math/sin.cl b/libclc/generic/lib/math/sin.cl
index f776805ad1cdb..76728cfb1c5e5 100644
--- a/libclc/generic/lib/math/sin.cl
+++ b/libclc/generic/lib/math/sin.cl
@@ -6,44 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "sincos_helpers.h"
 #include <clc/clc.h>
-#include <clc/clc_convert.h>
-#include <clc/clcmacro.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_sincos_helpers.h>
-#include <clc/math/math.h>
-#include <clc/relational/clc_isinf.h>
-#include <clc/relational/clc_isnan.h>
-#include <clc/relational/clc_select.h>
+#include <clc/math/clc_sin.h>
 
-// FP32 and FP16 versions.
-#define __CLC_BODY <sin.inc>
+#define FUNCTION sin
+#define __CLC_BODY <clc/shared/unary_def.inc>
 #include <clc/math/gentype.inc>
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-_CLC_OVERLOAD _CLC_DEF double sin(double x) {
-    double y = fabs(x);
-
-    double r, rr;
-    int regn;
-
-    if (y < 0x1.0p+47)
-        __clc_remainder_piby2_medium(y, &r, &rr, &regn);
-    else
-        __clc_remainder_piby2_large(y, &r, &rr, &regn);
-
-    double2 sc = __clc_sincos_piby4(r, rr);
-
-    int2 s = as_int2(regn & 1 ? sc.hi : sc.lo);
-    s.hi ^= ((regn > 1) << 31) ^ ((x < 0.0) << 31);
-
-    return  isinf(x) | isnan(x) ? as_double(QNANBITPATT_DP64) : as_double(s);
-}
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sin, double);
-
-#endif
diff --git a/libclc/generic/lib/math/sin.inc b/libclc/generic/lib/math/sin.inc
deleted file mode 100644
index dbc99116dfa7d..0000000000000
--- a/libclc/generic/lib/math/sin.inc
+++ /dev/null
@@ -1,38 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN sin(__CLC_FLOATN x) {
-  __CLC_FLOATN absx = __clc_fabs(x);
-
-  __CLC_FLOATN r0, r1;
-  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
-
-  __CLC_FLOATN ss = __clc_sinf_piby4(r0, r1);
-  __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
-
-  __CLC_FLOATN s = (regn & 1) != 0 ? cc : ss;
-  s = __CLC_AS_FLOATN(__CLC_AS_INTN(s) ^ ((regn > 1) << 31) ^
-                      (__CLC_AS_INTN(x) ^ __CLC_AS_INTN(absx)));
-
-  s = __clc_select(s, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x));
-
-  // Subnormals
-  s = x == 0.0f ? x : s;
-
-  return s;
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE sin(__CLC_GENTYPE x) {
-  return __CLC_CONVERT_GENTYPE(sin(__CLC_CONVERT_FLOATN(x)));
-}
-
-#endif
diff --git a/libclc/generic/lib/math/sincos.cl b/libclc/generic/lib/math/sincos.cl
index fc0e4d350c122..25e620cf2a38e 100644
--- a/libclc/generic/lib/math/sincos.cl
+++ b/libclc/generic/lib/math/sincos.cl
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include <clc/clc.h>
+#include <clc/math/clc_sincos.h>
 
-#define __CLC_BODY <sincos.inc>
+#define FUNCTION sincos
+#define __CLC_BODY <clc/math/unary_def_with_ptr.inc>
 #include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl
deleted file mode 100644
index 651cd11ccf016..0000000000000
--- a/libclc/generic/lib/math/sincos_helpers.cl
+++ /dev/null
@@ -1,285 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "sincos_helpers.h"
-#include <clc/clc.h>
-#include <clc/integer/clc_clz.h>
-#include <clc/integer/clc_mul_hi.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_trunc.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
-#include <clc/shared/clc_max.h>
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-#define bytealign(src0, src1, src2)                                            \
-  ((uint)(((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3) * 8)))
-
-// Reduction for medium sized arguments
-_CLC_DEF void __clc_remainder_piby2_medium(double x, private double *r,
-                                           private double *rr,
-                                           private int *regn) {
-  // How many pi/2 is x a multiple of?
-  const double two_by_pi = 0x1.45f306dc9c883p-1;
-  double dnpi2 = __clc_trunc(__clc_fma(x, two_by_pi, 0.5));
-
-  const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
-  const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
-  const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
-
-  // Compute product of npi2 with 159 bits of 2/pi
-  double p_hh = piby2_h * dnpi2;
-  double p_ht = __clc_fma(piby2_h, dnpi2, -p_hh);
-  double p_mh = piby2_m * dnpi2;
-  double p_mt = __clc_fma(piby2_m, dnpi2, -p_mh);
-  double p_th = piby2_t * dnpi2;
-  double p_tt = __clc_fma(piby2_t, dnpi2, -p_th);
-
-  // Reduce to 159 bits
-  double ph = p_hh;
-  double pm = p_ht + p_mh;
-  double t = p_mh - (pm - p_ht);
-  double pt = p_th + t + p_mt + p_tt;
-  t = ph + pm;
-  pm = pm - (t - ph);
-  ph = t;
-  t = pm + pt;
-  pt = pt - (t - pm);
-  pm = t;
-
-  // Subtract from x
-  t = x + ph;
-  double qh = t + pm;
-  double qt = pm - (qh - t) + pt;
-
-  *r = qh;
-  *rr = qt;
-  *regn = (int)(long)dnpi2 & 0x3;
-}
-
-// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
-// extra precision, and return the result in r, rr.
-// Return value "regn" tells how many lots of pi/2 were subtracted
-// from x to put it in the range [-pi/4,pi/4], mod 4.
-
-_CLC_DEF void __clc_remainder_piby2_large(double x, private double *r,
-                                          private double *rr,
-                                          private int *regn) {
-
-  long ux = as_long(x);
-  int e = (int)(ux >> 52) - 1023;
-  int i = __clc_max(23, (e >> 3) + 17);
-  int j = 150 - i;
-  int j16 = j & ~0xf;
-  double fract_temp;
-
-  // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary
-  // byte boundary
-  uint4 q0 = USE_TABLE(pibits_tbl, j16);
-  uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
-  uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
-
-  int k = (j >> 2) & 0x3;
-  int4 c = (int4)k == (int4)(0, 1, 2, 3);
-
-  uint u0, u1, u2, u3, u4, u5, u6;
-
-  u0 = c.s1 ? q0.s1 : q0.s0;
-  u0 = c.s2 ? q0.s2 : u0;
-  u0 = c.s3 ? q0.s3 : u0;
-
-  u1 = c.s1 ? q0.s2 : q0.s1;
-  u1 = c.s2 ? q0.s3 : u1;
-  u1 = c.s3 ? q1.s0 : u1;
-
-  u2 = c.s1 ? q0.s3 : q0.s2;
-  u2 = c.s2 ? q1.s0 : u2;
-  u2 = c.s3 ? q1.s1 : u2;
-
-  u3 = c.s1 ? q1.s0 : q0.s3;
-  u3 = c.s2 ? q1.s1 : u3;
-  u3 = c.s3 ? q1.s2 : u3;
-
-  u4 = c.s1 ? q1.s1 : q1.s0;
-  u4 = c.s2 ? q1.s2 : u4;
-  u4 = c.s3 ? q1.s3 : u4;
-
-  u5 = c.s1 ? q1.s2 : q1.s1;
-  u5 = c.s2 ? q1.s3 : u5;
-  u5 = c.s3 ? q2.s0 : u5;
-
-  u6 = c.s1 ? q1.s3 : q1.s2;
-  u6 = c.s2 ? q2.s0 : u6;
-  u6 = c.s3 ? q2.s1 : u6;
-
-  uint v0 = bytealign(u1, u0, j);
-  uint v1 = bytealign(u2, u1, j);
-  uint v2 = bytealign(u3, u2, j);
-  uint v3 = bytealign(u4, u3, j);
-  uint v4 = bytealign(u5, u4, j);
-  uint v5 = bytealign(u6, u5, j);
-
-  // Place those 192 bits in 4 48-bit doubles along with correct exponent
-  // If i > 1018 we would get subnormals so we scale p up and x down to get the
-  // same product
-  i = 2 + 8 * i;
-  x *= i > 1018 ? 0x1.0p-136 : 1.0;
-  i -= i > 1018 ? 136 : 0;
-
-  uint ua = (uint)(1023 + 52 - i) << 20;
-  double a = as_double((uint2)(0, ua));
-  double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
-  ua += 0x03000000U;
-  a = as_double((uint2)(0, ua));
-  double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
-  ua += 0x03000000U;
-  a = as_double((uint2)(0, ua));
-  double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
-  ua += 0x03000000U;
-  a = as_double((uint2)(0, ua));
-  double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
-
-  // Exact multiply
-  double f0h = p0 * x;
-  double f0l = __clc_fma(p0, x, -f0h);
-  double f1h = p1 * x;
-  double f1l = __clc_fma(p1, x, -f1h);
-  double f2h = p2 * x;
-  double f2l = __clc_fma(p2, x, -f2h);
-  double f3h = p3 * x;
-  double f3l = __clc_fma(p3, x, -f3h);
-
-  // Accumulate product into 4 doubles
-  double s, t;
-
-  double f3 = f3h + f2h;
-  t = f2h - (f3 - f3h);
-  s = f3l + t;
-  t = t - (s - f3l);
-
-  double f2 = s + f1h;
-  t = f1h - (f2 - s) + t;
-  s = f2l + t;
-  t = t - (s - f2l);
-
-  double f1 = s + f0h;
-  t = f0h - (f1 - s) + t;
-  s = f1l + t;
-
-  double f0 = s + f0l;
-
-  // Strip off unwanted large integer bits
-  f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
-  f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
-
-  // Compute least significant integer bits
-  t = f3 + f2;
-  double di = t - fract(t, &fract_temp);
-  i = (float)di;
-
-  // Shift out remaining integer part
-  f3 -= di;
-  s = f3 + f2;
-  t = f2 - (s - f3);
-  f3 = s;
-  f2 = t;
-  s = f2 + f1;
-  t = f1 - (s - f2);
-  f2 = s;
-  f1 = t;
-  f1 += f0;
-
-  // Subtract 1 if fraction is >= 0.5, and update regn
-  int g = f3 >= 0.5;
-  i += g;
-  f3 -= (float)g;
-
-  // Shift up bits
-  s = f3 + f2;
-  t = f2 - (s - f3);
-  f3 = s;
-  f2 = t + f1;
-
-  // Multiply precise fraction by pi/2 to get radians
-  const double p2h = 7074237752028440.0 / 0x1.0p+52;
-  const double p2t = 4967757600021510.0 / 0x1.0p+106;
-
-  double rhi = f3 * p2h;
-  double rlo = __clc_fma(f2, p2h, __clc_fma(f3, p2t, __clc_fma(f3, p2h, -rhi)));
-
-  *r = rhi + rlo;
-  *rr = rlo - (*r - rhi);
-  *regn = i & 0x3;
-}
-
-_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
-  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
-  //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
-  //                      = x * f(w)
-  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
-  // We use a minimax approximation of (f(w) - 1) / w
-  // because this produces an expansion in even powers of x.
-  // If xx (the tail of x) is non-zero, we add a correction
-  // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
-  // is an approximation to cos(x)*sin(xx) valid because
-  // xx is tiny relative to x.
-
-  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
-  //                      = f(w)
-  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
-  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
-  // because this produces an expansion in even powers of x.
-  // If xx (the tail of x) is non-zero, we subtract a correction
-  // term g(x,xx) = x*xx to the result, where g(x,xx)
-  // is an approximation to sin(x)*sin(xx) valid because
-  // xx is tiny relative to x.
-
-  const double sc1 = -0.166666666666666646259241729;
-  const double sc2 = 0.833333333333095043065222816e-2;
-  const double sc3 = -0.19841269836761125688538679e-3;
-  const double sc4 = 0.275573161037288022676895908448e-5;
-  const double sc5 = -0.25051132068021699772257377197e-7;
-  const double sc6 = 0.159181443044859136852668200e-9;
-
-  const double cc1 = 0.41666666666666665390037e-1;
-  const double cc2 = -0.13888888888887398280412e-2;
-  const double cc3 = 0.248015872987670414957399e-4;
-  const double cc4 = -0.275573172723441909470836e-6;
-  const double cc5 = 0.208761463822329611076335e-8;
-  const double cc6 = -0.113826398067944859590880e-10;
-
-  double x2 = x * x;
-  double x3 = x2 * x;
-  double r = 0.5 * x2;
-  double t = 1.0 - r;
-
-  double sp = __clc_fma(
-      __clc_fma(__clc_fma(__clc_fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
-
-  double cp =
-      t +
-      __clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(cc6, x2, cc5),
-                                                        x2, cc4),
-                                              x2, cc3),
-                                    x2, cc2),
-                          x2, cc1),
-                x2 * x2, __clc_fma(x, xx, (1.0 - t) - r));
-
-  double2 ret;
-  ret.lo =
-      x - __clc_fma(-x3, sc1, __clc_fma(__clc_fma(-x3, sp, 0.5 * xx), x2, -xx));
-  ret.hi = cp;
-
-  return ret;
-}
-
-#endif
diff --git a/libclc/generic/lib/math/sincos_helpers.h b/libclc/generic/lib/math/sincos_helpers.h
deleted file mode 100644
index 11cb93f34850d..0000000000000
--- a/libclc/generic/lib/math/sincos_helpers.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <clc/clcfunc.h>
-#include <clc/clctypes.h>
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-_CLC_DECL void __clc_remainder_piby2_medium(double x, private double *r,
-                                            private double *rr,
-                                            private int *regn);
-_CLC_DECL void __clc_remainder_piby2_large(double x, private double *r,
-                                           private double *rr,
-                                           private int *regn);
-_CLC_DECL double2 __clc_sincos_piby4(double x, double xx);
-
-#endif
diff --git a/libclc/generic/lib/math/tables.cl b/libclc/generic/lib/math/tables.cl
deleted file mode 100644
index 1bda480a9fa72..0000000000000
--- a/libclc/generic/lib/math/tables.cl
+++ /dev/null
@@ -1,30 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <clc/clc.h>
-
-#include <clc/math/tables.h>
-
-DECLARE_TABLE(uchar, PIBITS_TBL, ) = {
-    224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175,
-    169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31,
-    235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38,
-    44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188,
-    188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247,
-    46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249,
-    125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8,
-    162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39,
-    168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21,
-    239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109,
-    3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13,
-    230, 139, 2, 0, 0, 0, 0, 0, 0, 0
-};
-
-uint4 TABLE_MANGLE(pibits_tbl)(size_t idx) {
-    return *(__constant uint4 *)(PIBITS_TBL + idx);
-}
diff --git a/libclc/spirv/lib/SOURCES b/libclc/spirv/lib/SOURCES
index 5446fe13a6d93..20e052253d977 100644
--- a/libclc/spirv/lib/SOURCES
+++ b/libclc/spirv/lib/SOURCES
@@ -55,7 +55,6 @@ math/fma.cl
 ../../generic/lib/math/log2.cl
 ../../generic/lib/math/logb.cl
 ../../generic/lib/math/modf.cl
-../../generic/lib/math/tables.cl
 ../../generic/lib/math/pow.cl
 ../../generic/lib/math/pown.cl
 ../../generic/lib/math/powr.cl
@@ -64,7 +63,6 @@ math/fma.cl
 ../../generic/lib/math/rootn.cl
 ../../generic/lib/math/sin.cl
 ../../generic/lib/math/sincos.cl
-../../generic/lib/math/sincos_helpers.cl
 ../../generic/lib/math/sinh.cl
 ../../generic/lib/math/sinpi.cl
 ../../generic/lib/math/clc_tan.cl

>From 212ae34f61507cc71dcd20f21af7a02e8e62593a Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser at codeplay.com>
Date: Thu, 1 May 2025 17:48:20 +0100
Subject: [PATCH 2/3] update table vars

---
 .../generic/math/clc_sincos_helpers_fp64.inc  | 69 ++++++++++---------
 1 file changed, 35 insertions(+), 34 deletions(-)

diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
index 3a8edb378cae2..9b5776d9b05e9 100644
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
@@ -59,7 +59,7 @@ __clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
                             private __CLC_INTN *regn) {
 
   __CLC_LONGN ux = __CLC_AS_LONGN(x);
-  __CLC_INTN e = __CLC_CONVERT_INTN(ux >> 52) - 1023;
+  __CLC_INTN e = __CLC_CONVERT_INTN(ux >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
   __CLC_INTN i = __clc_max(23, (e >> 3) + 17);
   __CLC_INTN j = 150 - i;
   __CLC_INTN j16 = j & ~0xf;
@@ -68,18 +68,10 @@ __clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
   // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary
   // byte boundary
   __CLC_ULONGN q0 = USE_TABLE(pibits_tbl, j16);
-  __CLC_ULONGN q3 = USE_TABLE(pibits_tbl, (j16 + 8));
-  __CLC_ULONGN q1 = USE_TABLE(pibits_tbl, (j16 + 16));
-  __CLC_ULONGN q4 = USE_TABLE(pibits_tbl, (j16 + 24));
-  __CLC_ULONGN q2 = USE_TABLE(pibits_tbl, (j16 + 32));
-
-  __CLC_INTN k = (j >> 2) & 0x3;
-  __CLC_INTN c0 = k == 0;
-  __CLC_INTN c1 = k == 1;
-  __CLC_INTN c2 = k == 2;
-  __CLC_INTN c3 = k == 3;
-
-  __CLC_UINTN u0, u1, u2, u3, u4, u5, u6;
+  __CLC_ULONGN q1 = USE_TABLE(pibits_tbl, (j16 + 8));
+  __CLC_ULONGN q2 = USE_TABLE(pibits_tbl, (j16 + 16));
+  __CLC_ULONGN q3 = USE_TABLE(pibits_tbl, (j16 + 24));
+  __CLC_ULONGN q4 = USE_TABLE(pibits_tbl, (j16 + 32));
 
   __CLC_UINTN q0s0 = __CLC_CONVERT_UINTN(q0);
   __CLC_UINTN q0s1 = __CLC_CONVERT_UINTN(q0 >> 32);
@@ -96,33 +88,41 @@ __clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
   __CLC_UINTN q4s0 = __CLC_CONVERT_UINTN(q4);
   __CLC_UINTN q4s1 = __CLC_CONVERT_UINTN(q4 >> 32);
 
+  __CLC_INTN k = (j >> 2) & 0x3;
+  __CLC_INTN c0 = k == 0;
+  __CLC_INTN c1 = k == 1;
+  __CLC_INTN c2 = k == 2;
+  __CLC_INTN c3 = k == 3;
+
+  __CLC_UINTN u0, u1, u2, u3, u4, u5, u6;
+
   u0 = c1 ? q0s1 : q0s0;
-  u0 = c2 ? q3s0 : u0;
-  u0 = c3 ? q3s1 : u0;
+  u0 = c2 ? q1s0 : u0;
+  u0 = c3 ? q1s1 : u0;
 
-  u1 = c1 ? q3s0 : q0s1;
-  u1 = c2 ? q3s1 : u1;
-  u1 = c3 ? q1s0 : u1;
+  u1 = c1 ? q1s0 : q0s1;
+  u1 = c2 ? q1s1 : u1;
+  u1 = c3 ? q2s0 : u1;
 
-  u2 = c1 ? q3s1 : q3s0;
-  u2 = c2 ? q1s0 : u2;
-  u2 = c3 ? q1s1 : u2;
+  u2 = c1 ? q1s1 : q1s0;
+  u2 = c2 ? q2s0 : u2;
+  u2 = c3 ? q2s1 : u2;
 
-  u3 = c1 ? q1s0 : q3s1;
-  u3 = c2 ? q1s1 : u3;
-  u3 = c3 ? q4s0 : u3;
+  u3 = c1 ? q2s0 : q1s1;
+  u3 = c2 ? q2s1 : u3;
+  u3 = c3 ? q3s0 : u3;
 
-  u4 = c1 ? q1s1 : q1s0;
-  u4 = c2 ? q4s0 : u4;
-  u4 = c3 ? q4s1 : u4;
+  u4 = c1 ? q2s1 : q2s0;
+  u4 = c2 ? q3s0 : u4;
+  u4 = c3 ? q3s1 : u4;
 
-  u5 = c1 ? q4s0 : q1s1;
-  u5 = c2 ? q4s1 : u5;
-  u5 = c3 ? q2s0 : u5;
+  u5 = c1 ? q3s0 : q2s1;
+  u5 = c2 ? q3s1 : u5;
+  u5 = c3 ? q4s0 : u5;
 
-  u6 = c1 ? q4s1 : q4s0;
-  u6 = c2 ? q2s0 : u6;
-  u6 = c3 ? q2s1 : u6;
+  u6 = c1 ? q3s1 : q3s0;
+  u6 = c2 ? q4s0 : u6;
+  u6 = c3 ? q4s1 : u6;
 
   __CLC_UINTN v0 = bytealign(u1, u0, j);
   __CLC_UINTN v1 = bytealign(u2, u1, j);
@@ -142,7 +142,8 @@ __clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
   __CLC_AS_DOUBLEN(__CLC_CONVERT_ULONGN((x)) & 0xFFFFFFFF |                    \
                    __CLC_CONVERT_ULONGN((y)) << 32)
 
-  __CLC_UINTN ua = __CLC_CONVERT_UINTN(1023 + 52 - i) << 20;
+  __CLC_UINTN ua = __CLC_CONVERT_UINTN(EXPBIAS_DP64 + EXPSHIFTBITS_DP64 - i)
+                   << 20;
   __CLC_DOUBLEN a = doublen_lohi((__CLC_ULONGN)0, ua);
   __CLC_DOUBLEN p0 = doublen_lohi(v0, ua | (v1 & 0xffffU)) - a;
   ua += 0x03000000U;

>From f629943b86bab7a283979a15adc15633c68e882f Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser at codeplay.com>
Date: Mon, 12 May 2025 11:17:17 +0100
Subject: [PATCH 3/3] fix formatting

---
 libclc/clc/lib/generic/math/clc_tables.cl | 24 +++++++++++------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/libclc/clc/lib/generic/math/clc_tables.cl b/libclc/clc/lib/generic/math/clc_tables.cl
index 05053aea7bc12..7db00532c8be5 100644
--- a/libclc/clc/lib/generic/math/clc_tables.cl
+++ b/libclc/clc/lib/generic/math/clc_tables.cl
@@ -1649,19 +1649,17 @@ CLC_TABLE_FUNCTION(double, COSH_TBL_HEAD, cosh_tbl_head);
 CLC_TABLE_FUNCTION(double, COSH_TBL_TAIL, cosh_tbl_tail);
 
 DECLARE_TABLE(uchar, PIBITS_TBL, ) = {
-    224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175,
-    169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31,
-    235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38,
-    44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188,
-    188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247,
-    46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249,
-    125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8,
-    162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39,
-    168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21,
-    239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109,
-    3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13,
-    230, 139, 2, 0, 0, 0, 0, 0, 0, 0
-};
+    224, 241, 27,  193, 12,  88,  33,  116, 53,  126, 196, 126, 237, 175, 169,
+    75,  74,  41,  222, 231, 28,  244, 236, 197, 151, 175, 31,  235, 158, 212,
+    181, 168, 127, 121, 154, 253, 24,  61,  221, 38,  44,  159, 60,  251, 217,
+    180, 125, 180, 41,  104, 45,  70,  188, 188, 63,  96,  22,  120, 255, 95,
+    226, 127, 236, 160, 228, 247, 46,  126, 17,  114, 210, 231, 76,  13,  230,
+    88,  71,  230, 4,   249, 125, 209, 154, 192, 113, 166, 19,  18,  237, 186,
+    212, 215, 8,   162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72,  70,
+    39,  168, 187, 36,  25,  128, 75,  55,  9,   233, 184, 145, 220, 134, 21,
+    239, 122, 175, 142, 69,  249, 7,   65,  14,  241, 100, 86,  138, 109, 3,
+    119, 211, 212, 71,  95,  157, 240, 167, 84,  16,  57,  185, 13,  230, 139,
+    2,   0,   0,   0,   0,   0,   0,   0};
 
 _CLC_DEF _CLC_OVERLOAD ulong TABLE_MANGLE(pibits_tbl)(int idx) {
   return *(__constant ulong *)(PIBITS_TBL + idx);