[libclc] 641569d - libclc: Improve tgamma handling (#188066)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Mar 24 02:45:11 PDT 2026
Author: Matt Arsenault
Date: 2026-03-24T10:45:07+01:00
New Revision: 641569de4e07f3a38af6720cabd84a91c73d8143
URL: https://github.com/llvm/llvm-project/commit/641569de4e07f3a38af6720cabd84a91c73d8143
DIFF: https://github.com/llvm/llvm-project/commit/641569de4e07f3a38af6720cabd84a91c73d8143.diff
LOG: libclc: Improve tgamma handling (#188066)
Added:
libclc/clc/lib/generic/math/clc_tgamma.inc
Modified:
libclc/clc/lib/generic/math/clc_tgamma.cl
Removed:
################################################################################
diff --git a/libclc/clc/lib/generic/math/clc_tgamma.cl b/libclc/clc/lib/generic/math/clc_tgamma.cl
index 0631028b6ce2f..7e34ac6cb8095 100644
--- a/libclc/clc/lib/generic/math/clc_tgamma.cl
+++ b/libclc/clc/lib/generic/math/clc_tgamma.cl
@@ -6,65 +6,23 @@
//
//===----------------------------------------------------------------------===//
+#include "clc/math/clc_tgamma.h"
+
+#include "clc/clc_convert.h"
#include "clc/float/definitions.h"
-#include "clc/internal/clc.h"
+#include "clc/math/clc_copysign.h"
#include "clc/math/clc_exp.h"
#include "clc/math/clc_fabs.h"
-#include "clc/math/clc_lgamma.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_powr.h"
+#include "clc/math/clc_recip_fast.h"
#include "clc/math/clc_sinpi.h"
-#include "clc/math/math.h"
-
-_CLC_OVERLOAD _CLC_DEF float __clc_tgamma(float x) {
- const float pi = 3.1415926535897932384626433832795f;
- float absx = __clc_fabs(x);
- float lg = __clc_lgamma(absx);
- float g = __clc_exp(lg);
-
- if (x < 0.0f) {
- float z = __clc_sinpi(x);
- g = g * absx * z;
- g = pi / g;
- g = g == 0 ? INFINITY : g;
- g = z == 0 ? FLT_NAN : g;
- }
-
- return g;
-}
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-_CLC_OVERLOAD _CLC_DEF double __clc_tgamma(double x) {
- const double pi = 3.1415926535897932384626433832795;
- double absx = __clc_fabs(x);
- double lg = __clc_lgamma(absx);
- double g = __clc_exp(lg);
+#include "clc/math/clc_trunc.h"
+#include "clc/relational/clc_isnan.h"
- if (x < 0.0) {
- double z = __clc_sinpi(x);
- g = g * absx * z;
- g = pi / g;
- g = g == 0 ? INFINITY : g;
- g = z == 0 ? DBL_NAN : g;
- }
-
- return g;
-}
-
-#endif
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// Forward the half version of this builtin onto the float one
-_CLC_OVERLOAD _CLC_DEF half __clc_tgamma(half x) {
- return (half)__clc_tgamma((float)x);
-}
-
-#endif
+#define __CLC_BODY "clc_tgamma.inc"
+#include "clc/math/gentype.inc"
#define __CLC_FUNCTION __clc_tgamma
-#define __CLC_BODY "clc/shared/unary_def_scalarize.inc"
+#define __CLC_BODY "clc/shared/unary_def_scalarize_loop.inc"
#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_tgamma.inc b/libclc/clc/lib/generic/math/clc_tgamma.inc
new file mode 100644
index 0000000000000..0b8c354ba48d6
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_tgamma.inc
@@ -0,0 +1,213 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+#if __CLC_FPSIZE != 16
+/// \returns true if the fractional part of \p x is 0. This is equivalent to
+// __clc_fract(x, / unused) == 0.0
+static _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_fract_is_zero(__CLC_GENTYPE x) {
+ return __clc_trunc(x) == x;
+}
+#endif
+
+#if __CLC_FPSIZE == 32
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_tgamma(__CLC_FLOATN x) {
+ __CLC_FLOATN ax = __clc_fabs(x);
+ __CLC_FLOATN ret;
+
+ if (ax < 16.0f) {
+ __CLC_FLOATN n, d;
+ __CLC_FLOATN y = x;
+ if (x > 0.0f) {
+ n = 1.0f;
+
+ while (y > 2.5f) {
+ n = __clc_mad(n, y, -n);
+ y = y - 1.0f;
+ n = __clc_mad(n, y, -n);
+ y = y - 1.0f;
+ }
+
+ if (y > 1.5f) {
+ n = __clc_mad(n, y, -n);
+ y = y - 1.0f;
+ }
+
+ if (x >= 0.5f)
+ y = y - 1.0f;
+
+ d = x < 0.5f ? x : 1.0f;
+ } else {
+ d = x;
+
+ while (y < -1.5f) {
+ d = __clc_mad(d, y, d);
+ y = y + 1.0f;
+ d = __clc_mad(d, y, d);
+ y = y + 1.0f;
+ }
+
+ if (y < -0.5f) {
+ d = __clc_mad(d, y, d);
+ y = y + 1.0f;
+ }
+
+ n = 1.0f;
+ }
+
+ __CLC_FLOATN p0 = __clc_mad(y, 0x1.d5a56ep-8f, -0x1.4dcb00p-7f);
+ __CLC_FLOATN p1 = __clc_mad(y, p0, -0x1.59c03ap-5f);
+ __CLC_FLOATN p2 = __clc_mad(y, p1, 0x1.55405ap-3f);
+ __CLC_FLOATN p3 = __clc_mad(y, p2, -0x1.5810f2p-5f);
+ __CLC_FLOATN p4 = __clc_mad(y, p3, -0x1.4fcfd6p-1f);
+ __CLC_FLOATN qt = __clc_mad(y, p4, 0x1.2788ccp-1f);
+
+ ret = n / __clc_mad(d, y * qt, d);
+ ret = x == 0.0f ? __clc_copysign(__CLC_GENTYPE_INF, x) : ret;
+ ret = (x < 0.0f && __clc_fract_is_zero(x)) ? FLT_NAN : ret;
+ } else {
+ const __CLC_FLOATN sqrt2pi = 0x1.40d932p+1f;
+ const __CLC_FLOATN sqrtpiby2 = 0x1.40d932p+0f;
+
+ __CLC_FLOATN t1 = __clc_powr(ax, __clc_mad(ax, 0.5f, -0.25f));
+ __CLC_FLOATN t2 = __clc_exp(-ax);
+ __CLC_FLOATN xr = __clc_recip_fast(ax);
+ __CLC_FLOATN p = __clc_mad(
+ xr, __clc_mad(xr, 0x1.96d7e4p-9f, 0x1.556652p-4f), 0x1.fffff8p-1f);
+ if (x > 0.0f) {
+ __CLC_FLOATN g = sqrt2pi * t2 * t1 * t1 * p;
+ ret = x > 0x1.18521ep+5f ? __CLC_GENTYPE_INF : g;
+ } else {
+ __CLC_FLOATN s = -x * __clc_sinpi(x);
+ if (x > -30.0f)
+ ret = sqrtpiby2 / (s * t2 * t1 * t1 * p);
+ else if (x > -41.0f)
+ ret = (sqrtpiby2 / (t2 * t1 * p)) / (s * t1);
+ else
+ ret = __clc_copysign(0.0f, s);
+
+ ret = __clc_fract_is_zero(x) || __clc_isnan(x) ? FLT_NAN : ret;
+ }
+ }
+
+ return ret;
+}
+
+#elif __CLC_FPSIZE == 64
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_tgamma(__CLC_DOUBLEN x) {
+ __CLC_DOUBLEN ax = __clc_fabs(x);
+ __CLC_DOUBLEN ret;
+
+ if (ax < 16.0) {
+ __CLC_DOUBLEN n, d;
+ __CLC_DOUBLEN y = x;
+ if (x > 0.0) {
+ n = 1.0;
+ while (y > 2.5) {
+ n = __clc_mad(n, y, -n);
+ y = y - 1.0;
+ n = __clc_mad(n, y, -n);
+ y = y - 1.0;
+ }
+
+ if (y > 1.5) {
+ n = __clc_mad(n, y, -n);
+ y = y - 1.0;
+ }
+
+ if (x >= 0.5)
+ y = y - 1.0;
+
+ d = x < 0.5 ? x : 1.0;
+ } else {
+ d = x;
+
+ while (y < -1.5) {
+ d = __clc_mad(d, y, d);
+ y = y + 1.0;
+ d = __clc_mad(d, y, d);
+ y = y + 1.0;
+ }
+
+ if (y < -0.5) {
+ d = __clc_mad(d, y, d);
+ y = y + 1.0;
+ }
+
+ n = 1.0;
+ }
+
+ __CLC_DOUBLEN t0 =
+ __clc_mad(y, -0x1.aed75feec7b9ap-23, 0x1.31854a0be3cd3p-20);
+ __CLC_DOUBLEN t1 = __clc_mad(y, t0, -0x1.5037d6a97a8b7p-20);
+ __CLC_DOUBLEN t2 = __clc_mad(y, t1, -0x1.51d67f2cdbcfbp-16);
+ __CLC_DOUBLEN t3 = __clc_mad(y, t2, 0x1.0c8ab2ac5112dp-13);
+ __CLC_DOUBLEN t4 = __clc_mad(y, t3, -0x1.c364ce9b5e149p-13);
+ __CLC_DOUBLEN t5 = __clc_mad(y, t4, -0x1.317113a39f929p-10);
+ __CLC_DOUBLEN t6 = __clc_mad(y, t5, 0x1.d919c501178a3p-8);
+ __CLC_DOUBLEN t7 = __clc_mad(y, t6, -0x1.3b4af282da690p-7);
+ __CLC_DOUBLEN t8 = __clc_mad(y, t7, -0x1.59af103bf2cd0p-5);
+ __CLC_DOUBLEN t9 = __clc_mad(y, t8, 0x1.5512320b432ccp-3);
+ __CLC_DOUBLEN t10 = __clc_mad(y, t9, -0x1.5815e8fa28886p-5);
+ __CLC_DOUBLEN t11 = __clc_mad(y, t10, -0x1.4fcf4026afa24p-1);
+ __CLC_DOUBLEN qt = __clc_mad(y, t11, 0x1.2788cfc6fb61cp-1);
+
+ ret = n / __clc_mad(d, y * qt, d);
+ ret = x == 0.0 ? __clc_copysign(__CLC_GENTYPE_INF, x) : ret;
+ ret = (x < 0.0 && __clc_fract_is_zero(x)) ? DBL_NAN : ret;
+ } else {
+ const __CLC_DOUBLEN sqrt2pi = 0x1.40d931ff62706p+1;
+ const __CLC_DOUBLEN sqrtpiby2 = 0x1.40d931ff62706p+0;
+
+ __CLC_DOUBLEN t1 = __clc_powr(ax, __clc_mad(ax, 0.5, -0.25));
+ __CLC_DOUBLEN t2 = __clc_exp(-ax);
+ __CLC_DOUBLEN xr = __clc_recip_fast(ax);
+
+ __CLC_DOUBLEN p0 =
+ __clc_mad(xr, -0x1.2b04c5ea74bbfp-11, 0x1.14869344f1d9bp-14);
+ __CLC_DOUBLEN p1 = __clc_mad(xr, p0, 0x1.9b3457156ffefp-11);
+ __CLC_DOUBLEN p2 = __clc_mad(xr, p1, -0x1.e1427e86ee097p-13);
+ __CLC_DOUBLEN p3 = __clc_mad(xr, p2, -0x1.5f7266f67c4e0p-9);
+ __CLC_DOUBLEN p4 = __clc_mad(xr, p3, 0x1.c71c71c0f96adp-9);
+ __CLC_DOUBLEN pt = __clc_mad(xr, p4, 0x1.5555555555a28p-4);
+
+ if (x > 0.0) {
+ __CLC_DOUBLEN gt = sqrt2pi * t2 * t1 * t1;
+ __CLC_DOUBLEN g = __clc_mad(gt, xr * pt, gt);
+ ret = x > 0x1.573fae561f646p+7 ? __CLC_GENTYPE_INF : g;
+ } else {
+ __CLC_DOUBLEN s = -x * __clc_sinpi(x);
+ if (x > -170.5) {
+ __CLC_DOUBLEN d = s * t2 * t1 * t1;
+ ret = sqrtpiby2 / __clc_mad(d, xr * pt, d);
+ } else if (x > -184.0) {
+ __CLC_DOUBLEN d = t2 * t1;
+ ret = (sqrtpiby2 / __clc_mad(d, xr * pt, d)) / (s * t1);
+ } else
+ ret = __clc_copysign(0.0, s);
+
+ ret = __clc_fract_is_zero(x) || __clc_isnan(x) ? DBL_NAN : ret;
+ }
+ }
+
+ return ret;
+}
+
+#elif __CLC_FPSIZE == 16
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_tgamma(__CLC_HALFN x) {
+ return __CLC_CONVERT_HALFN(__clc_tgamma(__CLC_CONVERT_FLOATN(x)));
+}
+
+#endif
+
+#endif // __CLC_SCALAR
More information about the cfe-commits
mailing list