[libclc] ff3632c - libclc: Update lgamma_r (#188065)

Tue Mar 24 02:43:07 PDT 2026

Author: Matt Arsenault
Date: 2026-03-24T10:43:00+01:00
New Revision: ff3632cdf37c94a4eb830c6cd501b3067a79163d

URL: https://github.com/llvm/llvm-project/commit/ff3632cdf37c94a4eb830c6cd501b3067a79163d
DIFF: https://github.com/llvm/llvm-project/commit/ff3632cdf37c94a4eb830c6cd501b3067a79163d.diff

LOG: libclc: Update lgamma_r (#188065)

This was originally ported from the ROCm device libs in
0ab07e1bde7d002f1a4c30babb6241c0cc366320. This change
merges in the more recent upstream changes.

Added: 
    libclc/clc/include/clc/math/clc_lgamma_r_decl.inc
    libclc/clc/include/clc/shared/unary_with_out_arg_scalarize_loop.inc
    libclc/clc/lib/generic/math/clc_lgamma_r_stret.inc

Modified: 
    libclc/clc/include/clc/math/clc_lgamma_r.h
    libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc
    libclc/clc/lib/generic/math/clc_lgamma_r.cl
    libclc/clc/lib/generic/math/clc_lgamma_r.inc

Removed: 
    


################################################################################
diff  --git a/libclc/clc/include/clc/math/clc_lgamma_r.h b/libclc/clc/include/clc/math/clc_lgamma_r.h
index 93e8c8dcfcbe6..7029c0539b409 100644

--- a/libclc/clc/include/clc/math/clc_lgamma_r.h
+++ b/libclc/clc/include/clc/math/clc_lgamma_r.h
@@ -9,11 +9,13 @@
 #ifndef __CLC_MATH_CLC_LGAMMA_R_H__
 #define __CLC_MATH_CLC_LGAMMA_R_H__
 
+#define __CLC_BODY "clc_lgamma_r_decl.inc"
+#include "clc/math/gentype.inc"
+
 #define __CLC_FUNCTION __clc_lgamma_r
 #define __CLC_BODY "clc/math/unary_decl_with_int_ptr.inc"
 
 #include "clc/math/gentype.inc"
-
 #undef __CLC_FUNCTION
 
 #endif // __CLC_MATH_CLC_LGAMMA_R_H__

diff  --git a/libclc/clc/include/clc/math/clc_lgamma_r_decl.inc b/libclc/clc/include/clc/math/clc_lgamma_r_decl.inc
new file mode 100644
index 0000000000000..61a80a56aacdb
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_lgamma_r_decl.inc
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+typedef struct __CLC_XCONCAT(__clc_lgamma_r_ret_, __CLC_GENTYPE) {
+  __CLC_GENTYPE result;
+  __CLC_INTN sign;
+} __CLC_XCONCAT(__clc_lgamma_r_ret_, __CLC_GENTYPE);
+
+#define __CLC_LGAMMA_R_RET_GENTYPE                                             \
+  __CLC_XCONCAT(__clc_lgamma_r_ret_, __CLC_GENTYPE)
+_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_LGAMMA_R_RET_GENTYPE
+__clc_lgamma_r_stret(__CLC_GENTYPE x);
+
+#endif

diff  --git a/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc b/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc
index 2c233b36cc73c..bcd646ad073ea 100644
--- a/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc
+++ b/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc
@@ -8,6 +8,10 @@
 
 #include "clc/utils.h"
 
+#ifndef __CLC_FUNCTION
+#error missing function def
+#endif
+
 #ifndef __CLC_IMPL_FUNCTION
 #define __CLC_IMPL_FUNCTION __CLC_FUNCTION
 #endif

diff  --git a/libclc/clc/include/clc/shared/unary_with_out_arg_scalarize_loop.inc b/libclc/clc/include/clc/shared/unary_with_out_arg_scalarize_loop.inc
new file mode 100644
index 0000000000000..51404c204b7ea
--- /dev/null
+++ b/libclc/clc/include/clc/shared/unary_with_out_arg_scalarize_loop.inc
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/utils.h"
+
+#if __CLC_VECSIZE_OR_1 >= 2
+
+#ifndef __CLC_IMPL_FUNCTION
+#define __CLC_IMPL_FUNCTION __CLC_FUNCTION
+#endif
+
+#ifndef __CLC_RET_SCALAR_TYPE
+#define __CLC_RET_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_ARG1_SCALAR_TYPE
+#define __CLC_ARG1_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_OUT_ARG2_SCALAR_TYPE
+#define __CLC_OUT_ARG2_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_ADDRSPACE
+#error missing addrspace def
+#endif
+
+#define __CLC_RET_TYPE __CLC_XCONCAT(__CLC_RET_SCALAR_TYPE, __CLC_VECSIZE)
+#define __CLC_ARG1_TYPE __CLC_XCONCAT(__CLC_ARG1_SCALAR_TYPE, __CLC_VECSIZE)
+#define __CLC_OUT_ARG2_TYPE                                                    \
+  __CLC_XCONCAT(__CLC_OUT_ARG2_SCALAR_TYPE, __CLC_VECSIZE)
+
+_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE
+__CLC_FUNCTION(__CLC_ARG1_TYPE x, __CLC_ADDRSPACE __CLC_OUT_ARG2_TYPE *y) {
+  union {
+    __CLC_ARG1_TYPE vec;
+    __CLC_ARG1_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_x;
+
+  union {
+    __CLC_RET_TYPE vec;
+    __CLC_RET_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_result0;
+
+  union {
+    __CLC_OUT_ARG2_TYPE vec;
+    __CLC_OUT_ARG2_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_result1;
+
+  u_x.vec = x;
+  for (int i = 0; i < __CLC_VECSIZE_OR_1; ++i)
+    u_result0.arr[i] = __CLC_IMPL_FUNCTION(u_x.arr[i], &u_result1.arr[i]);
+
+  *y = u_result1.vec;
+  return u_result0.vec;
+}
+
+#undef __CLC_RET_TYPE
+#undef __CLC_ARG1_TYPE
+#undef __CLC_OUT_ARG2_TYPE
+
+#endif // __CLC_VECSIZE_OR_1 >= 2

diff  --git a/libclc/clc/lib/generic/math/clc_lgamma_r.cl b/libclc/clc/lib/generic/math/clc_lgamma_r.cl
index 929aadeb5357b..154a541f9baf5 100644
--- a/libclc/clc/lib/generic/math/clc_lgamma_r.cl
+++ b/libclc/clc/lib/generic/math/clc_lgamma_r.cl
@@ -6,616 +6,52 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clc/math/clc_lgamma_r.h"
+
 #include "clc/clc_convert.h"
 #include "clc/float/definitions.h"
-#include "clc/internal/clc.h"
+#include "clc/math/clc_div_fast.h"
 #include "clc/math/clc_fabs.h"
 #include "clc/math/clc_fma.h"
 #include "clc/math/clc_log.h"
 #include "clc/math/clc_mad.h"
+#include "clc/math/clc_recip_fast.h"
 #include "clc/math/clc_sinpi.h"
-#include "clc/math/math.h"
-
-// ====================================================
-// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
-//
-// Developed at SunPro, a Sun Microsystems, Inc. business.
-// Permission to use, copy, modify, and distribute this
-// software is freely granted, provided that this notice
-// is preserved.
-// ====================================================
-
-#define pi_f 3.1415927410e+00f /* 0x40490fdb */
-
-#define a0_f 7.7215664089e-02f  /* 0x3d9e233f */
-#define a1_f 3.2246702909e-01f  /* 0x3ea51a66 */
-#define a2_f 6.7352302372e-02f  /* 0x3d89f001 */
-#define a3_f 2.0580807701e-02f  /* 0x3ca89915 */
-#define a4_f 7.3855509982e-03f  /* 0x3bf2027e */
-#define a5_f 2.8905137442e-03f  /* 0x3b3d6ec6 */
-#define a6_f 1.1927076848e-03f  /* 0x3a9c54a1 */
-#define a7_f 5.1006977446e-04f  /* 0x3a05b634 */
-#define a8_f 2.2086278477e-04f  /* 0x39679767 */
-#define a9_f 1.0801156895e-04f  /* 0x38e28445 */
-#define a10_f 2.5214456400e-05f /* 0x37d383a2 */
-#define a11_f 4.4864096708e-05f /* 0x383c2c75 */
-
-#define tc_f 1.4616321325e+00f /* 0x3fbb16c3 */
-
-#define tf_f -1.2148628384e-01f /* 0xbdf8cdcd */
-/* tt -(tail of tf) */
-#define tt_f 6.6971006518e-09f /* 0x31e61c52 */
-
-#define t0_f 4.8383611441e-01f   /* 0x3ef7b95e */
-#define t1_f -1.4758771658e-01f  /* 0xbe17213c */
-#define t2_f 6.4624942839e-02f   /* 0x3d845a15 */
-#define t3_f -3.2788541168e-02f  /* 0xbd064d47 */
-#define t4_f 1.7970675603e-02f   /* 0x3c93373d */
-#define t5_f -1.0314224288e-02f  /* 0xbc28fcfe */
-#define t6_f 6.1005386524e-03f   /* 0x3bc7e707 */
-#define t7_f -3.6845202558e-03f  /* 0xbb7177fe */
-#define t8_f 2.2596477065e-03f   /* 0x3b141699 */
-#define t9_f -1.4034647029e-03f  /* 0xbab7f476 */
-#define t10_f 8.8108185446e-04f  /* 0x3a66f867 */
-#define t11_f -5.3859531181e-04f /* 0xba0d3085 */
-#define t12_f 3.1563205994e-04f  /* 0x39a57b6b */
-#define t13_f -3.1275415677e-04f /* 0xb9a3f927 */
-#define t14_f 3.3552918467e-04f  /* 0x39afe9f7 */
-
-#define u0_f -7.7215664089e-02f /* 0xbd9e233f */
-#define u1_f 6.3282704353e-01f  /* 0x3f2200f4 */
-#define u2_f 1.4549225569e+00f  /* 0x3fba3ae7 */
-#define u3_f 9.7771751881e-01f  /* 0x3f7a4bb2 */
-#define u4_f 2.2896373272e-01f  /* 0x3e6a7578 */
-#define u5_f 1.3381091878e-02f  /* 0x3c5b3c5e */
-
-#define v1_f 2.4559779167e+00f /* 0x401d2ebe */
-#define v2_f 2.1284897327e+00f /* 0x4008392d */
-#define v3_f 7.6928514242e-01f /* 0x3f44efdf */
-#define v4_f 1.0422264785e-01f /* 0x3dd572af */
-#define v5_f 3.2170924824e-03f /* 0x3b52d5db */
-
-#define s0_f -7.7215664089e-02f /* 0xbd9e233f */
-#define s1_f 2.1498242021e-01f  /* 0x3e5c245a */
-#define s2_f 3.2577878237e-01f  /* 0x3ea6cc7a */
-#define s3_f 1.4635047317e-01f  /* 0x3e15dce6 */
-#define s4_f 2.6642270386e-02f  /* 0x3cda40e4 */
-#define s5_f 1.8402845599e-03f  /* 0x3af135b4 */
-#define s6_f 3.1947532989e-05f  /* 0x3805ff67 */
-
-#define r1_f 1.3920053244e+00f /* 0x3fb22d3b */
-#define r2_f 7.2193557024e-01f /* 0x3f38d0c5 */
-#define r3_f 1.7193385959e-01f /* 0x3e300f6e */
-#define r4_f 1.8645919859e-02f /* 0x3c98bf54 */
-#define r5_f 7.7794247773e-04f /* 0x3a4beed6 */
-#define r6_f 7.3266842264e-06f /* 0x36f5d7bd */
-
-#define w0_f 4.1893854737e-01f  /* 0x3ed67f1d */
-#define w1_f 8.3333335817e-02f  /* 0x3daaaaab */
-#define w2_f -2.7777778450e-03f /* 0xbb360b61 */
-#define w3_f 7.9365057172e-04f  /* 0x3a500cfd */
-#define w4_f -5.9518753551e-04f /* 0xba1c065c */
-#define w5_f 8.3633989561e-04f  /* 0x3a5b3dd2 */
-#define w6_f -1.6309292987e-03f /* 0xbad5c4e8 */
-
-_CLC_OVERLOAD _CLC_DEF float __clc_lgamma_r(float x, private int *signp) {
-  int hx = __clc_as_int(x);
-  float absx = __clc_fabs(x);
-  int ix = __clc_as_int(absx);
-
-  if (ix >= 0x7f800000) {
-    *signp = 1;
-    return x;
-  }
-
-  if (absx < 0x1.0p-70f) {
-    *signp = hx < 0 ? -1 : 1;
-    return -__clc_log(absx);
-  }
-
-  float r;
-
-  if (absx == 1.0f | absx == 2.0f)
-    r = 0.0f;
-
-  else if (absx < 2.0f) {
-    float y = 2.0f - absx;
-    int i = 0;
-
-    int c = absx < 0x1.bb4c30p+0f;
-    float yt = absx - tc_f;
-    y = c ? yt : y;
-    i = c ? 1 : i;
-
-    c = absx < 0x1.3b4c40p+0f;
-    yt = absx - 1.0f;
-    y = c ? yt : y;
-    i = c ? 2 : i;
-
-    r = -__clc_log(absx);
-    yt = 1.0f - absx;
-    c = absx <= 0x1.ccccccp-1f;
-    r = c ? r : 0.0f;
-    y = c ? yt : y;
-    i = c ? 0 : i;
-
-    c = absx < 0x1.769440p-1f;
-    yt = absx - (tc_f - 1.0f);
-    y = c ? yt : y;
-    i = c ? 1 : i;
-
-    c = absx < 0x1.da6610p-3f;
-    y = c ? absx : y;
-    i = c ? 2 : i;
-
-    float z, w, p1, p2, p3, p;
-    switch (i) {
-    case 0:
-      z = y * y;
-      p1 = __clc_mad(
-          z,
-          __clc_mad(
-              z,
-              __clc_mad(z, __clc_mad(z, __clc_mad(z, a10_f, a8_f), a6_f), a4_f),
-              a2_f),
-          a0_f);
-      p2 = z *
-           __clc_mad(
-               z,
-               __clc_mad(
-                   z,
-                   __clc_mad(z, __clc_mad(z, __clc_mad(z, a11_f, a9_f), a7_f),
-                             a5_f),
-                   a3_f),
-               a1_f);
-      p = __clc_mad(y, p1, p2);
-      r += __clc_mad(y, -0.5f, p);
-      break;
-    case 1:
-      z = y * y;
-      w = z * y;
-      p1 = __clc_mad(
-          w, __clc_mad(w, __clc_mad(w, __clc_mad(w, t12_f, t9_f), t6_f), t3_f),
-          t0_f);
-      p2 = __clc_mad(
-          w, __clc_mad(w, __clc_mad(w, __clc_mad(w, t13_f, t10_f), t7_f), t4_f),
-          t1_f);
-      p3 = __clc_mad(
-          w, __clc_mad(w, __clc_mad(w, __clc_mad(w, t14_f, t11_f), t8_f), t5_f),
-          t2_f);
-      p = __clc_mad(z, p1, -__clc_mad(w, -__clc_mad(y, p3, p2), tt_f));
-      r += tf_f + p;
-      break;
-    case 2:
-      p1 = y *
-           __clc_mad(
-               y,
-               __clc_mad(y,
-                         __clc_mad(y,
-                                   __clc_mad(y, __clc_mad(y, u5_f, u4_f), u3_f),
-                                   u2_f),
-                         u1_f),
-               u0_f);
-      p2 = __clc_mad(
-          y,
-          __clc_mad(
-              y,
-              __clc_mad(y, __clc_mad(y, __clc_mad(y, v5_f, v4_f), v3_f), v2_f),
-              v1_f),
-          1.0f);
-      r += __clc_mad(y, -0.5f, MATH_DIVIDE(p1, p2));
-      break;
-    }
-  } else if (absx < 8.0f) {
-    int i = (int)absx;
-    float y = absx - (float)i;
-    float p =
-        y *
-        __clc_mad(
-            y,
-            __clc_mad(
-                y,
-                __clc_mad(
-                    y,
-                    __clc_mad(y, __clc_mad(y, __clc_mad(y, s6_f, s5_f), s4_f),
-                              s3_f),
-                    s2_f),
-                s1_f),
-            s0_f);
-    float q = __clc_mad(
-        y,
-        __clc_mad(
-            y,
-            __clc_mad(y,
-                      __clc_mad(y, __clc_mad(y, __clc_mad(y, r6_f, r5_f), r4_f),
-                                r3_f),
-                      r2_f),
-            r1_f),
-        1.0f);
-    r = __clc_mad(y, 0.5f, MATH_DIVIDE(p, q));
-
-    float y6 = y + 6.0f;
-    float y5 = y + 5.0f;
-    float y4 = y + 4.0f;
-    float y3 = y + 3.0f;
-    float y2 = y + 2.0f;
-
-    float z = 1.0f;
-    z *= i > 6 ? y6 : 1.0f;
-    z *= i > 5 ? y5 : 1.0f;
-    z *= i > 4 ? y4 : 1.0f;
-    z *= i > 3 ? y3 : 1.0f;
-    z *= i > 2 ? y2 : 1.0f;
-
-    r += __clc_log(z);
-  } else if (absx < 0x1.0p+58f) {
-    float z = 1.0f / absx;
-    float y = z * z;
-    float w = __clc_mad(
-        z,
-        __clc_mad(
-            y,
-            __clc_mad(y,
-                      __clc_mad(y, __clc_mad(y, __clc_mad(y, w6_f, w5_f), w4_f),
-                                w3_f),
-                      w2_f),
-            w1_f),
-        w0_f);
-    r = __clc_mad(absx - 0.5f, __clc_log(absx) - 1.0f, w);
-  } else
-    // 2**58 <= x <= Inf
-    r = absx * (__clc_log(absx) - 1.0f);
-
-  int s = 1;
-
-  if (x < 0.0f) {
-    float t = __clc_sinpi(x);
-    r = __clc_log(pi_f / __clc_fabs(t * x)) - r;
-    r = t == 0.0f ? INFINITY : r;
-    s = t < 0.0f ? -1 : s;
-  }
-
-  *signp = s;
-  return r;
-}
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-// ====================================================
-// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
-//
-// Developed at SunPro, a Sun Microsystems, Inc. business.
-// Permission to use, copy, modify, and distribute this
-// software is freely granted, provided that this notice
-// is preserved.
-// ====================================================
-
-// lgamma_r(x, i)
-// Reentrant version of the logarithm of the Gamma function
-// with user provide pointer for the sign of Gamma(x).
-//
-// Method:
-//   1. Argument Reduction for 0 < x <= 8
-//      Since gamma(1+s)=s*gamma(s), for x in [0,8], we may
-//      reduce x to a number in [1.5,2.5] by
-//              lgamma(1+s) = log(s) + lgamma(s)
-//      for example,
-//              lgamma(7.3) = log(6.3) + lgamma(6.3)
-//                          = log(6.3*5.3) + lgamma(5.3)
-//                          = log(6.3*5.3*4.3*3.3*2.3) + lgamma(2.3)
-//   2. Polynomial approximation of lgamma around its
-//      minimun ymin=1.461632144968362245 to maintain monotonicity.
-//      On [ymin-0.23, ymin+0.27] (i.e., [1.23164,1.73163]), use
-//              Let z = x-ymin;
-//              lgamma(x) = -1.214862905358496078218 + z^2*poly(z)
-//      where
-//              poly(z) is a 14 degree polynomial.
-//   2. Rational approximation in the primary interval [2,3]
-//      We use the following approximation:
-//              s = x-2.0;
-//              lgamma(x) = 0.5*s + s*P(s)/Q(s)
-//      with accuracy
-//              |P/Q - (lgamma(x)-0.5s)| < 2**-61.71
-//      Our algorithms are based on the following observation
-//
-//                             zeta(2)-1    2    zeta(3)-1    3
-// lgamma(2+s) = s*(1-Euler) + --------- * s  -  --------- * s  + ...
-//                                 2                 3
-//
-//      where Euler = 0.5771... is the Euler constant, which is very
-//      close to 0.5.
-//
-//   3. For x>=8, we have
-//      lgamma(x)~(x-0.5)log(x)-x+0.5*log(2pi)+1/(12x)-1/(360x**3)+....
-//      (better formula:
-//         lgamma(x)~(x-0.5)*(log(x)-1)-.5*(log(2pi)-1) + ...)
-//      Let z = 1/x, then we approximation
-//              f(z) = lgamma(x) - (x-0.5)(log(x)-1)
-//      by
-//                                  3       5             11
-//              w = w0 + w1*z + w2*z  + w3*z  + ... + w6*z
-//      where
-//              |w - f(z)| < 2**-58.74
-//
-//   4. For negative x, since (G is gamma function)
-//              -x*G(-x)*G(x) = pi/sin(pi*x),
-//      we have
-//              G(x) = pi/(sin(pi*x)*(-x)*G(-x))
-//      since G(-x) is positive, sign(G(x)) = sign(sin(pi*x)) for x<0
-//      Hence, for x<0, signgam = sign(sin(pi*x)) and
-//              lgamma(x) = log(|Gamma(x)|)
-//                        = log(pi/(|x*sin(pi*x)|)) - lgamma(-x);
-//      Note: one should avoid compute pi*(-x) directly in the
-//            computation of sin(pi*(-x)).
-//
-//   5. Special Cases
-//              lgamma(2+s) ~ s*(1-Euler) for tiny s
-//              lgamma(1)=lgamma(2)=0
-//              lgamma(x) ~ -log(x) for tiny x
-//              lgamma(0) = lgamma(inf) = inf
-//              lgamma(-integer) = +-inf
-//
-#define pi 3.14159265358979311600e+00 /* 0x400921FB, 0x54442D18 */
-
-#define a0 7.72156649015328655494e-02  /* 0x3FB3C467, 0xE37DB0C8 */
-#define a1 3.22467033424113591611e-01  /* 0x3FD4A34C, 0xC4A60FAD */
-#define a2 6.73523010531292681824e-02  /* 0x3FB13E00, 0x1A5562A7 */
-#define a3 2.05808084325167332806e-02  /* 0x3F951322, 0xAC92547B */
-#define a4 7.38555086081402883957e-03  /* 0x3F7E404F, 0xB68FEFE8 */
-#define a5 2.89051383673415629091e-03  /* 0x3F67ADD8, 0xCCB7926B */
-#define a6 1.19270763183362067845e-03  /* 0x3F538A94, 0x116F3F5D */
-#define a7 5.10069792153511336608e-04  /* 0x3F40B6C6, 0x89B99C00 */
-#define a8 2.20862790713908385557e-04  /* 0x3F2CF2EC, 0xED10E54D */
-#define a9 1.08011567247583939954e-04  /* 0x3F1C5088, 0x987DFB07 */
-#define a10 2.52144565451257326939e-05 /* 0x3EFA7074, 0x428CFA52 */
-#define a11 4.48640949618915160150e-05 /* 0x3F07858E, 0x90A45837 */
-
-#define tc 1.46163214496836224576e+00  /* 0x3FF762D8, 0x6356BE3F */
-#define tf -1.21486290535849611461e-01 /* 0xBFBF19B9, 0xBCC38A42 */
-#define tt -3.63867699703950536541e-18 /* 0xBC50C7CA, 0xA48A971F */
-
-#define t0 4.83836122723810047042e-01   /* 0x3FDEF72B, 0xC8EE38A2 */
-#define t1 -1.47587722994593911752e-01  /* 0xBFC2E427, 0x8DC6C509 */
-#define t2 6.46249402391333854778e-02   /* 0x3FB08B42, 0x94D5419B */
-#define t3 -3.27885410759859649565e-02  /* 0xBFA0C9A8, 0xDF35B713 */
-#define t4 1.79706750811820387126e-02   /* 0x3F9266E7, 0x970AF9EC */
-#define t5 -1.03142241298341437450e-02  /* 0xBF851F9F, 0xBA91EC6A */
-#define t6 6.10053870246291332635e-03   /* 0x3F78FCE0, 0xE370E344 */
-#define t7 -3.68452016781138256760e-03  /* 0xBF6E2EFF, 0xB3E914D7 */
-#define t8 2.25964780900612472250e-03   /* 0x3F6282D3, 0x2E15C915 */
-#define t9 -1.40346469989232843813e-03  /* 0xBF56FE8E, 0xBF2D1AF1 */
-#define t10 8.81081882437654011382e-04  /* 0x3F4CDF0C, 0xEF61A8E9 */
-#define t11 -5.38595305356740546715e-04 /* 0xBF41A610, 0x9C73E0EC */
-#define t12 3.15632070903625950361e-04  /* 0x3F34AF6D, 0x6C0EBBF7 */
-#define t13 -3.12754168375120860518e-04 /* 0xBF347F24, 0xECC38C38 */
-#define t14 3.35529192635519073543e-04  /* 0x3F35FD3E, 0xE8C2D3F4 */
-
-#define u0 -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */
-#define u1 6.32827064025093366517e-01  /* 0x3FE4401E, 0x8B005DFF */
-#define u2 1.45492250137234768737e+00  /* 0x3FF7475C, 0xD119BD6F */
-#define u3 9.77717527963372745603e-01  /* 0x3FEF4976, 0x44EA8450 */
-#define u4 2.28963728064692451092e-01  /* 0x3FCD4EAE, 0xF6010924 */
-#define u5 1.33810918536787660377e-02  /* 0x3F8B678B, 0xBF2BAB09 */
-
-#define v1 2.45597793713041134822e+00 /* 0x4003A5D7, 0xC2BD619C */
-#define v2 2.12848976379893395361e+00 /* 0x40010725, 0xA42B18F5 */
-#define v3 7.69285150456672783825e-01 /* 0x3FE89DFB, 0xE45050AF */
-#define v4 1.04222645593369134254e-01 /* 0x3FBAAE55, 0xD6537C88 */
-#define v5 3.21709242282423911810e-03 /* 0x3F6A5ABB, 0x57D0CF61 */
-
-#define s0_d -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */
-#define s1_d 2.14982415960608852501e-01  /* 0x3FCB848B, 0x36E20878 */
-#define s2_d 3.25778796408930981787e-01  /* 0x3FD4D98F, 0x4F139F59 */
-#define s3_d 1.46350472652464452805e-01  /* 0x3FC2BB9C, 0xBEE5F2F7 */
-#define s4_d 2.66422703033638609560e-02  /* 0x3F9B481C, 0x7E939961 */
-#define s5_d 1.84028451407337715652e-03  /* 0x3F5E26B6, 0x7368F239 */
-#define s6_d 3.19475326584100867617e-05  /* 0x3F00BFEC, 0xDD17E945 */
-
-#define r1 1.39200533467621045958e+00 /* 0x3FF645A7, 0x62C4AB74 */
-#define r2 7.21935547567138069525e-01 /* 0x3FE71A18, 0x93D3DCDC */
-#define r3 1.71933865632803078993e-01 /* 0x3FC601ED, 0xCCFBDF27 */
-#define r4 1.86459191715652901344e-02 /* 0x3F9317EA, 0x742ED475 */
-#define r5 7.77942496381893596434e-04 /* 0x3F497DDA, 0xCA41A95B */
-#define r6 7.32668430744625636189e-06 /* 0x3EDEBAF7, 0xA5B38140 */
-
-#define w0 4.18938533204672725052e-01  /* 0x3FDACFE3, 0x90C97D69 */
-#define w1 8.33333333333329678849e-02  /* 0x3FB55555, 0x5555553B */
-#define w2 -2.77777777728775536470e-03 /* 0xBF66C16C, 0x16B02E5C */
-#define w3 7.93650558643019558500e-04  /* 0x3F4A019F, 0x98CF38B6 */
-#define w4 -5.95187557450339963135e-04 /* 0xBF4380CB, 0x8C0FE741 */
-#define w5 8.36339918996282139126e-04  /* 0x3F4B67BA, 0x4CDAD5D1 */
-#define w6 -1.63092934096575273989e-03 /* 0xBF5AB89D, 0x0B9E43E4 */
-
-_CLC_OVERLOAD _CLC_DEF double __clc_lgamma_r(double x, private int *ip) {
-  ulong ux = __clc_as_ulong(x);
-  double absx = __clc_fabs(x);
-  ulong ax = __clc_as_ulong(absx);
-
-  if (ax >= 0x7ff0000000000000UL) {
-    // +-Inf, NaN
-    *ip = 1;
-    return absx;
-  }
-
-  if (absx < 0x1.0p-70) {
-    *ip = ax == ux ? 1 : -1;
-    return -__clc_log(absx);
-  }
-
-  // Handle rest of range
-  double r;
-
-  if (absx < 2.0) {
-    int i = 0;
-    double y = 2.0 - absx;
-
-    int c = absx < 0x1.bb4c3p+0;
-    double t = absx - tc;
-    i = c ? 1 : i;
-    y = c ? t : y;
-
-    c = absx < 0x1.3b4c4p+0;
-    t = absx - 1.0;
-    i = c ? 2 : i;
-    y = c ? t : y;
-
-    c = absx <= 0x1.cccccp-1;
-    t = -__clc_log(absx);
-    r = c ? t : 0.0;
-    t = 1.0 - absx;
-    i = c ? 0 : i;
-    y = c ? t : y;
-
-    c = absx < 0x1.76944p-1;
-    t = absx - (tc - 1.0);
-    i = c ? 1 : i;
-    y = c ? t : y;
-
-    c = absx < 0x1.da661p-3;
-    i = c ? 2 : i;
-    y = c ? absx : y;
+#include "clc/math/clc_trunc.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_isnan.h"
 
-    double p, q;
-
-    switch (i) {
-    case 0:
-      p = __clc_fma(
-          y, __clc_fma(y, __clc_fma(y, __clc_fma(y, a11, a10), a9), a8), a7);
-      p = __clc_fma(y, __clc_fma(y, __clc_fma(y, __clc_fma(y, p, a6), a5), a4),
-                    a3);
-      p = __clc_fma(y, __clc_fma(y, __clc_fma(y, p, a2), a1), a0);
-      r = __clc_fma(y, p - 0.5, r);
-      break;
-    case 1:
-      p = __clc_fma(
-          y, __clc_fma(y, __clc_fma(y, __clc_fma(y, t14, t13), t12), t11), t10);
-      p = __clc_fma(
-          y,
-          __clc_fma(y, __clc_fma(y, __clc_fma(y, __clc_fma(y, p, t9), t8), t7),
-                    t6),
-          t5);
-      p = __clc_fma(
-          y,
-          __clc_fma(y, __clc_fma(y, __clc_fma(y, __clc_fma(y, p, t4), t3), t2),
-                    t1),
-          t0);
-      p = __clc_fma(y * y, p, -tt);
-      r += (tf + p);
-      break;
-    case 2:
-      p = y *
-          __clc_fma(
-              y,
-              __clc_fma(
-                  y, __clc_fma(y, __clc_fma(y, __clc_fma(y, u5, u4), u3), u2),
-                  u1),
-              u0);
-      q = __clc_fma(
-          y,
-          __clc_fma(y, __clc_fma(y, __clc_fma(y, __clc_fma(y, v5, v4), v3), v2),
-                    v1),
-          1.0);
-      r += __clc_fma(-0.5, y, p / q);
-    }
-  } else if (absx < 8.0) {
-    int i = absx;
-    double y = absx - (double)i;
-    double p =
-        y *
-        __clc_fma(
-            y,
-            __clc_fma(
-                y,
-                __clc_fma(
-                    y,
-                    __clc_fma(y, __clc_fma(y, __clc_fma(y, s6_d, s5_d), s4_d),
-                              s3_d),
-                    s2_d),
-                s1_d),
-            s0_d);
-    double q = __clc_fma(
-        y,
-        __clc_fma(
-            y,
-            __clc_fma(y,
-                      __clc_fma(y, __clc_fma(y, __clc_fma(y, r6, r5), r4), r3),
-                      r2),
-            r1),
-        1.0);
-    r = __clc_fma(0.5, y, p / q);
-    double z = 1.0;
-    // lgamma(1+s) = log(s) + lgamma(s)
-    double y6 = y + 6.0;
-    double y5 = y + 5.0;
-    double y4 = y + 4.0;
-    double y3 = y + 3.0;
-    double y2 = y + 2.0;
-    z *= i > 6 ? y6 : 1.0;
-    z *= i > 5 ? y5 : 1.0;
-    z *= i > 4 ? y4 : 1.0;
-    z *= i > 3 ? y3 : 1.0;
-    z *= i > 2 ? y2 : 1.0;
-    r += __clc_log(z);
-  } else {
-    double z = 1.0 / absx;
-    double z2 = z * z;
-    double w = __clc_fma(
-        z,
-        __clc_fma(
-            z2,
-            __clc_fma(
-                z2, __clc_fma(z2, __clc_fma(z2, __clc_fma(z2, w6, w5), w4), w3),
-                w2),
-            w1),
-        w0);
-    r = (absx - 0.5) * (__clc_log(absx) - 1.0) + w;
-  }
-
-  if (x < 0.0) {
-    double t = __clc_sinpi(x);
-    r = __clc_log(pi / __clc_fabs(t * x)) - r;
-    r = t == 0.0 ? INFINITY : r;
-    *ip = t < 0.0 ? -1 : 1;
-  } else
-    *ip = 1;
-
-  return r;
-}
-
-#endif
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-_CLC_OVERLOAD _CLC_DEF half __clc_lgamma_r(half x, private int *iptr) {
-  return (half)__clc_lgamma_r((float)x, iptr);
-}
-
-#endif
+#define __CLC_FUNCTION __clc_lgamma_r_stret
+#define __CLC_BODY "clc_lgamma_r_stret.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
 
 #define __CLC_FUNCTION __clc_lgamma_r
-#define __CLC_ARG2_TYPE int
-#define __CLC_ADDRSPACE private
-#define __CLC_BODY "clc/shared/unary_def_with_ptr_scalarize.inc"
+#define __CLC_BODY "clc_lgamma_r.inc"
+#include "clc/math/gentype.inc"
+
+#define __CLC_OUT_ARG2_SCALAR_TYPE int
+#define __CLC_ADDRSPACE __private
+#define __CLC_BODY "clc/shared/unary_with_out_arg_scalarize_loop.inc"
 #include "clc/math/gentype.inc"
 #undef __CLC_ADDRSPACE
-#undef __CLC_ARG2_TYPE
-#undef __CLC_FUNCTION
 
-#define __CLC_ADDRSPACE global
-#define __CLC_BODY "clc_lgamma_r.inc"
+#define __CLC_OUT_ARG2_SCALAR_TYPE int
+#define __CLC_ADDRSPACE __global
+#define __CLC_BODY "clc/shared/unary_with_out_arg_scalarize_loop.inc"
 #include "clc/math/gentype.inc"
 #undef __CLC_ADDRSPACE
 
-#define __CLC_ADDRSPACE local
-#define __CLC_BODY "clc_lgamma_r.inc"
+#define __CLC_OUT_ARG2_SCALAR_TYPE int
+#define __CLC_ADDRSPACE __local
+#define __CLC_BODY "clc/shared/unary_with_out_arg_scalarize_loop.inc"
 #include "clc/math/gentype.inc"
 #undef __CLC_ADDRSPACE
 
 #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-#define __CLC_ADDRSPACE generic
-#define __CLC_BODY "clc_lgamma_r.inc"
+#define __CLC_OUT_ARG2_SCALAR_TYPE int
+#define __CLC_ADDRSPACE __generic
+#define __CLC_BODY "clc/shared/unary_with_out_arg_scalarize_loop.inc"
 #include "clc/math/gentype.inc"
 #undef __CLC_ADDRSPACE
 #endif

diff  --git a/libclc/clc/lib/generic/math/clc_lgamma_r.inc b/libclc/clc/lib/generic/math/clc_lgamma_r.inc
index 87891efd44755..5efa4a261206e 100644
--- a/libclc/clc/lib/generic/math/clc_lgamma_r.inc
+++ b/libclc/clc/lib/generic/math/clc_lgamma_r.inc
@@ -6,10 +6,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
-__clc_lgamma_r(__CLC_GENTYPE x, __CLC_ADDRSPACE __CLC_INTN *iptr) {
-  __CLC_INTN private_iptr;
-  __CLC_GENTYPE ret = __clc_lgamma_r(x, &private_iptr);
-  *iptr = private_iptr;
-  return ret;
-}
+#ifdef __CLC_SCALAR
+#define __CLC_LGAMMA_R_DEF(addrspace)                                          \
+  _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_lgamma_r(                         \
+      __CLC_GENTYPE x, addrspace __CLC_INTN *signp) {                          \
+    __CLC_LGAMMA_R_RET_GENTYPE result = __clc_lgamma_r_stret(x);               \
+    *signp = result.sign;                                                      \
+    return result.result;                                                      \
+  }
+
+__CLC_LGAMMA_R_DEF(private)
+__CLC_LGAMMA_R_DEF(local)
+__CLC_LGAMMA_R_DEF(global)
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+__CLC_LGAMMA_R_DEF(generic)
+#endif
+#endif // __CLC_SCALAR

diff  --git a/libclc/clc/lib/generic/math/clc_lgamma_r_stret.inc b/libclc/clc/lib/generic/math/clc_lgamma_r_stret.inc
new file mode 100644
index 0000000000000..26b6f3bc77081
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_lgamma_r_stret.inc
@@ -0,0 +1,628 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This lgamma routine began with Sun's lgamma code from netlib.
+// Their original copyright notice follows.
+
+/* @(#)e_lgamma_r.c 1.3 95/01/18 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ *
+ */
+
+/* __ieee754_lgamma_r(x, signgamp)
+ * Reentrant version of the logarithm of the Gamma function
+ * with a user-provided pointer for the sign of Gamma(x).
+ *
+ * Method:
+ *   1. Argument Reduction for 0 < x <= 8
+ *      Since gamma(1+s)=s*gamma(s), for x in [0,8], we may
+ *      reduce x to a number in [1.5,2.5] by
+ *              lgamma(1+s) = log(s) + lgamma(s)
+ *      for example,
+ *              lgamma(7.3) = log(6.3) + lgamma(6.3)
+ *                          = log(6.3*5.3) + lgamma(5.3)
+ *                          = log(6.3*5.3*4.3*3.3*2.3) + lgamma(2.3)
+ *   2. Polynomial approximation of lgamma around its
+ *      minimum ymin=1.461632144968362245 to maintain monotonicity.
+ *      On [ymin-0.23, ymin+0.27] (i.e., [1.23164,1.73163]), use
+ *              Let z = x-ymin;
+ *              lgamma(x) = -0.1214862905358496078218 + z^2*poly(z)
+ *      where
+ *              poly(z) is a 14 degree polynomial.
+ *   2. Rational approximation in the primary interval [2,3]
+ *      We use the following approximation:
+ *              s = x-2.0;
+ *              lgamma(x) = 0.5*s + s*P(s)/Q(s)
+ *      with accuracy
+ *              |P/Q - (lgamma(x)-0.5s)| < 2**-61.71
+ *      Our algorithms are based on the following observation
+ *
+ *                             zeta(2)-1    2    zeta(3)-1    3
+ * lgamma(2+s) = s*(1-Euler) + --------- * s  -  --------- * s  + ...
+ *                                 2                 3
+ *
+ *      where Euler = 0.5771... is the Euler constant, which is very
+ *      close to 0.5.
+ *
+ *   3. For x>=8, we have
+ *      lgamma(x)~(x-0.5)log(x)-x+0.5*log(2pi)+1/(12x)-1/(360x**3)+....
+ *      (better formula:
+ *         lgamma(x)~(x-0.5)*(log(x)-1)-.5*(log(2pi)-1) + ...)
+ *      Let z = 1/x, then we approximate
+ *              f(z) = lgamma(x) - (x-0.5)(log(x)-1)
+ *      by
+ *                                  3       5             11
+ *              w = w0 + w1*z + w2*z  + w3*z  + ... + w6*z
+ *      where
+ *              |w - f(z)| < 2**-58.74
+ *
+ *   4. For negative x, since (G is gamma function)
+ *              -x*G(-x)*G(x) = pi/sin(pi*x),
+ *      we have
+ *              G(x) = pi/(sin(pi*x)*(-x)*G(-x))
+ *      since G(-x) is positive, sign(G(x)) = sign(sin(pi*x)) for x<0
+ *      Hence, for x<0, signgam = sign(sin(pi*x)) and
+ *              lgamma(x) = log(|Gamma(x)|)
+ *                        = log(pi/(|x*sin(pi*x)|)) - lgamma(-x);
+ *      Note: one should avoid compute pi*(-x) directly in the
+ *            computation of sin(pi*(-x)).
+ *
+ *   5. Special Cases
+ *              lgamma(2+s) ~ s*(1-Euler) for tiny s
+ *              lgamma(1)=lgamma(2)=0
+ *              lgamma(x) ~ -log(x) for tiny x
+ *              lgamma(0) = lgamma(inf) = inf
+ *              lgamma(-integer) = +-inf
+ *
+ */
+
+#ifdef __CLC_SCALAR
+
+#if __CLC_FPSIZE == 32
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_LGAMMA_R_RET_GENTYPE
+__clc_lgamma_r_stret(__CLC_FLOATN x) { // float: returns {result, sign} by value
+  const __CLC_FLOATN pi = 3.14159265358979311600e+00f;
+  const __CLC_FLOATN a0 = 7.72156649015328655494e-02f; // a*: switch case 0
+  const __CLC_FLOATN a1 = 3.22467033424113591611e-01f;
+  const __CLC_FLOATN a2 = 6.73523010531292681824e-02f;
+  const __CLC_FLOATN a3 = 2.05808084325167332806e-02f;
+  const __CLC_FLOATN a4 = 7.38555086081402883957e-03f;
+  const __CLC_FLOATN a5 = 2.89051383673415629091e-03f;
+  const __CLC_FLOATN a6 = 1.19270763183362067845e-03f;
+  const __CLC_FLOATN a7 = 5.10069792153511336608e-04f;
+  const __CLC_FLOATN a8 = 2.20862790713908385557e-04f;
+  const __CLC_FLOATN a9 = 1.08011567247583939954e-04f;
+  const __CLC_FLOATN a10 = 2.52144565451257326939e-05f;
+  const __CLC_FLOATN a11 = 4.48640949618915160150e-05f;
+  const __CLC_FLOATN tc = 1.46163214496836224576e+00f; // tc/tf/tt/t*: case 1
+  const __CLC_FLOATN tf = -1.21486290535849611461e-01f;
+  const __CLC_FLOATN tt = -3.63867699703950536541e-18f;
+  const __CLC_FLOATN t0 = 4.83836122723810047042e-01f;
+  const __CLC_FLOATN t1 = -1.47587722994593911752e-01f;
+  const __CLC_FLOATN t2 = 6.46249402391333854778e-02f;
+  const __CLC_FLOATN t3 = -3.27885410759859649565e-02f;
+  const __CLC_FLOATN t4 = 1.79706750811820387126e-02f;
+  const __CLC_FLOATN t5 = -1.03142241298341437450e-02f;
+  const __CLC_FLOATN t6 = 6.10053870246291332635e-03f;
+  const __CLC_FLOATN t7 = -3.68452016781138256760e-03f;
+  const __CLC_FLOATN t8 = 2.25964780900612472250e-03f;
+  const __CLC_FLOATN t9 = -1.40346469989232843813e-03f;
+  const __CLC_FLOATN t10 = 8.81081882437654011382e-04f;
+  const __CLC_FLOATN t11 = -5.38595305356740546715e-04f;
+  const __CLC_FLOATN t12 = 3.15632070903625950361e-04f;
+  const __CLC_FLOATN t13 = -3.12754168375120860518e-04f;
+  const __CLC_FLOATN t14 = 3.35529192635519073543e-04f;
+  const __CLC_FLOATN u0 = -7.72156649015328655494e-02f; // u*/v*: case 2
+  const __CLC_FLOATN u1 = 6.32827064025093366517e-01f;
+  const __CLC_FLOATN u2 = 1.45492250137234768737e+00f;
+  const __CLC_FLOATN u3 = 9.77717527963372745603e-01f;
+  const __CLC_FLOATN u4 = 2.28963728064692451092e-01f;
+  const __CLC_FLOATN u5 = 1.33810918536787660377e-02f;
+  const __CLC_FLOATN v1 = 2.45597793713041134822e+00f;
+  const __CLC_FLOATN v2 = 2.12848976379893395361e+00f;
+  const __CLC_FLOATN v3 = 7.69285150456672783825e-01f;
+  const __CLC_FLOATN v4 = 1.04222645593369134254e-01f;
+  const __CLC_FLOATN v5 = 3.21709242282423911810e-03f;
+  const __CLC_FLOATN s0 = -7.72156649015328655494e-02f; // s*/r*: 2 <= ax < 8
+  const __CLC_FLOATN s1 = 2.14982415960608852501e-01f;
+  const __CLC_FLOATN s2 = 3.25778796408930981787e-01f;
+  const __CLC_FLOATN s3 = 1.46350472652464452805e-01f;
+  const __CLC_FLOATN s4 = 2.66422703033638609560e-02f;
+  const __CLC_FLOATN s5 = 1.84028451407337715652e-03f;
+  const __CLC_FLOATN s6 = 3.19475326584100867617e-05f;
+  const __CLC_FLOATN r1 = 1.39200533467621045958e+00f;
+  const __CLC_FLOATN r2 = 7.21935547567138069525e-01f;
+  const __CLC_FLOATN r3 = 1.71933865632803078993e-01f;
+  const __CLC_FLOATN r4 = 1.86459191715652901344e-02f;
+  const __CLC_FLOATN r5 = 7.77942496381893596434e-04f;
+  const __CLC_FLOATN r6 = 7.32668430744625636189e-06f;
+  const __CLC_FLOATN w0 = 4.18938533204672725052e-01f; // w*: 8 <= ax < 2^58
+  const __CLC_FLOATN w1 = 8.33333333333329678849e-02f;
+  const __CLC_FLOATN w2 = -2.77777777728775536470e-03f;
+  const __CLC_FLOATN w3 = 7.93650558643019558500e-04f;
+  const __CLC_FLOATN w4 = -5.95187557450339963135e-04f;
+  const __CLC_FLOATN w5 = 8.36339918996282139126e-04f;
+  const __CLC_FLOATN w6 = -1.63092934096575273989e-03f;
+  const __CLC_FLOATN z1 = -0x1.2788d0p-1f; // z*: ax < 2^-6
+  const __CLC_FLOATN z2 = 0x1.a51a66p-1f;
+  const __CLC_FLOATN z3 = -0x1.9a4d56p-2f;
+  const __CLC_FLOATN z4 = 0x1.151322p-2f;
+
+  __CLC_FLOATN ax = __clc_fabs(x);
+  __CLC_FLOATN ret;
+  // Evaluate on |x| piecewise; negative x is adjusted after the branches.
+  if (ax < 0x1.0p-6f) { // tiny |x|: ax*poly(ax) - log(ax)
+    ret = __clc_mad(ax,
+                    __clc_mad(ax, __clc_mad(ax, __clc_mad(ax, z4, z3), z2), z1),
+                    -__clc_log(ax));
+  } else if (ax < 2.0f) {
+    __CLC_INTN i; // which of the three approximations (switch below) to use
+    bool c;
+    __CLC_FLOATN y, t; // y: reduced argument fed to the chosen approximation
+    if (ax <= 0.9f) { // lgamma(x) = lgamma(x+1)-log(x)
+      ret = -__clc_log(ax);
+      y = 1.0f - ax;
+      i = 0;
+
+      c = ax < 0.7316f;
+      t = ax - (tc - 1.0f);
+      y = c ? t : y;
+      i = c ? 1 : i;
+
+      c = ax < 0.23164f;
+      y = c ? ax : y;
+      i = c ? 2 : i;
+    } else {
+      ret = 0.0f;
+      y = 2.0f - ax;
+      i = 0;
+
+      c = ax < 1.7316f;
+      t = ax - tc;
+      y = c ? t : y;
+      i = c ? 1 : i; // keep the previous index when the test fails
+
+      c = ax < 1.23f;
+      t = ax - 1.0f;
+      y = c ? t : y;
+      i = c ? 2 : i;
+    }
+
+    __CLC_FLOATN z, w, p1, p2, p3, p;
+    switch (i) { // i is always 0, 1 or 2 here
+    case 0: {
+      z = y * y;
+      // Even-indexed a* coefficients, Horner form in z = y^2.
+      __CLC_FLOATN z2 = __clc_mad(z, a10, a8);
+      __CLC_FLOATN z3 = __clc_mad(z, z2, a6);
+      __CLC_FLOATN z4 = __clc_mad(z, z3, a4);
+      __CLC_FLOATN z5 = __clc_mad(z, z4, a2);
+      p1 = __clc_mad(z, z5, a0);
+      // Odd-indexed a* coefficients.
+      __CLC_FLOATN z2b = __clc_mad(z, a11, a9);
+      __CLC_FLOATN z3b = __clc_mad(z, z2b, a7);
+      __CLC_FLOATN z4b = __clc_mad(z, z3b, a5);
+      __CLC_FLOATN z5b = __clc_mad(z, z4b, a3);
+      p2 = z * __clc_mad(z, z5b, a1);
+
+      p = __clc_mad(y, p1, p2);
+      ret += __clc_mad(y, -0.5f, p);
+      break;
+    }
+    case 1: {
+      z = y * y;
+      w = z * y;
+      // Three interleaved Horner chains in w = y^3 over t0..t14.
+      __CLC_FLOATN w2 = __clc_mad(w, t12, t9);
+      __CLC_FLOATN w3 = __clc_mad(w, w2, t6);
+      __CLC_FLOATN w4 = __clc_mad(w, w3, t3);
+      p1 = __clc_mad(w, w4, t0);
+
+      __CLC_FLOATN w2b = __clc_mad(w, t13, t10);
+      __CLC_FLOATN w3b = __clc_mad(w, w2b, t7);
+      __CLC_FLOATN w4b = __clc_mad(w, w3b, t4);
+      p2 = __clc_mad(w, w4b, t1);
+
+      __CLC_FLOATN w2c = __clc_mad(w, t14, t11);
+      __CLC_FLOATN w3c = __clc_mad(w, w2c, t8);
+      __CLC_FLOATN w4c = __clc_mad(w, w3c, t5);
+      p3 = __clc_mad(w, w4c, t2);
+
+      __CLC_FLOATN negPart = -__clc_mad(w, -__clc_mad(y, p3, p2), tt);
+      p = __clc_mad(z, p1, negPart);
+
+      ret += tf + p;
+      break;
+    }
+    case 2: {
+      __CLC_FLOATN y2 = __clc_mad(y, u5, u4); // numerator: y * U(y)
+      __CLC_FLOATN y3 = __clc_mad(y, y2, u3);
+      __CLC_FLOATN y4 = __clc_mad(y, y3, u2);
+      __CLC_FLOATN y5 = __clc_mad(y, y4, u1);
+      p1 = y * __clc_mad(y, y5, u0);
+
+      __CLC_FLOATN y2b = __clc_mad(y, v5, v4); // denominator: 1 + y * V(y)
+      __CLC_FLOATN y3b = __clc_mad(y, y2b, v3);
+      __CLC_FLOATN y4b = __clc_mad(y, y3b, v2);
+      __CLC_FLOATN y5b = __clc_mad(y, y4b, v1);
+      p2 = __clc_mad(y, y5b, 1.0f);
+
+      ret += __clc_mad(y, -0.5f, __clc_div_fast(p1, p2));
+      break;
+    }
+    }
+  } else if (ax < 8.0f) { // 2 <= |x| < 8
+    __CLC_INTN i = __CLC_CONVERT_INTN(ax); // integer part of ax
+    __CLC_FLOATN y = ax - __CLC_CONVERT_FLOATN(i); // fractional part of ax
+
+    __CLC_FLOATN p1 = __clc_mad(y, s6, s5);
+    __CLC_FLOATN p2 = __clc_mad(y, p1, s4);
+    __CLC_FLOATN p3 = __clc_mad(y, p2, s3);
+    __CLC_FLOATN p4 = __clc_mad(y, p3, s2);
+    __CLC_FLOATN p5 = __clc_mad(y, p4, s1);
+    __CLC_FLOATN p = y * __clc_mad(y, p5, s0);
+
+    __CLC_FLOATN q1 = __clc_mad(y, r6, r5);
+    __CLC_FLOATN q2 = __clc_mad(y, q1, r4);
+    __CLC_FLOATN q3 = __clc_mad(y, q2, r3);
+    __CLC_FLOATN q4 = __clc_mad(y, q3, r2);
+    __CLC_FLOATN q5 = __clc_mad(y, q4, r1);
+    __CLC_FLOATN q = __clc_mad(y, q5, 1.0f);
+
+    ret = __clc_mad(y, 0.5f, __clc_div_fast(p, q)); // 0.5*y + p/q
+
+    __CLC_FLOATN y2 = y + 2.0f;
+    __CLC_FLOATN y3 = y + 3.0f;
+    __CLC_FLOATN y4 = y + 4.0f;
+    __CLC_FLOATN y5 = y + 5.0f;
+    __CLC_FLOATN y6 = y + 6.0f;
+    // z accumulates (y+2)*...*(y+i-1), the factors removed by the reduction.
+    __CLC_FLOATN z = 1.0f;
+    z *= i > 2 ? y2 : 1.0f;
+    z *= i > 3 ? y3 : 1.0f;
+    z *= i > 4 ? y4 : 1.0f;
+    z *= i > 5 ? y5 : 1.0f;
+    z *= i > 6 ? y6 : 1.0f;
+
+    ret += __clc_log(z);
+  } else if (ax < 0x1.0p+58f) { // 8 <= |x| < 2^58
+    __CLC_FLOATN z = __clc_recip_fast(ax);
+    __CLC_FLOATN y = z * z;
+
+    __CLC_FLOATN t1 = __clc_mad(y, w6, w5);
+    __CLC_FLOATN t2 = __clc_mad(y, t1, w4);
+    __CLC_FLOATN t3 = __clc_mad(y, t2, w3);
+    __CLC_FLOATN t4 = __clc_mad(y, t3, w2);
+    __CLC_FLOATN t5 = __clc_mad(y, t4, w1);
+    __CLC_FLOATN w = __clc_mad(z, t5, w0);
+
+    ret = __clc_mad(ax - 0.5f, __clc_log(ax) - 1.0f, w);
+  } else {
+    // 2^58 <= |x| <= Inf
+    ret = __clc_mad(ax, __clc_log(ax), -ax); // ax*log(ax) - ax
+  }
+  // Sign of Gamma(x), plus the reflection adjustment for negative x.
+  __CLC_INTN s = 0;
+  if (x >= 0.0f) {
+    ret = ((x == 1.0f) | (x == 2.0f)) ? 0.0f : ret; // force exact zeros
+    s = x == 0.0f ? 0 : 1;
+  } else if (ax < 0x1.0p+23f) { // x > -0x1.0p+23
+    if (ax > 0x1.0p-21f) {
+      __CLC_FLOATN t = __clc_sinpi(x);
+      __CLC_FLOATN negadj = __clc_log(pi / __clc_fabs(t * x));
+      ret = negadj - ret; // log(pi/|x*sin(pi*x)|) - lgamma(-x)
+
+      bool z = __clc_trunc(x) == x; // x is a negative integer
+      ret = z ? __CLC_GENTYPE_INF : ret;
+      s = t < 0.0f ? -1 : 1;
+      s = z ? 0 : s;
+    } else {
+      s = -1; // tiny negative x: ret is left as computed from ax
+    }
+  }
+  // Zero, infinite and large-magnitude negative x yield __CLC_GENTYPE_INF.
+  ret = ((ax != 0.0f) && !__clc_isinf(ax) && ((x >= 0.0f) || (ax < 0x1.0p+23f)))
+            ? ret
+            : __CLC_GENTYPE_INF;
+  // NaN input is returned unchanged.
+  ret = __clc_isnan(x) ? x : ret;
+
+  __CLC_LGAMMA_R_RET_GENTYPE result;
+  result.result = ret;
+  result.sign = s;
+
+  return result;
+}
+
+#elif __CLC_FPSIZE == 64
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_LGAMMA_R_RET_GENTYPE
+__clc_lgamma_r_stret(__CLC_DOUBLEN x) { // double: returns {result, sign}
+  const __CLC_DOUBLEN pi = 3.14159265358979311600e+00;
+  const __CLC_DOUBLEN a0 = 7.72156649015328655494e-02; // a*: switch case 0
+  const __CLC_DOUBLEN a1 = 3.22467033424113591611e-01;
+  const __CLC_DOUBLEN a2 = 6.73523010531292681824e-02;
+  const __CLC_DOUBLEN a3 = 2.05808084325167332806e-02;
+  const __CLC_DOUBLEN a4 = 7.38555086081402883957e-03;
+  const __CLC_DOUBLEN a5 = 2.89051383673415629091e-03;
+  const __CLC_DOUBLEN a6 = 1.19270763183362067845e-03;
+  const __CLC_DOUBLEN a7 = 5.10069792153511336608e-04;
+  const __CLC_DOUBLEN a8 = 2.20862790713908385557e-04;
+  const __CLC_DOUBLEN a9 = 1.08011567247583939954e-04;
+  const __CLC_DOUBLEN a10 = 2.52144565451257326939e-05;
+  const __CLC_DOUBLEN a11 = 4.48640949618915160150e-05;
+  const __CLC_DOUBLEN tc = 1.46163214496836224576e+00; // tc/tf/tt/t*: case 1
+  const __CLC_DOUBLEN tf = -1.21486290535849611461e-01;
+  const __CLC_DOUBLEN tt = -3.63867699703950536541e-18;
+  const __CLC_DOUBLEN t0 = 4.83836122723810047042e-01;
+  const __CLC_DOUBLEN t1 = -1.47587722994593911752e-01;
+  const __CLC_DOUBLEN t2 = 6.46249402391333854778e-02;
+  const __CLC_DOUBLEN t3 = -3.27885410759859649565e-02;
+  const __CLC_DOUBLEN t4 = 1.79706750811820387126e-02;
+  const __CLC_DOUBLEN t5 = -1.03142241298341437450e-02;
+  const __CLC_DOUBLEN t6 = 6.10053870246291332635e-03;
+  const __CLC_DOUBLEN t7 = -3.68452016781138256760e-03;
+  const __CLC_DOUBLEN t8 = 2.25964780900612472250e-03;
+  const __CLC_DOUBLEN t9 = -1.40346469989232843813e-03;
+  const __CLC_DOUBLEN t10 = 8.81081882437654011382e-04;
+  const __CLC_DOUBLEN t11 = -5.38595305356740546715e-04;
+  const __CLC_DOUBLEN t12 = 3.15632070903625950361e-04;
+  const __CLC_DOUBLEN t13 = -3.12754168375120860518e-04;
+  const __CLC_DOUBLEN t14 = 3.35529192635519073543e-04;
+  const __CLC_DOUBLEN u0 = -7.72156649015328655494e-02; // u*/v*: case 2
+  const __CLC_DOUBLEN u1 = 6.32827064025093366517e-01;
+  const __CLC_DOUBLEN u2 = 1.45492250137234768737e+00;
+  const __CLC_DOUBLEN u3 = 9.77717527963372745603e-01;
+  const __CLC_DOUBLEN u4 = 2.28963728064692451092e-01;
+  const __CLC_DOUBLEN u5 = 1.33810918536787660377e-02;
+  const __CLC_DOUBLEN v1 = 2.45597793713041134822e+00;
+  const __CLC_DOUBLEN v2 = 2.12848976379893395361e+00;
+  const __CLC_DOUBLEN v3 = 7.69285150456672783825e-01;
+  const __CLC_DOUBLEN v4 = 1.04222645593369134254e-01;
+  const __CLC_DOUBLEN v5 = 3.21709242282423911810e-03;
+  const __CLC_DOUBLEN s0 = -7.72156649015328655494e-02; // s*/r*: 2 <= ax < 8
+  const __CLC_DOUBLEN s1 = 2.14982415960608852501e-01;
+  const __CLC_DOUBLEN s2 = 3.25778796408930981787e-01;
+  const __CLC_DOUBLEN s3 = 1.46350472652464452805e-01;
+  const __CLC_DOUBLEN s4 = 2.66422703033638609560e-02;
+  const __CLC_DOUBLEN s5 = 1.84028451407337715652e-03;
+  const __CLC_DOUBLEN s6 = 3.19475326584100867617e-05;
+  const __CLC_DOUBLEN r1 = 1.39200533467621045958e+00;
+  const __CLC_DOUBLEN r2 = 7.21935547567138069525e-01;
+  const __CLC_DOUBLEN r3 = 1.71933865632803078993e-01;
+  const __CLC_DOUBLEN r4 = 1.86459191715652901344e-02;
+  const __CLC_DOUBLEN r5 = 7.77942496381893596434e-04;
+  const __CLC_DOUBLEN r6 = 7.32668430744625636189e-06;
+  const __CLC_DOUBLEN w0 = 4.18938533204672725052e-01; // w*: 8 <= ax < 2^58
+  const __CLC_DOUBLEN w1 = 8.33333333333329678849e-02;
+  const __CLC_DOUBLEN w2 = -2.77777777728775536470e-03;
+  const __CLC_DOUBLEN w3 = 7.93650558643019558500e-04;
+  const __CLC_DOUBLEN w4 = -5.95187557450339963135e-04;
+  const __CLC_DOUBLEN w5 = 8.36339918996282139126e-04;
+  const __CLC_DOUBLEN w6 = -1.63092934096575273989e-03;
+  const __CLC_DOUBLEN z1 = -0x1.2788cfc6fb619p-1; // z*: ax < 2^-8
+  const __CLC_DOUBLEN z2 = 0x1.a51a6625307d3p-1;
+  const __CLC_DOUBLEN z3 = -0x1.9a4d55beab2d7p-2;
+  const __CLC_DOUBLEN z4 = 0x1.151322ac7d848p-2;
+  const __CLC_DOUBLEN z5 = -0x1.a8b9c17aa6149p-3;
+
+  __CLC_DOUBLEN ax = __clc_fabs(x);
+  __CLC_DOUBLEN ret;
+  // Evaluate on |x| piecewise; negative x is adjusted after the branches.
+  if (ax < 0x1.0p-8) { // tiny |x|: ax*poly(ax) - log(ax)
+    __CLC_DOUBLEN t1 = __clc_mad(ax, z5, z4);
+    __CLC_DOUBLEN t2 = __clc_mad(ax, t1, z3);
+    __CLC_DOUBLEN t3 = __clc_mad(ax, t2, z2);
+    __CLC_DOUBLEN t4 = __clc_mad(ax, t3, z1);
+    ret = __clc_mad(ax, t4, -__clc_log(ax));
+  } else if (ax < 2.0) {
+    __CLC_INTN i; // which of the three approximations (switch below) to use
+    bool c;
+    __CLC_DOUBLEN y, t; // y: reduced argument fed to the chosen approximation
+    if (ax <= 0x1.cccccp-1) { // |x| <= ~0.9 : lgamma(x) = lgamma(x+1)-log(x)
+      ret = -__clc_log(ax);
+
+      y = 1.0 - ax;
+      i = 0;
+
+      c = ax < 0x1.76944p-1; // x < 0.7316
+      t = ax - (tc - 1.0);
+      y = c ? t : y;
+      i = c ? 1 : i;
+
+      c = ax < 0x1.da661p-3; // x < .2316
+      y = c ? ax : y;
+      i = c ? 2 : i;
+    } else {
+      ret = 0.0;
+
+      y = 2.0 - ax;
+      i = 0;
+
+      c = ax < 0x1.bb4c3p+0; // x < 1.7316
+      t = ax - tc;
+      y = c ? t : y;
+      i = c ? 1 : i;
+
+      c = ax < 0x1.3b4c4p+0; // x < 1.2316
+      t = ax - 1.0;
+      y = c ? t : y;
+      i = c ? 2 : i;
+    }
+
+    __CLC_DOUBLEN w, z, p, p1, p2, p3;
+    switch (i) { // i is always 0, 1 or 2 here
+    case 0: {
+      z = y * y;
+      // Even-indexed a* coefficients, Horner form in z = y^2.
+      __CLC_DOUBLEN z2 = __clc_mad(z, a10, a8);
+      __CLC_DOUBLEN z3 = __clc_mad(z, z2, a6);
+      __CLC_DOUBLEN z4 = __clc_mad(z, z3, a4);
+      __CLC_DOUBLEN z5 = __clc_mad(z, z4, a2);
+      p1 = __clc_mad(z, z5, a0);
+      // Odd-indexed a* coefficients.
+      __CLC_DOUBLEN z2b = __clc_mad(z, a11, a9);
+      __CLC_DOUBLEN z3b = __clc_mad(z, z2b, a7);
+      __CLC_DOUBLEN z4b = __clc_mad(z, z3b, a5);
+      __CLC_DOUBLEN z5b = __clc_mad(z, z4b, a3);
+      p2 = z * __clc_mad(z, z5b, a1);
+
+      p = __clc_mad(y, p1, p2);
+      ret += __clc_mad(y, -0.5, p);
+      break;
+    }
+    case 1: {
+      z = y * y;
+      w = z * y;
+      // Three interleaved Horner chains in w = y^3 over t0..t14.
+      __CLC_DOUBLEN w2 = __clc_mad(w, t12, t9);
+      __CLC_DOUBLEN w3 = __clc_mad(w, w2, t6);
+      __CLC_DOUBLEN w4 = __clc_mad(w, w3, t3);
+      p1 = __clc_mad(w, w4, t0);
+
+      __CLC_DOUBLEN w2b = __clc_mad(w, t13, t10);
+      __CLC_DOUBLEN w3b = __clc_mad(w, w2b, t7);
+      __CLC_DOUBLEN w4b = __clc_mad(w, w3b, t4);
+      p2 = __clc_mad(w, w4b, t1);
+
+      __CLC_DOUBLEN w2c = __clc_mad(w, t14, t11);
+      __CLC_DOUBLEN w3c = __clc_mad(w, w2c, t8);
+      __CLC_DOUBLEN w4c = __clc_mad(w, w3c, t5);
+      p3 = __clc_mad(w, w4c, t2);
+
+      __CLC_DOUBLEN inner = __clc_mad(y, p3, p2);
+      __CLC_DOUBLEN negPart = -__clc_mad(w, -inner, tt);
+      p = __clc_mad(z, p1, negPart);
+
+      ret += tf + p;
+      break;
+    }
+    case 2: {
+      __CLC_DOUBLEN y2 = __clc_mad(y, u5, u4); // numerator: y * U(y)
+      __CLC_DOUBLEN y3 = __clc_mad(y, y2, u3);
+      __CLC_DOUBLEN y4 = __clc_mad(y, y3, u2);
+      __CLC_DOUBLEN y5 = __clc_mad(y, y4, u1);
+      p1 = y * __clc_mad(y, y5, u0);
+
+      __CLC_DOUBLEN y2b = __clc_mad(y, v5, v4); // denominator: 1 + y * V(y)
+      __CLC_DOUBLEN y3b = __clc_mad(y, y2b, v3);
+      __CLC_DOUBLEN y4b = __clc_mad(y, y3b, v2);
+      __CLC_DOUBLEN y5b = __clc_mad(y, y4b, v1);
+      p2 = __clc_mad(y, y5b, 1.0);
+
+      ret += __clc_mad(y, -0.5, p1 / p2);
+      break;
+    }
+    }
+  } else if (ax < 8.0) { // 2 <= ax < 8
+    __CLC_INTN i = __CLC_CONVERT_INTN(ax); // integer part of ax
+    __CLC_DOUBLEN y = ax - __CLC_CONVERT_DOUBLEN(i); // fractional part of ax
+
+    __CLC_DOUBLEN p1 = __clc_mad(y, s6, s5);
+    __CLC_DOUBLEN p2 = __clc_mad(y, p1, s4);
+    __CLC_DOUBLEN p3 = __clc_mad(y, p2, s3);
+    __CLC_DOUBLEN p4 = __clc_mad(y, p3, s2);
+    __CLC_DOUBLEN p5 = __clc_mad(y, p4, s1);
+    __CLC_DOUBLEN p = y * __clc_mad(y, p5, s0);
+
+    __CLC_DOUBLEN q1 = __clc_mad(y, r6, r5);
+    __CLC_DOUBLEN q2 = __clc_mad(y, q1, r4);
+    __CLC_DOUBLEN q3 = __clc_mad(y, q2, r3);
+    __CLC_DOUBLEN q4 = __clc_mad(y, q3, r2);
+    __CLC_DOUBLEN q5 = __clc_mad(y, q4, r1);
+    __CLC_DOUBLEN q = __clc_mad(y, q5, 1.0);
+
+    ret = __clc_mad(y, 0.5, p / q); // 0.5*y + p/q
+
+    __CLC_DOUBLEN y2 = y + 2.0;
+    __CLC_DOUBLEN y3 = y + 3.0;
+    __CLC_DOUBLEN y4 = y + 4.0;
+    __CLC_DOUBLEN y5 = y + 5.0;
+    __CLC_DOUBLEN y6 = y + 6.0;
+    // z accumulates (y+2)*...*(y+i-1), the factors removed by the reduction.
+    __CLC_DOUBLEN z = 1.0;
+    z *= i > 2 ? y2 : 1.0;
+    z *= i > 3 ? y3 : 1.0;
+    z *= i > 4 ? y4 : 1.0;
+    z *= i > 5 ? y5 : 1.0;
+    z *= i > 6 ? y6 : 1.0;
+
+    ret += __clc_log(z);
+  } else if (ax < 0x1p+58) { // 8 <= ax < 2^58
+    __CLC_DOUBLEN z = 1.0 / ax;
+    __CLC_DOUBLEN y = z * z;
+
+    // Horner evaluation of w0 + z*(w1 + y*(w2 + ... + y*w6)), with y = z^2
+    __CLC_DOUBLEN t1 = __clc_mad(y, w6, w5);
+    __CLC_DOUBLEN t2 = __clc_mad(y, t1, w4);
+    __CLC_DOUBLEN t3 = __clc_mad(y, t2, w3);
+    __CLC_DOUBLEN t4 = __clc_mad(y, t3, w2);
+    __CLC_DOUBLEN t5 = __clc_mad(y, t4, w1);
+    __CLC_DOUBLEN w = __clc_mad(z, t5, w0);
+
+    ret = __clc_mad(ax - 0.5, __clc_log(ax) - 1.0, w);
+  } else { // 2^58 <= ax <= Inf
+    ret = __clc_mad(ax, __clc_log(ax), -ax); // ax*log(ax) - ax
+  }
+  // Sign of Gamma(x), plus the reflection adjustment for negative x.
+  __CLC_INTN s = 0;
+  if (x >= 0.0) {
+    ret = (x == 1.0 | x == 2.0) ? 0.0 : ret; // force exact zeros
+    s = x == 0.0 ? 0 : 1;
+  } else if (ax < 0x1p+52) { // x > -0x1.0p+52
+    if (ax > 0x1.0p-50) {    // x < -0x1.0p-50
+      __CLC_DOUBLEN t = __clc_sinpi(x);
+      __CLC_DOUBLEN negadj = __clc_log(pi / __clc_fabs(t * x));
+      ret = negadj - ret; // log(pi/|x*sin(pi*x)|) - lgamma(-x)
+
+      bool z = __clc_trunc(x) == x; // x is a negative integer
+      ret = z ? __CLC_GENTYPE_INF : ret;
+      s = t < 0.0 ? -1 : 1;
+      s = z ? 0 : s;
+    } else {
+      s = -1; // tiny negative x: ret is left as computed from ax
+    }
+  }
+
+  // Zero, +-Inf, and negative x with |x| >= 2^52 yield __CLC_GENTYPE_INF
+  ret = (ax == 0.0 || ax == INFINITY) || (x < 0.0 & ax >= 0x1p+52)
+            ? __CLC_GENTYPE_INF
+            : ret;
+  ret = __clc_isnan(x) ? x : ret; // NaN input is returned unchanged
+
+  __CLC_LGAMMA_R_RET_GENTYPE result;
+  result.result = ret;
+  result.sign = s;
+  return result;
+}
+
+#elif __CLC_FPSIZE == 16
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_LGAMMA_R_RET_GENTYPE
+__clc_lgamma_r_stret(__CLC_HALFN x) {
+  // Evaluate through the float overload, then convert the value back to half.
+  __CLC_XCONCAT(__clc_lgamma_r_ret_, __CLC_FLOATN)
+  wide = __clc_lgamma_r_stret(__CLC_CONVERT_FLOATN(x));
+  __CLC_LGAMMA_R_RET_GENTYPE result;
+  result.result = __CLC_CONVERT_HALFN(wide.result);
+  result.sign = wide.sign;
+  return result;
+}
+
+#endif
+
+#endif // __CLC_SCALAR