[libclc] libclc: Update remquo (PR #187998)

Mon Mar 23 02:55:27 PDT 2026

https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/187998

>From 950caf94385006c7fe1bb0a1f563e394b42fd436 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sun, 22 Mar 2026 18:02:12 +0100
Subject: [PATCH 1/4] libclc: Update remquo

This was failing in the float case without -cl-denorms-are-zero
and failing for double. This now passes in all cases.

This was originally ported from the ROCm device libs in
8db45e4cf170cc6044a0afe7a0ed8876dcd9a863. This update is mostly a port
of the more recent changes, with a few modifications:

- Templatification, which almost, but not quite, enables vectorization;
  the outer branch and loop still prevent it.

- Merging of the 3 types into one shared code path, instead of
  duplicating the code per type in 3 different functions implemented
  together. There are only some slight differences for the half case,
  which mostly evaluates as float.

- Splitting out of the is_odd tracking, instead of deriving it from the
  accumulated quotient. This costs an extra register, but saves several
  instructions. This also enables automatic elimination of all of the quo
  output handling when this code is reused for remainder. I'm guessing
  this would be unnecessary if SimplifyDemandedBits handled phis.

- Removal of the slow FMA path. I don't see how this would ever be
  faster with the number of instructions replacing it. This is really a
  problem for the compiler to solve anyway.
---
 libclc/clc/include/clc/math/remquo_decl.inc   |  38 +-
 .../shared/binary_with_out_arg_scalarize.inc  |  77 ++++
 libclc/clc/lib/generic/math/clc_remquo.cl     |  37 +-
 libclc/clc/lib/generic/math/clc_remquo.inc    | 389 +++++++-----------
 4 files changed, 269 insertions(+), 272 deletions(-)
 create mode 100644 libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc

diff --git a/libclc/clc/include/clc/math/remquo_decl.inc b/libclc/clc/include/clc/math/remquo_decl.inc
index cba28a7244eb4..d5bfc5bd007ca 100644
--- a/libclc/clc/include/clc/math/remquo_decl.inc
+++ b/libclc/clc/include/clc/math/remquo_decl.inc
@@ -6,19 +6,33 @@
 //
 //===----------------------------------------------------------------------===//
 
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
-                                                     __CLC_GENTYPE y,
-                                                     private __CLC_INTN *q);
+typedef struct __CLC_XCONCAT(__clc_remquo_ret_, __CLC_GENTYPE) {
+  __CLC_GENTYPE rem;
+  __CLC_INTN quo;
+} __CLC_XCONCAT(__clc_remquo_ret_, __CLC_GENTYPE);
 
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
-                                                     __CLC_GENTYPE y,
-                                                     global __CLC_INTN *q);
+#define __CLC_REMQUO_RET_GENTYPE __CLC_XCONCAT(__clc_remquo_ret_, __CLC_GENTYPE)
 
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
-                                                     __CLC_GENTYPE y,
-                                                     local __CLC_INTN *q);
+_CLC_OVERLOAD _CLC_DECL __CLC_REMQUO_RET_GENTYPE
+__clc_remquo_stret(__CLC_GENTYPE x, __CLC_GENTYPE y);
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
+                                                   __CLC_GENTYPE y,
+                                                   private __CLC_INTN *q);
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
+                                                   __CLC_GENTYPE y,
+                                                   private __CLC_INTN *q);
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
+                                                   __CLC_GENTYPE y,
+                                                   global __CLC_INTN *q);
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
+                                                   __CLC_GENTYPE y,
+                                                   local __CLC_INTN *q);
 #if _CLC_GENERIC_AS_SUPPORTED
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
-                                                     __CLC_GENTYPE y,
-                                                     generic __CLC_INTN *q);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
+                                                   __CLC_GENTYPE y,
+                                                   generic __CLC_INTN *q);
 #endif
diff --git a/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc b/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc
new file mode 100644
index 0000000000000..6c7193b6f26a9
--- /dev/null
+++ b/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc
@@ -0,0 +1,77 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/utils.h"
+
+#ifndef __CLC_IMPL_FUNCTION
+#define __CLC_IMPL_FUNCTION __CLC_FUNCTION
+#endif
+
+#ifndef __CLC_RET_SCALAR_TYPE
+#define __CLC_RET_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_ARG1_SCALAR_TYPE
+#define __CLC_ARG1_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_ARG2_SCALAR_TYPE
+#define __CLC_ARG2_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_OUT_ARG3_SCALAR_TYPE
+#define __CLC_OUT_ARG3_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#define __CLC_RET_TYPE __CLC_XCONCAT(__CLC_RET_SCALAR_TYPE, __CLC_VECSIZE)
+#define __CLC_ARG1_TYPE __CLC_XCONCAT(__CLC_ARG1_SCALAR_TYPE, __CLC_VECSIZE)
+#define __CLC_ARG2_TYPE __CLC_XCONCAT(__CLC_ARG2_SCALAR_TYPE, __CLC_VECSIZE)
+#define __CLC_OUT_ARG3_TYPE                                                    \
+  __CLC_XCONCAT(__CLC_OUT_ARG3_SCALAR_TYPE, __CLC_VECSIZE)
+
+#ifndef __CLC_OUT_ARG3_ADDRESS_SPACE
+#define __CLC_OUT_ARG3_ADDRESS_SPACE __private
+#endif
+
+#if __CLC_VECSIZE_OR_1 >= 2
+
+_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE
+__CLC_FUNCTION(__CLC_ARG1_TYPE x, __CLC_ARG2_TYPE y,
+               __CLC_OUT_ARG3_ADDRESS_SPACE __CLC_OUT_ARG3_TYPE *z) {
+  union {
+    __CLC_ARG1_TYPE vec;
+    __CLC_ARG1_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_x;
+
+  union {
+    __CLC_ARG2_TYPE vec;
+    __CLC_ARG2_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_y;
+
+  union {
+    __CLC_RET_TYPE vec;
+    __CLC_RET_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_result0;
+
+  union {
+    __CLC_OUT_ARG3_TYPE vec;
+    __CLC_OUT_ARG3_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+  } u_result1;
+
+  u_x.vec = x;
+  u_y.vec = y;
+  for (int i = 0; i < __CLC_VECSIZE_OR_1; ++i) {
+    u_result0.arr[i] =
+        __CLC_IMPL_FUNCTION(u_x.arr[i], u_y.arr[i], &u_result1.arr[i]);
+  }
+
+  *z = u_result1.vec;
+  return u_result0.vec;
+}
+
+#endif // __CLC_VECSIZE_OR_1 >= 2
diff --git a/libclc/clc/lib/generic/math/clc_remquo.cl b/libclc/clc/lib/generic/math/clc_remquo.cl
index e254093d591d4..5e741294cb3a3 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.cl
+++ b/libclc/clc/lib/generic/math/clc_remquo.cl
@@ -6,32 +6,31 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clc/math/clc_remquo.h"
+
 #include "clc/clc_convert.h"
+#include "clc/float/definitions.h"
 #include "clc/integer/clc_clz.h"
-#include "clc/internal/clc.h"
-#include "clc/math/clc_floor.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_fabs.h"
 #include "clc/math/clc_flush_if_daz.h"
 #include "clc/math/clc_fma.h"
+#include "clc/math/clc_frexp.h"
 #include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_recip_fast.h"
+#include "clc/math/clc_rint.h"
 #include "clc/math/clc_subnormal_config.h"
 #include "clc/math/clc_trunc.h"
 #include "clc/math/math.h"
-#include "clc/shared/clc_max.h"
-
-#define __CLC_ADDRESS_SPACE private
-#include "clc_remquo.inc"
-#undef __CLC_ADDRESS_SPACE
-
-#define __CLC_ADDRESS_SPACE global
-#include "clc_remquo.inc"
-#undef __CLC_ADDRESS_SPACE
+#include "clc/relational/clc_isfinite.h"
+#include "clc/relational/clc_isnan.h"
+#include "clc/relational/clc_signbit.h"
 
-#define __CLC_ADDRESS_SPACE local
-#include "clc_remquo.inc"
-#undef __CLC_ADDRESS_SPACE
+#define __CLC_BODY "clc_remquo.inc"
+#include "clc/math/gentype.inc"
 
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-#define __CLC_ADDRESS_SPACE generic
-#include "clc_remquo.inc"
-#undef __CLC_ADDRESS_SPACE
-#endif
+#define __CLC_FUNCTION __clc_remquo
+#define __CLC_OUT_ARG3_SCALAR_TYPE int
+#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_remquo.inc b/libclc/clc/lib/generic/math/clc_remquo.inc
index cf8a5ebcea20c..78d5f38ced055 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.inc
+++ b/libclc/clc/lib/generic/math/clc_remquo.inc
@@ -6,266 +6,173 @@
 //
 //===----------------------------------------------------------------------===//
 
-_CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y,
-                                          __CLC_ADDRESS_SPACE int *quo) {
-  x = __clc_flush_if_daz(x);
-  y = __clc_flush_if_daz(y);
-  int ux = __clc_as_int(x);
-  int ax = ux & EXSIGNBIT_SP32;
-  float xa = __clc_as_float(ax);
-  int sx = ux ^ ax;
-  int ex = ax >> EXPSHIFTBITS_SP32;
-
-  int uy = __clc_as_int(y);
-  int ay = uy & EXSIGNBIT_SP32;
-  float ya = __clc_as_float(ay);
-  int sy = uy ^ ay;
-  int ey = ay >> EXPSHIFTBITS_SP32;
-
-  float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff));
-  float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff));
-  int c;
-  int k = ex - ey;
-
-  uint q = 0;
-
-  while (k > 0) {
-    c = xr >= yr;
-    q = (q << 1) | c;
-    xr -= c ? yr : 0.0f;
-    xr += xr;
-    --k;
-  }
+#ifdef __CLC_SCALAR
+
+#if __CLC_FPSIZE == 32
+#define __CLC_REMQUO_EVAL_TYPE __CLC_FLOATN
+#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_FLOATN
+#define __CLC_S_EVAL_TYPE __CLC_INTN
+#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_INTN
+#elif __CLC_FPSIZE == 64
+#define __CLC_REMQUO_EVAL_TYPE __CLC_DOUBLEN
+#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_DOUBLEN
+#define __CLC_S_EVAL_TYPE __CLC_LONGN
+#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_LONGN
+#elif __CLC_FPSIZE == 16
+#define __CLC_REMQUO_EVAL_TYPE __CLC_FLOATN
+#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_FLOATN
+#define __CLC_S_EVAL_TYPE __CLC_INTN
+#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_INTN
+#endif
 
-  c = xr > yr;
-  q = (q << 1) | c;
-  xr -= c ? yr : 0.0f;
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_REMQUO_RET_GENTYPE
+__clc_remquo_stret(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+  // How many bits of the quotient per iteration
+
+#if __CLC_FPSIZE == 32
+  const __CLC_INTN bits = 12;
+  const __CLC_GENTYPE max_exp = 0x1.0p+127f;
+#elif __CLC_FPSIZE == 64
+  const __CLC_INTN bits = 26;
+  const __CLC_GENTYPE max_exp = 0x1.0p+1023;
+#elif __CLC_FPSIZE == 16
+  const __CLC_INTN bits = 11;
+  const __CLC_GENTYPE max_exp = 0x1.0p+15h;
+#endif
 
-  int lt = ex < ey;
+  // Track low 7 bits of the integral quotient.
+  __CLC_INTN q7;
 
-  q = lt ? 0 : q;
-  xr = lt ? xa : xr;
-  yr = lt ? ya : yr;
+  __CLC_REMQUO_EVAL_TYPE ax = __CLC_CONVERT_REMQUO_EVAL_TYPE(__clc_fabs(x));
+  __CLC_REMQUO_EVAL_TYPE ay = __CLC_CONVERT_REMQUO_EVAL_TYPE(__clc_fabs(y));
 
-  c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1));
-  xr -= c ? yr : 0.0f;
-  q += c;
+  __CLC_GENTYPE ret;
 
-  float s = __clc_as_float(ey << EXPSHIFTBITS_SP32);
-  xr *= lt ? 1.0f : s;
+  if (ax > ay) {
+    __CLC_INTN ex, ey;
 
-  int qsgn = sx == sy ? 1 : -1;
-  int quot = (q & 0x7f) * qsgn;
+    __CLC_REMQUO_EVAL_TYPE mx = __clc_frexp(ax, &ex);
+    --ex;
 
-  c = ax == ay;
-  quot = c ? qsgn : quot;
-  xr = c ? 0.0f : xr;
+    __CLC_REMQUO_EVAL_TYPE my = __clc_frexp(ay, &ey);
+    --ey;
 
-  xr = __clc_as_float(sx ^ __clc_as_int(xr));
+    ax = __clc_ldexp(mx, bits);
+    ay = __clc_ldexp(my, 1);
 
-  c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 |
-      ay == 0;
-  quot = c ? 0 : quot;
-  xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr;
+    __CLC_INTN nb = ex - ey;
+    __CLC_REMQUO_EVAL_TYPE ayinv = __clc_recip_fast(ay);
 
-  *quo = quot;
+    __CLC_INTN qacc = 0;
 
-  return xr;
-}
+    while (nb > bits) {
+      __CLC_REMQUO_EVAL_TYPE q = __clc_rint(ax * ayinv);
 
-// remquo signature is special, we don't have macro for this
-#define __CLC_VEC_REMQUO(TYPE, VEC_SIZE, HALF_VEC_SIZE)                        \
-  _CLC_DEF _CLC_OVERLOAD TYPE##VEC_SIZE __clc_remquo(                          \
-      TYPE##VEC_SIZE x, TYPE##VEC_SIZE y,                                      \
-      __CLC_ADDRESS_SPACE int##VEC_SIZE *quo) {                                \
-    int##HALF_VEC_SIZE lo, hi;                                                 \
-    TYPE##VEC_SIZE ret;                                                        \
-    ret.lo = __clc_remquo(x.lo, y.lo, &lo);                                    \
-    ret.hi = __clc_remquo(x.hi, y.hi, &hi);                                    \
-    (*quo).lo = lo;                                                            \
-    (*quo).hi = hi;                                                            \
-    return ret;                                                                \
-  }
+#if __CLC_FPSIZE == 16
+      ax = __clc_mad(-q, ay, ax);
+#else
+      ax = __clc_fma(-q, ay, ax);
+#endif
+      __CLC_S_GENTYPE clt = ax < (__CLC_REMQUO_EVAL_TYPE)0.0;
+      __CLC_REMQUO_EVAL_TYPE axp = ax + ay;
+      ax = clt ? axp : ax;
+      ax = __clc_ldexp(ax, bits);
 
-#define __CLC_VEC3_REMQUO(TYPE)                                                \
-  _CLC_DEF _CLC_OVERLOAD TYPE##3 __clc_remquo(                                 \
-      TYPE##3 x, TYPE##3 y, __CLC_ADDRESS_SPACE int##3 * quo) {                \
-    int2 lo;                                                                   \
-    int hi;                                                                    \
-    TYPE##3 ret;                                                               \
-    ret.s01 = __clc_remquo(x.s01, y.s01, &lo);                                 \
-    ret.s2 = __clc_remquo(x.s2, y.s2, &hi);                                    \
-    (*quo).s01 = lo;                                                           \
-    (*quo).s2 = hi;                                                            \
-    return ret;                                                                \
-  }
-__CLC_VEC_REMQUO(float, 2, )
-__CLC_VEC3_REMQUO(float)
-__CLC_VEC_REMQUO(float, 4, 2)
-__CLC_VEC_REMQUO(float, 8, 4)
-__CLC_VEC_REMQUO(float, 16, 8)
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-_CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y,
-                                           __CLC_ADDRESS_SPACE int *pquo) {
-  ulong ux = __clc_as_ulong(x);
-  ulong ax = ux & ~SIGNBIT_DP64;
-  ulong xsgn = ux ^ ax;
-  double dx = __clc_as_double(ax);
-  int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64);
-  int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64);
-  xexp1 = xexp < 1 ? xexp1 : xexp;
-
-  ulong uy = __clc_as_ulong(y);
-  ulong ay = uy & ~SIGNBIT_DP64;
-  double dy = __clc_as_double(ay);
-  int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64);
-  int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64);
-  yexp1 = yexp < 1 ? yexp1 : yexp;
-
-  int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1;
-
-  // First assume |x| > |y|
-
-  // Set ntimes to the number of times we need to do a
-  // partial remainder. If the exponent of x is an exact multiple
-  // of 53 larger than the exponent of y, and the mantissa of x is
-  // less than the mantissa of y, ntimes will be one too large
-  // but it doesn't matter - it just means that we'll go round
-  // the loop below one extra time.
-  int ntimes = __clc_max(0, (xexp1 - yexp1) / 53);
-  double w = __clc_ldexp(dy, ntimes * 53);
-  w = ntimes == 0 ? dy : w;
-  double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
-
-  // Each time round the loop we compute a partial remainder.
-  // This is done by subtracting a large multiple of w
-  // from x each time, where w is a scaled up version of y.
-  // The subtraction must be performed exactly in quad
-  // precision, though the result at each stage can
-  // fit exactly in a double precision number.
-  int i;
-  double t, v, p, pp;
-
-  for (i = 0; i < ntimes; i++) {
-    // Compute integral multiplier
-    t = __clc_trunc(dx / w);
-
-    // Compute w * t in quad precision
-    p = w * t;
-    pp = __clc_fma(w, t, -p);
-
-    // Subtract w * t from dx
-    v = dx - p;
-    dx = v + (((dx - v) - p) - pp);
-
-    // If t was one too large, dx will be negative. Add back one w.
-    dx += dx < 0.0 ? w : 0.0;
-
-    // Scale w down by 2^(-53) for the next iteration
-    w *= scale;
-  }
+      __CLC_INTN iq = __CLC_CONVERT_INTN(q);
+      iq -= __CLC_CONVERT_INTN(clt) ? 1 : 0;
+      qacc = (qacc << bits) | iq;
 
-  // One more time
-  // Variable todd says whether the integer t is odd or not
-  t = __clc_floor(dx / w);
-  long lt = (long)t;
-  int todd = lt & 1;
-
-  p = w * t;
-  pp = __clc_fma(w, t, -p);
-  v = dx - p;
-  dx = v + (((dx - v) - p) - pp);
-  i = dx < 0.0;
-  todd ^= i;
-  dx += i ? w : 0.0;
-
-  lt -= i;
-
-  // At this point, dx lies in the range [0,dy)
-
-  // For the remainder function, we need to adjust dx
-  // so that it lies in the range (-y/2, y/2] by carefully
-  // subtracting w (== dy == y) if necessary. The rigmarole
-  // with todd is to get the correct sign of the result
-  // when x/y lies exactly half way between two integers,
-  // when we need to choose the even integer.
-
-  int al = (2.0 * dx > w) | (todd & (2.0 * dx == w));
-  double dxl = dx - (al ? w : 0.0);
-
-  int ag = (dx > 0.5 * w) | (todd & (dx == 0.5 * w));
-  double dxg = dx - (ag ? w : 0.0);
-
-  dx = dy < 0x1.0p+1022 ? dxl : dxg;
-  lt += dy < 0x1.0p+1022 ? al : ag;
-  int quo = ((int)lt & 0x7f) * qsgn;
-
-  double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx));
-  dx = __clc_as_double(ax);
-
-  // Now handle |x| == |y|
-  int c = dx == dy;
-  t = __clc_as_double(xsgn);
-  quo = c ? qsgn : quo;
-  ret = c ? t : ret;
-
-  // Next, handle |x| < |y|
-  c = dx < dy;
-  quo = c ? 0 : quo;
-  ret = c ? x : ret;
-
-  c &= (yexp < 1023 & 2.0 * dx > dy) | (dx > 0.5 * dy);
-  quo = c ? qsgn : quo;
-  // we could use a conversion here instead since qsgn = +-1
-  p = qsgn == 1 ? -1.0 : 1.0;
-  t = __clc_fma(y, p, x);
-  ret = c ? t : ret;
-
-  // We don't need anything special for |x| == 0
-
-  // |y| is 0
-  c = dy == 0.0;
-  quo = c ? 0 : quo;
-  ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
-
-  // y is +-Inf, NaN
-  c = yexp > BIASEDEMAX_DP64;
-  quo = c ? 0 : quo;
-  t = y == y ? x : y;
-  ret = c ? t : ret;
-
-  // x is +=Inf, NaN
-  c = xexp > BIASEDEMAX_DP64;
-  quo = c ? 0 : quo;
-  ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
-
-  *pquo = quo;
-  return ret;
-}
-__CLC_VEC_REMQUO(double, 2, )
-__CLC_VEC3_REMQUO(double)
-__CLC_VEC_REMQUO(double, 4, 2)
-__CLC_VEC_REMQUO(double, 8, 4)
-__CLC_VEC_REMQUO(double, 16, 8)
+      nb -= bits;
+    }
 
+    ax = __clc_ldexp(ax, nb - bits + 1);
+
+    __CLC_INTN is_odd;
+
+    // Final iteration
+    {
+      __CLC_REMQUO_EVAL_TYPE q = __clc_rint(ax * ayinv);
+#if __CLC_FPSIZE == 16
+      ax = __clc_mad(-q, ay, ax);
+#else
+      ax = __clc_fma(-q, ay, ax);
 #endif
 
-#ifdef cl_khr_fp16
+      __CLC_S_GENTYPE clt = ax < (__CLC_REMQUO_EVAL_TYPE)0.0;
+      __CLC_REMQUO_EVAL_TYPE axp = ax + ay;
+      ax = clt ? axp : ax;
+      __CLC_INTN iq = __CLC_CONVERT_INTN(q);
+      iq -= __CLC_CONVERT_INTN(clt) ? 1 : 0;
+
+      qacc = (qacc << (nb + 1)) | iq;
+      is_odd = (iq & 1) != 0;
+    }
+
+    // Adjust ax so that it is the range (-y/2, y/2]
+    // We need to choose the even integer when x/y is midway between two
+    // integers
+    __CLC_S_EVAL_TYPE aq = ((__CLC_REMQUO_EVAL_TYPE)2.0 * ax > ay) |
+                           (__CLC_CONVERT_S_EVAL_TYPE(is_odd) &
+                            ((__CLC_REMQUO_EVAL_TYPE)2.0 * ax == ay));
+    ax = ax - (aq ? ay : (__CLC_REMQUO_EVAL_TYPE)0.0);
+
+    ax = __clc_ldexp(ax, ey);
+    qacc += aq ? 1 : 0;
+
+    __CLC_S_GENTYPE qneg = __clc_signbit(x) ^ __clc_signbit(y) ? -1 : 0;
+    q7 = ((qacc & 0x7f) ^ qneg) - qneg;
+
+    ret = __clc_signbit(x) ? -ax : ax;
+  } else {
+    __CLC_S_EVAL_TYPE c = (ax > (__CLC_REMQUO_EVAL_TYPE)0.5 * ay);
+    if (__CLC_FPSIZE != 16)
+      c |= (ay < max_exp && (__CLC_REMQUO_EVAL_TYPE)2.0 * ax > ay);
+
+    __CLC_CHARN qsgn = __CLC_CONVERT_CHARN(__clc_signbit(x) == __clc_signbit(y))
+                           ? (__CLC_CHARN)1
+                           : (__CLC_CHARN)-1;
+
+    __CLC_GENTYPE t = __clc_mad(y, -__CLC_CONVERT_GENTYPE(qsgn), x);
+    ret = c ? t : __clc_flush_if_daz(x);
+    q7 = c ? qsgn : 0;
+
+    __CLC_GENTYPE zero = __clc_copysign(__CLC_FP_LIT(0.0), x);
+    ret = ax == ay ? zero : ret;
+    q7 = ax == ay ? qsgn : q7;
+  }
+
+  ret = y == __CLC_FP_LIT(0.0) ? __CLC_GENTYPE_NAN : ret;
+  q7 = y == __CLC_FP_LIT(0.0) ? 0 : q7;
 
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+  __CLC_S_GENTYPE finite = !__clc_isnan(y) && __clc_isfinite(x);
 
-_CLC_OVERLOAD _CLC_DEF half __clc_remquo(half x, half y,
-                                         __CLC_ADDRESS_SPACE int *pquo) {
-  return (half)__clc_remquo((float)x, (float)y, pquo);
+  // A defined 0 result for quo with a nan result is an additional OpenCL
+  // requirement beyond standard C.
+  __CLC_REMQUO_RET_GENTYPE result;
+  result.quo = finite ? q7 : 0;
+  result.rem = finite ? ret : __CLC_GENTYPE_NAN;
+
+  return result;
 }
-__CLC_VEC_REMQUO(half, 2, )
-__CLC_VEC3_REMQUO(half)
-__CLC_VEC_REMQUO(half, 4, 2)
-__CLC_VEC_REMQUO(half, 8, 4)
-__CLC_VEC_REMQUO(half, 16, 8)
 
+#undef __CLC_REMQUO_EVAL_TYPE
+#undef __CLC_CONVERT_REMQUO_EVAL_TYPE
+#undef __CLC_S_EVAL_TYPE
+#undef __CLC_CONVERT_S_EVAL_TYPE
+
+#define __CLC_REMQUO_DEF(addrspace)                                            \
+  _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_remquo(                           \
+      __CLC_GENTYPE x, __CLC_GENTYPE y, addrspace __CLC_INTN *quo_out) {       \
+    __CLC_REMQUO_RET_GENTYPE result = __clc_remquo_stret(x, y);                \
+    *quo_out = result.quo;                                                     \
+    return result.rem;                                                         \
+  }
+
+__CLC_REMQUO_DEF(private)
+__CLC_REMQUO_DEF(local)
+__CLC_REMQUO_DEF(global)
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+__CLC_REMQUO_DEF(generic)
 #endif
+
+#endif // __CLC_SCALAR

>From b7ee3b0c4ca94aa68f1624276156f46ed4d5bdd1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 23 Mar 2026 10:45:20 +0100
Subject: [PATCH 2/4] Fix missing definitions

---
 libclc/clc/lib/generic/math/clc_remquo.cl     |  29 +++-
 libclc/clc/lib/generic/math/clc_remquo.inc    | 154 -----------------
 .../clc/lib/generic/math/clc_remquo_stret.inc | 163 ++++++++++++++++++
 3 files changed, 191 insertions(+), 155 deletions(-)
 create mode 100644 libclc/clc/lib/generic/math/clc_remquo_stret.inc

diff --git a/libclc/clc/lib/generic/math/clc_remquo.cl b/libclc/clc/lib/generic/math/clc_remquo.cl
index 5e741294cb3a3..502b9e5edc405 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.cl
+++ b/libclc/clc/lib/generic/math/clc_remquo.cl
@@ -27,10 +27,37 @@
 #include "clc/relational/clc_isnan.h"
 #include "clc/relational/clc_signbit.h"
 
-#define __CLC_BODY "clc_remquo.inc"
+#define __CLC_FUNCTION __clc_remquo_stret
+#define __CLC_BODY "clc_remquo_stret.inc"
 #include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
 
 #define __CLC_FUNCTION __clc_remquo
+#define __CLC_BODY "clc_remquo.inc"
+#include "clc/math/gentype.inc"
+
+#define __CLC_OUT_ARG3_SCALAR_TYPE int
+#define __CLC_OUT_ARG3_ADDRESS_SPACE __private
+#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_OUT_ARG3_ADDRESS_SPACE
+
+#define __CLC_OUT_ARG3_SCALAR_TYPE int
+#define __CLC_OUT_ARG3_ADDRESS_SPACE __local
+#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_OUT_ARG3_ADDRESS_SPACE
+
+#define __CLC_OUT_ARG3_SCALAR_TYPE int
+#define __CLC_OUT_ARG3_ADDRESS_SPACE __global
+#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_OUT_ARG3_ADDRESS_SPACE
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
 #define __CLC_OUT_ARG3_SCALAR_TYPE int
+#define __CLC_OUT_ARG3_ADDRESS_SPACE
 #define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
 #include "clc/math/gentype.inc"
+#undef __CLC_OUT_ARG3_ADDRESS_SPACE
+#endif
diff --git a/libclc/clc/lib/generic/math/clc_remquo.inc b/libclc/clc/lib/generic/math/clc_remquo.inc
index 78d5f38ced055..649bdd9ee8b65 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.inc
+++ b/libclc/clc/lib/generic/math/clc_remquo.inc
@@ -7,159 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #ifdef __CLC_SCALAR
-
-#if __CLC_FPSIZE == 32
-#define __CLC_REMQUO_EVAL_TYPE __CLC_FLOATN
-#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_FLOATN
-#define __CLC_S_EVAL_TYPE __CLC_INTN
-#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_INTN
-#elif __CLC_FPSIZE == 64
-#define __CLC_REMQUO_EVAL_TYPE __CLC_DOUBLEN
-#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_DOUBLEN
-#define __CLC_S_EVAL_TYPE __CLC_LONGN
-#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_LONGN
-#elif __CLC_FPSIZE == 16
-#define __CLC_REMQUO_EVAL_TYPE __CLC_FLOATN
-#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_FLOATN
-#define __CLC_S_EVAL_TYPE __CLC_INTN
-#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_INTN
-#endif
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_REMQUO_RET_GENTYPE
-__clc_remquo_stret(__CLC_GENTYPE x, __CLC_GENTYPE y) {
-  // How many bits of the quotient per iteration
-
-#if __CLC_FPSIZE == 32
-  const __CLC_INTN bits = 12;
-  const __CLC_GENTYPE max_exp = 0x1.0p+127f;
-#elif __CLC_FPSIZE == 64
-  const __CLC_INTN bits = 26;
-  const __CLC_GENTYPE max_exp = 0x1.0p+1023;
-#elif __CLC_FPSIZE == 16
-  const __CLC_INTN bits = 11;
-  const __CLC_GENTYPE max_exp = 0x1.0p+15h;
-#endif
-
-  // Track low 7 bits of the integral quotient.
-  __CLC_INTN q7;
-
-  __CLC_REMQUO_EVAL_TYPE ax = __CLC_CONVERT_REMQUO_EVAL_TYPE(__clc_fabs(x));
-  __CLC_REMQUO_EVAL_TYPE ay = __CLC_CONVERT_REMQUO_EVAL_TYPE(__clc_fabs(y));
-
-  __CLC_GENTYPE ret;
-
-  if (ax > ay) {
-    __CLC_INTN ex, ey;
-
-    __CLC_REMQUO_EVAL_TYPE mx = __clc_frexp(ax, &ex);
-    --ex;
-
-    __CLC_REMQUO_EVAL_TYPE my = __clc_frexp(ay, &ey);
-    --ey;
-
-    ax = __clc_ldexp(mx, bits);
-    ay = __clc_ldexp(my, 1);
-
-    __CLC_INTN nb = ex - ey;
-    __CLC_REMQUO_EVAL_TYPE ayinv = __clc_recip_fast(ay);
-
-    __CLC_INTN qacc = 0;
-
-    while (nb > bits) {
-      __CLC_REMQUO_EVAL_TYPE q = __clc_rint(ax * ayinv);
-
-#if __CLC_FPSIZE == 16
-      ax = __clc_mad(-q, ay, ax);
-#else
-      ax = __clc_fma(-q, ay, ax);
-#endif
-      __CLC_S_GENTYPE clt = ax < (__CLC_REMQUO_EVAL_TYPE)0.0;
-      __CLC_REMQUO_EVAL_TYPE axp = ax + ay;
-      ax = clt ? axp : ax;
-      ax = __clc_ldexp(ax, bits);
-
-      __CLC_INTN iq = __CLC_CONVERT_INTN(q);
-      iq -= __CLC_CONVERT_INTN(clt) ? 1 : 0;
-      qacc = (qacc << bits) | iq;
-
-      nb -= bits;
-    }
-
-    ax = __clc_ldexp(ax, nb - bits + 1);
-
-    __CLC_INTN is_odd;
-
-    // Final iteration
-    {
-      __CLC_REMQUO_EVAL_TYPE q = __clc_rint(ax * ayinv);
-#if __CLC_FPSIZE == 16
-      ax = __clc_mad(-q, ay, ax);
-#else
-      ax = __clc_fma(-q, ay, ax);
-#endif
-
-      __CLC_S_GENTYPE clt = ax < (__CLC_REMQUO_EVAL_TYPE)0.0;
-      __CLC_REMQUO_EVAL_TYPE axp = ax + ay;
-      ax = clt ? axp : ax;
-      __CLC_INTN iq = __CLC_CONVERT_INTN(q);
-      iq -= __CLC_CONVERT_INTN(clt) ? 1 : 0;
-
-      qacc = (qacc << (nb + 1)) | iq;
-      is_odd = (iq & 1) != 0;
-    }
-
-    // Adjust ax so that it is the range (-y/2, y/2]
-    // We need to choose the even integer when x/y is midway between two
-    // integers
-    __CLC_S_EVAL_TYPE aq = ((__CLC_REMQUO_EVAL_TYPE)2.0 * ax > ay) |
-                           (__CLC_CONVERT_S_EVAL_TYPE(is_odd) &
-                            ((__CLC_REMQUO_EVAL_TYPE)2.0 * ax == ay));
-    ax = ax - (aq ? ay : (__CLC_REMQUO_EVAL_TYPE)0.0);
-
-    ax = __clc_ldexp(ax, ey);
-    qacc += aq ? 1 : 0;
-
-    __CLC_S_GENTYPE qneg = __clc_signbit(x) ^ __clc_signbit(y) ? -1 : 0;
-    q7 = ((qacc & 0x7f) ^ qneg) - qneg;
-
-    ret = __clc_signbit(x) ? -ax : ax;
-  } else {
-    __CLC_S_EVAL_TYPE c = (ax > (__CLC_REMQUO_EVAL_TYPE)0.5 * ay);
-    if (__CLC_FPSIZE != 16)
-      c |= (ay < max_exp && (__CLC_REMQUO_EVAL_TYPE)2.0 * ax > ay);
-
-    __CLC_CHARN qsgn = __CLC_CONVERT_CHARN(__clc_signbit(x) == __clc_signbit(y))
-                           ? (__CLC_CHARN)1
-                           : (__CLC_CHARN)-1;
-
-    __CLC_GENTYPE t = __clc_mad(y, -__CLC_CONVERT_GENTYPE(qsgn), x);
-    ret = c ? t : __clc_flush_if_daz(x);
-    q7 = c ? qsgn : 0;
-
-    __CLC_GENTYPE zero = __clc_copysign(__CLC_FP_LIT(0.0), x);
-    ret = ax == ay ? zero : ret;
-    q7 = ax == ay ? qsgn : q7;
-  }
-
-  ret = y == __CLC_FP_LIT(0.0) ? __CLC_GENTYPE_NAN : ret;
-  q7 = y == __CLC_FP_LIT(0.0) ? 0 : q7;
-
-  __CLC_S_GENTYPE finite = !__clc_isnan(y) && __clc_isfinite(x);
-
-  // A defined 0 result for quo with a nan result is an additional OpenCL
-  // requirement beyond standard C.
-  __CLC_REMQUO_RET_GENTYPE result;
-  result.quo = finite ? q7 : 0;
-  result.rem = finite ? ret : __CLC_GENTYPE_NAN;
-
-  return result;
-}
-
-#undef __CLC_REMQUO_EVAL_TYPE
-#undef __CLC_CONVERT_REMQUO_EVAL_TYPE
-#undef __CLC_S_EVAL_TYPE
-#undef __CLC_CONVERT_S_EVAL_TYPE
-
 #define __CLC_REMQUO_DEF(addrspace)                                            \
   _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_remquo(                           \
       __CLC_GENTYPE x, __CLC_GENTYPE y, addrspace __CLC_INTN *quo_out) {       \
@@ -174,5 +21,4 @@ __CLC_REMQUO_DEF(global)
 #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
 __CLC_REMQUO_DEF(generic)
 #endif
-
 #endif // __CLC_SCALAR
diff --git a/libclc/clc/lib/generic/math/clc_remquo_stret.inc b/libclc/clc/lib/generic/math/clc_remquo_stret.inc
new file mode 100644
index 0000000000000..78de5cb4615db
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_remquo_stret.inc
@@ -0,0 +1,163 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+#if __CLC_FPSIZE == 32
+#define __CLC_REMQUO_EVAL_TYPE __CLC_FLOATN
+#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_FLOATN
+#define __CLC_S_EVAL_TYPE __CLC_INTN
+#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_INTN
+#elif __CLC_FPSIZE == 64
+#define __CLC_REMQUO_EVAL_TYPE __CLC_DOUBLEN
+#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_DOUBLEN
+#define __CLC_S_EVAL_TYPE __CLC_LONGN
+#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_LONGN
+#elif __CLC_FPSIZE == 16
+#define __CLC_REMQUO_EVAL_TYPE __CLC_FLOATN
+#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_FLOATN
+#define __CLC_S_EVAL_TYPE __CLC_INTN
+#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_INTN
+#endif
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_REMQUO_RET_GENTYPE
+__clc_remquo_stret(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+  // Returns the remainder in .rem and the signed low quotient bits in .quo.
+  // `bits` is how many bits of the quotient are produced per iteration.
+#if __CLC_FPSIZE == 32
+  const __CLC_INTN bits = 12;
+  const __CLC_GENTYPE max_exp = 0x1.0p+127f;
+#elif __CLC_FPSIZE == 64
+  const __CLC_INTN bits = 26;
+  const __CLC_GENTYPE max_exp = 0x1.0p+1023;
+#elif __CLC_FPSIZE == 16
+  const __CLC_INTN bits = 11;
+  const __CLC_GENTYPE max_exp = 0x1.0p+15h;
+#endif
+
+  // Track low 7 bits of the integral quotient.
+  __CLC_INTN q7;
+
+  __CLC_REMQUO_EVAL_TYPE ax = __CLC_CONVERT_REMQUO_EVAL_TYPE(__clc_fabs(x));
+  __CLC_REMQUO_EVAL_TYPE ay = __CLC_CONVERT_REMQUO_EVAL_TYPE(__clc_fabs(y));
+
+  __CLC_GENTYPE ret;
+  // |x| > |y| takes the iterative path; otherwise q7 ends up 0 or +/-1.
+  if (ax > ay) {
+    __CLC_INTN ex, ey;
+    // Split |x| and |y| with frexp, then decrement each exponent.
+    __CLC_REMQUO_EVAL_TYPE mx = __clc_frexp(ax, &ex);
+    --ex;
+
+    __CLC_REMQUO_EVAL_TYPE my = __clc_frexp(ay, &ey);
+    --ey;
+
+    ax = __clc_ldexp(mx, bits);
+    ay = __clc_ldexp(my, 1);
+
+    __CLC_INTN nb = ex - ey;
+    __CLC_REMQUO_EVAL_TYPE ayinv = __clc_recip_fast(ay);
+
+    __CLC_INTN qacc = 0;
+    // Each pass consumes `bits` of the exponent gap nb.
+    while (nb > bits) {
+      __CLC_REMQUO_EVAL_TYPE q = __clc_rint(ax * ayinv);
+
+#if __CLC_FPSIZE == 16
+      ax = __clc_mad(-q, ay, ax);
+#else
+      ax = __clc_fma(-q, ay, ax);
+#endif
+      __CLC_S_GENTYPE clt = ax < (__CLC_REMQUO_EVAL_TYPE)0.0;
+      __CLC_REMQUO_EVAL_TYPE axp = ax + ay;
+      ax = clt ? axp : ax;
+      ax = __clc_ldexp(ax, bits);
+      // A negative ax was bumped up by ay above, so take one off this digit.
+      __CLC_INTN iq = __CLC_CONVERT_INTN(q);
+      iq -= __CLC_CONVERT_INTN(clt) ? 1 : 0;
+      qacc = (qacc << bits) | iq;
+
+      nb -= bits;
+    }
+
+    ax = __clc_ldexp(ax, nb - bits + 1);
+    // Parity of the last quotient digit; it settles the tie case below.
+    __CLC_INTN is_odd;
+
+    // Final iteration: same step, but ax is not rescaled afterwards.
+    {
+      __CLC_REMQUO_EVAL_TYPE q = __clc_rint(ax * ayinv);
+#if __CLC_FPSIZE == 16
+      ax = __clc_mad(-q, ay, ax);
+#else
+      ax = __clc_fma(-q, ay, ax);
+#endif
+
+      __CLC_S_GENTYPE clt = ax < (__CLC_REMQUO_EVAL_TYPE)0.0;
+      __CLC_REMQUO_EVAL_TYPE axp = ax + ay;
+      ax = clt ? axp : ax;
+      __CLC_INTN iq = __CLC_CONVERT_INTN(q);
+      iq -= __CLC_CONVERT_INTN(clt) ? 1 : 0;
+
+      qacc = (qacc << (nb + 1)) | iq;
+      is_odd = (iq & 1) != 0;
+    }
+
+    // Adjust ax so that it is in the range (-y/2, y/2].
+    // We need to choose the even integer when x/y is midway between two
+    // integers.
+    __CLC_S_EVAL_TYPE aq = ((__CLC_REMQUO_EVAL_TYPE)2.0 * ax > ay) |
+                           (__CLC_CONVERT_S_EVAL_TYPE(is_odd) &
+                            ((__CLC_REMQUO_EVAL_TYPE)2.0 * ax == ay));
+    ax = ax - (aq ? ay : (__CLC_REMQUO_EVAL_TYPE)0.0);
+    // Scale ax by y's exponent and add the rounding step to qacc.
+    ax = __clc_ldexp(ax, ey);
+    qacc += aq ? 1 : 0;
+    // Negate the low 7 quotient bits when the sign bits of x and y differ.
+    __CLC_S_GENTYPE qneg = __clc_signbit(x) ^ __clc_signbit(y) ? -1 : 0;
+    q7 = ((qacc & 0x7f) ^ qneg) - qneg;
+
+    ret = __clc_signbit(x) ? -ax : ax;
+  } else {
+    __CLC_S_EVAL_TYPE c = (ax > (__CLC_REMQUO_EVAL_TYPE)0.5 * ay);
+    if (__CLC_FPSIZE != 16)
+      c |= (ay < max_exp && (__CLC_REMQUO_EVAL_TYPE)2.0 * ax > ay);
+    // qsgn is +1 when x and y have equal sign bits and -1 otherwise.
+    __CLC_CHARN qsgn = __CLC_CONVERT_CHARN(__clc_signbit(x) == __clc_signbit(y))
+                           ? (__CLC_CHARN)1
+                           : (__CLC_CHARN)-1;
+
+    __CLC_GENTYPE t = __clc_mad(y, -__CLC_CONVERT_GENTYPE(qsgn), x);
+    ret = c ? t : __clc_flush_if_daz(x);
+    q7 = c ? qsgn : 0;
+    // |x| == |y|: remainder is a zero with x's sign, quotient is qsgn.
+    __CLC_GENTYPE zero = __clc_copysign(__CLC_FP_LIT(0.0), x);
+    ret = ax == ay ? zero : ret;
+    q7 = ax == ay ? qsgn : q7;
+  }
+
+  ret = y == __CLC_FP_LIT(0.0) ? __CLC_GENTYPE_NAN : ret;
+  q7 = y == __CLC_FP_LIT(0.0) ? 0 : q7;
+
+  __CLC_S_GENTYPE finite = !__clc_isnan(y) && __clc_isfinite(x);
+
+  // A defined 0 result for quo with a nan result is an additional OpenCL
+  // requirement beyond standard C.
+  __CLC_REMQUO_RET_GENTYPE result;
+  result.quo = finite ? q7 : 0;
+  result.rem = finite ? ret : __CLC_GENTYPE_NAN;
+
+  return result;
+}
+
+#undef __CLC_REMQUO_EVAL_TYPE
+#undef __CLC_CONVERT_REMQUO_EVAL_TYPE
+#undef __CLC_S_EVAL_TYPE
+#undef __CLC_CONVERT_S_EVAL_TYPE
+
+#endif // __CLC_SCALAR

>From bdfb59beba7a041ea9746d51cc70f22f1e88b2aa Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 23 Mar 2026 10:48:04 +0100
Subject: [PATCH 3/4] Address comments

---
 libclc/clc/include/clc/math/remquo_decl.inc                  | 4 ----
 .../clc/include/clc/shared/binary_with_out_arg_scalarize.inc | 5 +++++
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/libclc/clc/include/clc/math/remquo_decl.inc b/libclc/clc/include/clc/math/remquo_decl.inc
index d5bfc5bd007ca..8ba601199ef0f 100644
--- a/libclc/clc/include/clc/math/remquo_decl.inc
+++ b/libclc/clc/include/clc/math/remquo_decl.inc
@@ -20,10 +20,6 @@ _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
                                                    __CLC_GENTYPE y,
                                                    private __CLC_INTN *q);
 
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
-                                                   __CLC_GENTYPE y,
-                                                   private __CLC_INTN *q);
-
 _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
                                                    __CLC_GENTYPE y,
                                                    global __CLC_INTN *q);
diff --git a/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc b/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc
index 6c7193b6f26a9..2c233b36cc73c 100644
--- a/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc
+++ b/libclc/clc/include/clc/shared/binary_with_out_arg_scalarize.inc
@@ -75,3 +75,8 @@ __CLC_FUNCTION(__CLC_ARG1_TYPE x, __CLC_ARG2_TYPE y,
 }
 
 #endif // __CLC_VECSIZE_OR_1 >= 2
+
+#undef __CLC_RET_TYPE
+#undef __CLC_ARG1_TYPE
+#undef __CLC_ARG2_TYPE
+#undef __CLC_OUT_ARG3_TYPE

>From f0715208e16b0c64edd75f1137efe192f6979e2a Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 23 Mar 2026 10:55:02 +0100
Subject: [PATCH 4/4] Address comments

---
 libclc/clc/lib/generic/math/clc_remquo_stret.inc | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/libclc/clc/lib/generic/math/clc_remquo_stret.inc b/libclc/clc/lib/generic/math/clc_remquo_stret.inc
index 78de5cb4615db..eecc25525d8d7 100644
--- a/libclc/clc/lib/generic/math/clc_remquo_stret.inc
+++ b/libclc/clc/lib/generic/math/clc_remquo_stret.inc
@@ -8,16 +8,11 @@
 
 #ifdef __CLC_SCALAR
 
-#if __CLC_FPSIZE == 32
-#define __CLC_REMQUO_EVAL_TYPE __CLC_FLOATN
-#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_FLOATN
-#define __CLC_S_EVAL_TYPE __CLC_INTN
-#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_INTN
-#elif __CLC_FPSIZE == 64
-#define __CLC_REMQUO_EVAL_TYPE __CLC_DOUBLEN
-#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_DOUBLEN
-#define __CLC_S_EVAL_TYPE __CLC_LONGN
-#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_LONGN
+#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64
+#define __CLC_REMQUO_EVAL_TYPE __CLC_GENTYPE
+#define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_GENTYPE
+#define __CLC_S_EVAL_TYPE __CLC_S_GENTYPE
+#define __CLC_CONVERT_S_EVAL_TYPE __CLC_CONVERT_S_GENTYPE
 #elif __CLC_FPSIZE == 16
 #define __CLC_REMQUO_EVAL_TYPE __CLC_FLOATN
 #define __CLC_CONVERT_REMQUO_EVAL_TYPE __CLC_CONVERT_FLOATN