[libclc] 20c15c7 - [libclc] replace float remquo with amd ocml implementation (#177131)

Sun Jan 25 16:11:18 PST 2026

Author: Wenju He
Date: 2026-01-26T08:11:14+08:00
New Revision: 20c15c7afe97a546d339a5500d3fa273412e2fda

URL: https://github.com/llvm/llvm-project/commit/20c15c7afe97a546d339a5500d3fa273412e2fda
DIFF: https://github.com/llvm/llvm-project/commit/20c15c7afe97a546d339a5500d3fa273412e2fda.diff

LOG: [libclc] replace float remquo with amd ocml implementation (#177131)

Current implementation has two issues:
* unconditionally soft flushes denormal.
* can't pass OpenCL CTS test "test_bruteforce remquo" on intel gpu.

This PR upstreams remquo implementation from
https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs/ocml/src/remainderF_base.h
It supports denormal and can pass OpenCL CTS test. Number of LLVM IR
instructions of function _Z6remquoffPU3AS5i increased from 96 to 680.

---------

Co-authored-by: Copilot <175728472+Copilot at users.noreply.github.com>

Added: 
    

Modified: 
    libclc/clc/lib/generic/math/clc_remquo.cl
    libclc/clc/lib/generic/math/clc_remquo.inc

Removed: 
    


################################################################################
diff  --git a/libclc/clc/lib/generic/math/clc_remquo.cl b/libclc/clc/lib/generic/math/clc_remquo.cl
index fd83ead06d89a..db7ab8ceaa073 100644

--- a/libclc/clc/lib/generic/math/clc_remquo.cl
+++ b/libclc/clc/lib/generic/math/clc_remquo.cl
@@ -7,14 +7,23 @@
 //===----------------------------------------------------------------------===//
 
 #include <clc/clc_convert.h>
+#include <clc/float/definitions.h>
 #include <clc/integer/clc_clz.h>
 #include <clc/internal/clc.h>
+#include <clc/math/clc_copysign.h>
+#include <clc/math/clc_fabs.h>
 #include <clc/math/clc_floor.h>
 #include <clc/math/clc_fma.h>
+#include <clc/math/clc_frexp.h>
 #include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_subnormal_config.h>
+#include <clc/math/clc_nan.h>
+#include <clc/math/clc_native_recip.h>
+#include <clc/math/clc_rint.h>
+#include <clc/math/clc_sincos_helpers.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/math/math.h>
+#include <clc/relational/clc_isfinite.h>
+#include <clc/relational/clc_isnan.h>
 #include <clc/shared/clc_max.h>
 
 #define __CLC_ADDRESS_SPACE private

diff  --git a/libclc/clc/lib/generic/math/clc_remquo.inc b/libclc/clc/lib/generic/math/clc_remquo.inc
index 3a76ffed7f039..79eef077bf691 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.inc
+++ b/libclc/clc/lib/generic/math/clc_remquo.inc
@@ -8,69 +8,82 @@
 
 _CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y,
                                           __CLC_ADDRESS_SPACE int *quo) {
-  x = __clc_flush_denormal_if_not_supported(x);
-  y = __clc_flush_denormal_if_not_supported(y);
-  int ux = __clc_as_int(x);
-  int ax = ux & EXSIGNBIT_SP32;
-  float xa = __clc_as_float(ax);
-  int sx = ux ^ ax;
-  int ex = ax >> EXPSHIFTBITS_SP32;
-
-  int uy = __clc_as_int(y);
-  int ay = uy & EXSIGNBIT_SP32;
-  float ya = __clc_as_float(ay);
-  int sy = uy ^ ay;
-  int ey = ay >> EXPSHIFTBITS_SP32;
-
-  float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff));
-  float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff));
-  int c;
-  int k = ex - ey;
-
-  uint q = 0;
-
-  while (k > 0) {
-    c = xr >= yr;
-    q = (q << 1) | c;
-    xr -= c ? yr : 0.0f;
-    xr += xr;
-    --k;
+  const int bits = 12;
+  float ax = __clc_fabs(x);
+  float ay = __clc_fabs(y);
+  float ret;
+  int q7;
+  if (ax > ay) {
+    int ex, ey;
+    {
+      int _exp;
+      float _mant = __clc_frexp(ax, &_exp);
+      ex = _exp - 1;
+      ax = __clc_ldexp(_mant, bits);
+    }
+    {
+      int _exp;
+      float _mant = __clc_frexp(ay, &_exp);
+      ey = _exp - 1;
+      ay = __clc_ldexp(_mant, 1);
+    }
+    int nb = ex - ey;
+    float ayinv = __clc_native_recip(ay);
+    int qacc = 0;
+    while (nb > bits) {
+      float q = __clc_rint(ax * ayinv);
+      ax = __clc_fma(-q, ay, ax);
+      int clt = ax < 0.0f;
+      float axp = ax + ay;
+      ax = clt ? axp : ax;
+
+      int iq = (int)q;
+      iq -= clt;
+      qacc = (qacc << bits) | iq;
+
+      ax = __clc_ldexp(ax, bits);
+      nb -= bits;
+    }
+    ax = __clc_ldexp(ax, nb - bits + 1);
+    {
+      float q = __clc_rint(ax * ayinv);
+      ax = __clc_fma(-q, ay, ax);
+      int clt = ax < 0.0f;
+      float axp = ax + ay;
+      ax = clt ? axp : ax;
+      int iq = (int)q;
+      iq -= clt;
+      qacc = (qacc << (nb + 1)) | iq;
+    }
+    int aq = (2.0f * ax > ay) | ((qacc & 0x1) & (2.0f * ax == ay));
+    ax = ax - (aq ? ay : 0.0f);
+    qacc += aq;
+    int qneg = (__clc_as_int(x) ^ __clc_as_int(y)) >> 31;
+    q7 = ((qacc & 0x7f) ^ qneg) - qneg;
+    ax = __clc_ldexp(ax, ey);
+    ret =
+        __clc_as_float((__clc_as_int(x) & (int)0x80000000) ^ __clc_as_int(ax));
+  } else {
+    ret = x;
+    q7 = 0;
+    bool c = (ay < 0x1.0p+127f & 2.0f * ax > ay) | (ax > 0.5f * ay);
+
+    int qsgn = 1 + (((__clc_as_int(x) ^ __clc_as_int(y)) >> 31) << 1);
+    float t = __clc_fma(y, -(float)qsgn, x);
+    ret = c ? t : __builtin_elementwise_canonicalize(x);
+    q7 = c ? qsgn : q7;
+    ret = ax == ay ? __clc_copysign(0.0f, x) : ret;
+    q7 = ax == ay ? qsgn : q7;
   }
 
-  c = xr > yr;
-  q = (q << 1) | c;
-  xr -= c ? yr : 0.0f;
+  ret = y == 0.0f ? __clc_nan(0) : ret;
+  q7 = y == 0.0f ? 0 : q7;
+  bool c = !__clc_isnan(y) && __clc_isfinite(x);
+  ret = c ? ret : __clc_nan(0);
+  q7 = c ? q7 : 0;
 
-  int lt = ex < ey;
-
-  q = lt ? 0 : q;
-  xr = lt ? xa : xr;
-  yr = lt ? ya : yr;
-
-  c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1));
-  xr -= c ? yr : 0.0f;
-  q += c;
-
-  float s = __clc_as_float(ey << EXPSHIFTBITS_SP32);
-  xr *= lt ? 1.0f : s;
-
-  int qsgn = sx == sy ? 1 : -1;
-  int quot = (q & 0x7f) * qsgn;
-
-  c = ax == ay;
-  quot = c ? qsgn : quot;
-  xr = c ? 0.0f : xr;
-
-  xr = __clc_as_float(sx ^ __clc_as_int(xr));
-
-  c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 |
-      ay == 0;
-  quot = c ? 0 : quot;
-  xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr;
-
-  *quo = quot;
-
-  return xr;
+  *quo = q7;
+  return ret;
 }
 
 // remquo signature is special, we don't have macro for this