[libclc] replace float remquo with amd ocml implementation (PR #177131)
Wenju He via cfe-commits
cfe-commits at lists.llvm.org
Wed Jan 21 02:23:01 PST 2026
https://github.com/wenju-he updated https://github.com/llvm/llvm-project/pull/177131
>From c2e8802a874bdb368b349be588e18b410d0c1985 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Wed, 21 Jan 2026 07:41:41 +0100
Subject: [PATCH 1/3] [libclc] replace float remquo with amd ocml
implementation
The current implementation has two issues:
* It unconditionally soft-flushes denormals.
* It can't pass the OpenCL CTS test "test_bruteforce remquo" on Intel GPUs.
This PR upstreams the remquo implementation from
https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs/ocml/src/remainderF_base.h
It supports denormals and passes the OpenCL CTS test.
Note that __oclc_finite_only_opt is set to false, as there is no dynamic
dispatching for the generic implementation.
The number of LLVM IR instructions in function _Z6remquoffPU3AS5i increases
from 96 to 678.
---
libclc/clc/lib/generic/math/clc_remquo.cl | 11 +-
libclc/clc/lib/generic/math/clc_remquo.inc | 150 ++++++++++++---------
2 files changed, 99 insertions(+), 62 deletions(-)
diff --git a/libclc/clc/lib/generic/math/clc_remquo.cl b/libclc/clc/lib/generic/math/clc_remquo.cl
index fd83ead06d89a..db7ab8ceaa073 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.cl
+++ b/libclc/clc/lib/generic/math/clc_remquo.cl
@@ -7,14 +7,23 @@
//===----------------------------------------------------------------------===//
#include <clc/clc_convert.h>
+#include <clc/float/definitions.h>
#include <clc/integer/clc_clz.h>
#include <clc/internal/clc.h>
+#include <clc/math/clc_copysign.h>
+#include <clc/math/clc_fabs.h>
#include <clc/math/clc_floor.h>
#include <clc/math/clc_fma.h>
+#include <clc/math/clc_frexp.h>
#include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_subnormal_config.h>
+#include <clc/math/clc_nan.h>
+#include <clc/math/clc_native_recip.h>
+#include <clc/math/clc_rint.h>
+#include <clc/math/clc_sincos_helpers.h>
#include <clc/math/clc_trunc.h>
#include <clc/math/math.h>
+#include <clc/relational/clc_isfinite.h>
+#include <clc/relational/clc_isnan.h>
#include <clc/shared/clc_max.h>
#define __CLC_ADDRESS_SPACE private
diff --git a/libclc/clc/lib/generic/math/clc_remquo.inc b/libclc/clc/lib/generic/math/clc_remquo.inc
index 3a76ffed7f039..69c9a8731e907 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.inc
+++ b/libclc/clc/lib/generic/math/clc_remquo.inc
@@ -8,69 +8,97 @@
_CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y,
__CLC_ADDRESS_SPACE int *quo) {
- x = __clc_flush_denormal_if_not_supported(x);
- y = __clc_flush_denormal_if_not_supported(y);
- int ux = __clc_as_int(x);
- int ax = ux & EXSIGNBIT_SP32;
- float xa = __clc_as_float(ax);
- int sx = ux ^ ax;
- int ex = ax >> EXPSHIFTBITS_SP32;
-
- int uy = __clc_as_int(y);
- int ay = uy & EXSIGNBIT_SP32;
- float ya = __clc_as_float(ay);
- int sy = uy ^ ay;
- int ey = ay >> EXPSHIFTBITS_SP32;
-
- float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff));
- float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff));
- int c;
- int k = ex - ey;
-
- uint q = 0;
-
- while (k > 0) {
- c = xr >= yr;
- q = (q << 1) | c;
- xr -= c ? yr : 0.0f;
- xr += xr;
- --k;
+ const int bits = 12;
+ float ax = __clc_fabs(x);
+ float ay = __clc_fabs(y);
+ float ret;
+ int q7;
+ if (ax > ay) {
+ int ex, ey;
+ ex = ({
+ int _exp;
+ __clc_frexp(ax, &_exp);
+ _exp;
+ }) -
+ 1;
+ ax = __clc_ldexp(({
+ int _exp;
+ __clc_frexp(ax, &_exp);
+ }),
+ bits);
+ ey = ({
+ int _exp;
+ __clc_frexp(ay, &_exp);
+ _exp;
+ }) -
+ 1;
+ ay = __clc_ldexp(({
+ int _exp;
+ __clc_frexp(ay, &_exp);
+ }),
+ 1);
+ int nb = ex - ey;
+ float ayinv = __clc_native_recip(ay);
+ int qacc = 0;
+ while (nb > bits) {
+ float q = __clc_rint(ax * ayinv);
+ ax = __clc_fma(-q, ay, ax);
+ int clt = ax < 0.0f;
+ float axp = ax + ay;
+ ax = clt ? axp : ax;
+
+ int iq = (int)q;
+ iq -= clt;
+ qacc = (qacc << bits) | iq;
+
+ ax = __clc_ldexp(ax, bits);
+ nb -= bits;
+ }
+ ax = __clc_ldexp(ax, nb - bits + 1);
+ {
+ float q = __clc_rint(ax * ayinv);
+ ax = __clc_fma(-q, ay, ax);
+ int clt = ax < 0.0f;
+ float axp = ax + ay;
+ ax = clt ? axp : ax;
+ int iq = (int)q;
+ iq -= clt;
+ qacc = (qacc << (nb + 1)) | iq;
+ }
+ int aq = (2.0f * ax > ay) | ((qacc & 0x1) & (2.0f * ax == ay));
+ ax = ax - (aq ? ay : 0.0f);
+ qacc += aq;
+ int qneg = (__clc_as_int(x) ^ __clc_as_int(y)) >> 31;
+ q7 = ((qacc & 0x7f) ^ qneg) - qneg;
+ ax = __clc_ldexp(ax, ey);
+ ret =
+ __clc_as_float((__clc_as_int(x) & (int)0x80000000) ^ __clc_as_int(ax));
+ } else {
+ ret = x;
+ q7 = 0;
+ bool c = (ay<0x1.0p+127f & 2.0f * ax> ay) | (ax > 0.5f * ay);
+
+ int qsgn = 1 + (((__clc_as_int(x) ^ __clc_as_int(y)) >> 31) << 1);
+ float t = __clc_fma(y, -(float)qsgn, x);
+ ret = c ? t
+ : (__builtin_isfpclass(__builtin_canonicalizef(0x1p-149f), 0x0040)
+ ? __builtin_canonicalizef(x)
+ : x);
+ q7 = c ? qsgn : q7;
+ ret = ax == ay ? __clc_copysign(0.0f, x) : ret;
+ q7 = ax == ay ? qsgn : q7;
+ }
+ bool __oclc_finite_only_opt = false;
+ if (!__oclc_finite_only_opt) {
+ ret = y == 0.0f ? __clc_nan(0) : ret;
+ q7 = y == 0.0f ? 0 : q7;
+ bool c = !__clc_isnan(y) && __clc_isfinite(x);
+ ret = c ? ret : __clc_nan(0);
+ q7 = c ? q7 : 0;
}
- c = xr > yr;
- q = (q << 1) | c;
- xr -= c ? yr : 0.0f;
-
- int lt = ex < ey;
-
- q = lt ? 0 : q;
- xr = lt ? xa : xr;
- yr = lt ? ya : yr;
-
- c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1));
- xr -= c ? yr : 0.0f;
- q += c;
-
- float s = __clc_as_float(ey << EXPSHIFTBITS_SP32);
- xr *= lt ? 1.0f : s;
-
- int qsgn = sx == sy ? 1 : -1;
- int quot = (q & 0x7f) * qsgn;
-
- c = ax == ay;
- quot = c ? qsgn : quot;
- xr = c ? 0.0f : xr;
-
- xr = __clc_as_float(sx ^ __clc_as_int(xr));
-
- c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 |
- ay == 0;
- quot = c ? 0 : quot;
- xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr;
-
- *quo = quot;
-
- return xr;
+ *quo = q7;
+ return ret;
}
// remquo signature is special, we don't have macro for this
>From 84e91fe2ea08ec5ac22ec7319831408cf36021ae Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Wed, 21 Jan 2026 11:20:26 +0100
Subject: [PATCH 2/3] always canonicalize
---
libclc/clc/lib/generic/math/clc_remquo.inc | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/libclc/clc/lib/generic/math/clc_remquo.inc b/libclc/clc/lib/generic/math/clc_remquo.inc
index 69c9a8731e907..7281aef29ef44 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.inc
+++ b/libclc/clc/lib/generic/math/clc_remquo.inc
@@ -80,10 +80,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y,
int qsgn = 1 + (((__clc_as_int(x) ^ __clc_as_int(y)) >> 31) << 1);
float t = __clc_fma(y, -(float)qsgn, x);
- ret = c ? t
- : (__builtin_isfpclass(__builtin_canonicalizef(0x1p-149f), 0x0040)
- ? __builtin_canonicalizef(x)
- : x);
+ ret = c ? t : __builtin_elementwise_canonicalize(x);
q7 = c ? qsgn : q7;
ret = ax == ay ? __clc_copysign(0.0f, x) : ret;
q7 = ax == ay ? qsgn : q7;
>From c8fd881acee9e0a7c1b00c0f61f45c26df23488f Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Wed, 21 Jan 2026 18:22:53 +0800
Subject: [PATCH 3/3] Update libclc/clc/lib/generic/math/clc_remquo.inc
Co-authored-by: Copilot <175728472+Copilot at users.noreply.github.com>
---
libclc/clc/lib/generic/math/clc_remquo.inc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libclc/clc/lib/generic/math/clc_remquo.inc b/libclc/clc/lib/generic/math/clc_remquo.inc
index 7281aef29ef44..4babbc60727d2 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.inc
+++ b/libclc/clc/lib/generic/math/clc_remquo.inc
@@ -76,7 +76,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y,
} else {
ret = x;
q7 = 0;
- bool c = (ay<0x1.0p+127f & 2.0f * ax> ay) | (ax > 0.5f * ay);
+ bool c = (ay < 0x1.0p+127f & 2.0f * ax > ay) | (ax > 0.5f * ay);
int qsgn = 1 + (((__clc_as_int(x) ^ __clc_as_int(y)) >> 31) << 1);
float t = __clc_fma(y, -(float)qsgn, x);
More information about the cfe-commits
mailing list