[libclc] Restore previous generic fmod implementation (PR #187470)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 19 03:29:32 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Wenju He (wenju-he)
<details>
<summary>Changes</summary>
Restore the generic fmod implementation as it was before commit 3c7f70bb9cee, for targets that do not yet implement frem. Keep the __builtin_elementwise_fmod-based implementation for AMDGPU.
---
Full diff: https://github.com/llvm/llvm-project/pull/187470.diff
3 Files Affected:
- (modified) libclc/clc/lib/amdgpu/CMakeLists.txt (+1)
- (added) libclc/clc/lib/amdgpu/math/clc_fmod.cl (+15)
- (modified) libclc/clc/lib/generic/math/clc_fmod.cl (+181-4)
``````````diff
diff --git a/libclc/clc/lib/amdgpu/CMakeLists.txt b/libclc/clc/lib/amdgpu/CMakeLists.txt
index a2a30c2941d6b..3e3059570c184 100644
--- a/libclc/clc/lib/amdgpu/CMakeLists.txt
+++ b/libclc/clc/lib/amdgpu/CMakeLists.txt
@@ -5,6 +5,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES
math/clc_exp2.cl
math/clc_exp2_fast.cl
math/clc_exp10.cl
+ math/clc_fmod.cl
math/clc_frexp.cl
math/clc_half_exp.cl
math/clc_half_exp2.cl
diff --git a/libclc/clc/lib/amdgpu/math/clc_fmod.cl b/libclc/clc/lib/amdgpu/math/clc_fmod.cl
new file mode 100644
index 0000000000000..a05b75d968ce7
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/math/clc_fmod.cl
@@ -0,0 +1,15 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_fmod.h"
+
+#define __CLC_FUNCTION __clc_fmod
+#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_fmod
+#define __CLC_BODY "clc/shared/binary_def.inc"
+
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_fmod.cl b/libclc/clc/lib/generic/math/clc_fmod.cl
index 8add0cefd621f..7f60b403b53e6 100644
--- a/libclc/clc/lib/generic/math/clc_fmod.cl
+++ b/libclc/clc/lib/generic/math/clc_fmod.cl
@@ -6,10 +6,187 @@
//
//===----------------------------------------------------------------------===//
-#include "clc/internal/clc.h"
+#include <clc/clc_convert.h>
+#include <clc/integer/clc_clz.h>
+#include <clc/internal/clc.h>
+#include <clc/math/clc_floor.h>
+#include <clc/math/clc_fma.h>
+#include <clc/math/clc_ldexp.h>
+#include <clc/math/clc_trunc.h>
+#include <clc/math/math.h>
+#include <clc/shared/clc_max.h>
+_CLC_DEF _CLC_OVERLOAD float __clc_fmod(float x, float y) { // float fmod computed on the integer bit patterns of x and y
+ int ux = __clc_as_int(x);
+ int ax = ux & EXSIGNBIT_SP32; // presumably masks off the sign bit, leaving the bits of |x|
+ float xa = __clc_as_float(ax);
+ int sx = ux ^ ax; // the bits the mask removed from x; XORed back into the result below
+ int ex = ax >> EXPSHIFTBITS_SP32; // exponent field of |x|
+
+ int uy = __clc_as_int(y);
+ int ay = uy & EXSIGNBIT_SP32; // same masking for y; only the masked value of y is used from here on
+ float ya = __clc_as_float(ay);
+ int ey = ay >> EXPSHIFTBITS_SP32; // exponent field of |y|
+
+ float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff)); // low 23 bits of |x| under the exponent of 1.0f (0x3f800000)
+ float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff)); // NOTE(review): an operand with a zero exponent field (subnormal) also gets 1.0f's exponent here - confirm this is intended
+ int c;
+ int k = ex - ey; // loop count: difference of the two exponent fields
+
+ while (k > 0) { // per step: subtract yr when xr >= yr, then double xr
+ c = xr >= yr;
+ xr -= c ? yr : 0.0f;
+ xr += xr;
+ --k;
+ }
+
+ c = xr >= yr; // final conditional subtract, without the doubling
+ xr -= c ? yr : 0.0f;
+
+ int lt = ex < ey; // x has the smaller exponent field: the loop did not run and |x| is used as-is
+
+ xr = lt ? xa : xr;
+ yr = lt ? ya : yr; // NOTE(review): yr is not read again after this assignment
+
+ float s = __clc_as_float(ey << EXPSHIFTBITS_SP32); // float whose exponent field is ey and whose other bits are zero
+ xr *= lt ? 1.0f : s; // rescale the reduced value by s unless |x| was taken unchanged
+
+ c = ax == ay; // |x| and |y| have identical bits: magnitude of the result is zero
+ xr = c ? 0.0f : xr;
+
+ xr = __clc_as_float(sx ^ __clc_as_int(xr)); // put back the bits of x that were masked off at the top
+
+ c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 |
+ ay == 0; // presumably: x is NaN, y is NaN, x is infinite, or y is zero
+ xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr; // any of those cases overrides the result with QNANBITPATT_SP32
+
+ return xr;
+}
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_fmod
+#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) { // double fmod by repeated partial remainders of |x| against scaled |y|
+ ulong ux = __clc_as_ulong(x);
+ ulong ax = ux & ~SIGNBIT_DP64; // bits of x with SIGNBIT_DP64 cleared
+ ulong xsgn = ux ^ ax; // only the bit(s) cleared above; XORed back into the result
+ double dx = __clc_as_double(ax);
+ int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64);
+ int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64); // alternative exponent derived from the leading-zero count of the mantissa bits
+ xexp1 = xexp < 1 ? xexp1 : xexp; // the alternative is used only when the exponent field is below 1
+
+ ulong uy = __clc_as_ulong(y);
+ ulong ay = uy & ~SIGNBIT_DP64;
+ double dy = __clc_as_double(ay);
+ int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64);
+ int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64);
+ yexp1 = yexp < 1 ? yexp1 : yexp;
+
+ // First assume |x| > |y|; the other orderings are patched in afterwards.
+
+ // Set ntimes to the number of times we need to do a
+ // partial remainder. If the exponent of x is an exact multiple
+ // of 53 larger than the exponent of y, and the mantissa of x is
+ // less than the mantissa of y, ntimes will be one too large
+ // but it doesn't matter - it just means that we'll go round
+ // the loop below one extra time.
+ int ntimes = __clc_max(0, (xexp1 - yexp1) / 53);
+ double w = __clc_ldexp(dy, ntimes * 53);
+ w = ntimes == 0 ? dy : w;
+ double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
+
+ // Each time round the loop we compute a partial remainder.
+ // This is done by subtracting a large multiple of w
+ // from x each time, where w is a scaled up version of y.
+ // The subtraction must be performed exactly in quad
+ // precision, though the result at each stage can
+ // fit exactly in a double precision number.
+ int i;
+ double t, v, p, pp;
+
+ for (i = 0; i < ntimes; i++) {
+ // Compute integral multiplier
+ t = __clc_trunc(dx / w);
+
+ // Compute w * t in quad precision: p is the rounded product, pp the fma residual
+ p = w * t;
+ pp = __clc_fma(w, t, -p);
+
+ // Subtract w * t (held as p and pp) from dx
+ v = dx - p;
+ dx = v + (((dx - v) - p) - pp);
+
+ // If t was one too large, dx will be negative. Add back one w.
+ dx += dx < 0.0 ? w : 0.0;
+
+ // Scale w down by 2^(-53) for the next iteration
+ w *= scale;
+ }
+
+ // One more time, using floor instead of trunc for the multiplier
+ t = __clc_floor(dx / w);
+
+ p = w * t;
+ pp = __clc_fma(w, t, -p);
+ v = dx - p;
+ dx = v + (((dx - v) - p) - pp);
+ i = dx < 0.0; // the loop counter is reused here as a flag
+ dx += i ? w : 0.0;
+
+ // At this point, dx lies in the range [0,dy)
+ double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx));
+ dx = __clc_as_double(ax); // dx is reset to |x| for the comparisons below
+
+ // Now handle |x| == |y|
+ int c = dx == dy;
+ t = __clc_as_double(xsgn); // only the sign bits of x set, all other bits zero
+ ret = c ? t : ret;
+
+ // Next, handle |x| < |y|
+ c = dx < dy;
+ ret = c ? x : ret;
+
+ // We don't need anything special for |x| == 0
+
+ // |y| is 0
+ c = dy == 0.0;
+ ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
+
+ // y is +-Inf, NaN
+ c = yexp > BIASEDEMAX_DP64;
+ t = y == y ? x : y; // y == y is false only when y is NaN, in which case y itself is returned
+ ret = c ? t : ret;
+
+ // x is +-Inf, NaN
+ c = xexp > BIASEDEMAX_DP64;
+ ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
+
+ return ret;
+}
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_FUNCTION __clc_fmod
+#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
+
+#endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// Forward the half version of this builtin onto the float one
+#define __CLC_HALF_ONLY
#define __CLC_FUNCTION __clc_fmod
-#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_fmod
-#define __CLC_BODY "clc/shared/binary_def.inc"
+#define __CLC_BODY <clc/math/binary_def_via_fp32.inc>
+#include <clc/math/gentype.inc>
-#include "clc/math/gentype.inc"
+#endif
``````````
</details>
https://github.com/llvm/llvm-project/pull/187470
More information about the cfe-commits
mailing list