[clang] Implement macro poisoning for foreign CUDA headers from Nvidia Toolkit (PR #187696)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Mar 20 06:01:22 PDT 2026
https://github.com/fenodem created https://github.com/llvm/llvm-project/pull/187696
@localspook
@yxsamliu
@emankov
Do not merge!
This PR was created to document my efforts on https://github.com/llvm/llvm-project/issues/119661
I used this .cu file:
```
// test_cuda_math_minimal.cu
__global__ void minimal_test() {
double d = -1.0;
float f = -1.0f;
// These 8 calls trigger all the errors in the issue
int a = isfinite(d);
int b = isfinite(f);
int c = isinf(d);
int d2 = isinf(f);
int e = isnan(d);
int f2 = isnan(f);
int g = signbit(d);
int h = signbit(f);
(void)a; (void)b; (void)c; (void)d2;
(void)e; (void)f2; (void)g; (void)h;
}
int main() {
minimal_test<<<1, 1>>>();
cudaDeviceSynchronize();
return 0;
}
```
with this command:
```
clang++ "C:\downloads\test_cuda_math_minimal.cu" ^
-o "C:\Downloads\test_cuda_math_minimal.cu.exe" ^
--target=x86_64-windows-gnu ^
--cuda-path="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6" ^
--cuda-gpu-arch=sm_89 ^
--sysroot="c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64" ^
-lcudart
```
Before this commit, I was getting the same errors that appear in the second listing of https://github.com/llvm/llvm-project/issues/119661#issue-2734927853
If anyone knows how to make clang++.exe pick up the CUDA headers from a directory next to the compiler rather than from --sysroot, please report it here!
After applying these patches, I found that the macro poisoning was not taking effect (strangely). The errors below are what remains after applying them. (I could not figure out how to make clang use the LLVM headers instead of the ones from GCC, so I applied these diffs directly to the LLVM headers shipped alongside clang++.exe.)
```
In file included from C:/Downloads/winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1/mingw64/lib/clang/18/include/__clang_cuda_runtime_wrapper.h:41:
In file included from C:/Downloads/winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1/mingw64/lib/clang/18/include/cuda_wrappers/cmath:27:
In file included from c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/cmath:49:
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/bits/std_abs.h:137:7: error:
__float128 is not supported on this target
137 | abs(__float128 __x)
| ^
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/bits/std_abs.h:136:3: error:
__float128 is not supported on this target
136 | __float128
| ^
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/bits/std_abs.h:137:18: note:
'__x' defined here
137 | abs(__float128 __x)
| ^
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/bits/std_abs.h:137:18: note:
'__x' defined here
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/bits/std_abs.h:137:18: note:
'__x' defined here
In file included from <built-in>:1:
In file included from C:/Downloads/winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1/mingw64/lib/clang/18/include/__clang_cuda_runtime_wrapper.h:41:
In file included from C:/Downloads/winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1/mingw64/lib/clang/18/include/cuda_wrappers/cmath:27:
In file included from c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/cmath:3898:
In file included from c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/bits/specfun.h:43:
In file included from c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/bits/stl_algobase.h:64:
In file included from c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/bits/stl_pair.h:60:
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/type_traits:514:39: error:
__float128 is not supported on this target
514 | struct __is_floating_point_helper<__float128>
| ^
In file included from <built-in>:1:
In file included from C:/Downloads/winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1/mingw64/lib/clang/18/include/__clang_cuda_runtime_wrapper.h:41:
In file included from C:/Downloads/winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1/mingw64/lib/clang/18/include/cuda_wrappers/cmath:27:
In file included from c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/cmath:3898:
In file included from c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/bits/specfun.h:43:
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/bits/stl_algobase.h:1079:21: error:
__float128 is not supported on this target
1079 | __size_to_integer(__float128 __n) { return (long long)__n; }
| ^
In file included from <built-in>:1:
In file included from C:/Downloads/winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1/mingw64/lib/clang/18/include/__clang_cuda_runtime_wrapper.h:41:
In file included from C:/Downloads/winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1/mingw64/lib/clang/18/include/cuda_wrappers/cmath:27:
In file included from c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/cmath:3898:
In file included from c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/bits/specfun.h:44:
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/limits:2089:27: error:
__float128 is not supported on this target
2089 | struct numeric_limits<__float128>
| ^
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/limits:2093:33: error:
__float128 is not supported on this target
2093 | static _GLIBCXX_CONSTEXPR __float128
| ^
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/limits:2104:33: error:
__float128 is not supported on this target
2104 | static _GLIBCXX_CONSTEXPR __float128
| ^
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/limits:2118:33: error:
__float128 is not supported on this target
2118 | static _GLIBCXX_CONSTEXPR __float128
| ^
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/limits:2132:33: error:
__float128 is not supported on this target
2132 | static _GLIBCXX_CONSTEXPR __float128
| ^
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/limits:2136:33: error:
__float128 is not supported on this target
2136 | static _GLIBCXX_CONSTEXPR __float128
| ^
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/limits:2156:33: error:
__float128 is not supported on this target
2156 | static _GLIBCXX_CONSTEXPR __float128
| ^
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/limits:2160:33: error:
__float128 is not supported on this target
2160 | static _GLIBCXX_CONSTEXPR __float128
| ^
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/limits:2164:33: error:
__float128 is not supported on this target
2164 | static _GLIBCXX_CONSTEXPR __float128
| ^
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/limits:2176:33: error:
__float128 is not supported on this target
2176 | static _GLIBCXX_CONSTEXPR __float128
| ^
c:\Downloads\winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1\mingw64/include/c++/14.2.0/limits:2170:28: error:
__float128 is not supported on this target
2170 | return __builtin_bit_cast(__float128, __builtin_nansf128(""));
| ^
In file included from <built-in>:1:
In file included from C:/Downloads/winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1/mingw64/lib/clang/18/include/__clang_cuda_runtime_wrapper.h:349:
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6/include/crt/math_functions.hpp:413:10: error: no matching
function for call to '__signbitl'
413 | return __signbitl(a);
| ^~~~~~~~~~
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6/include/crt/math_functions.hpp:2672:14: note: candidate
function not viable: call to __host__ function from __device__ function
2672 | __func__(int __signbitl(const long double a))
| ^
In file included from <built-in>:1:
In file included from C:/Downloads/winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1/mingw64/lib/clang/18/include/__clang_cuda_runtime_wrapper.h:349:
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6/include/crt/math_functions.hpp:418:10: error: no matching
function for call to '__signbit'
418 | return __signbit(a);
| ^~~~~~~~~
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6/include/crt/math_functions.hpp:1289:14: note: candidate
function not viable: call to __host__ function from __device__ function
1289 | __func__(int __signbit(double a))
| ^
In file included from <built-in>:1:
In file included from C:/Downloads/winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1/mingw64/lib/clang/18/include/__clang_cuda_runtime_wrapper.h:349:
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6/include/crt/math_functions.hpp:430:10: error: no matching
function for call to '__isinfl'
430 | return __isinfl(a);
| ^~~~~~~~
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6/include/crt/math_functions.hpp:2692:14: note: candidate
function not viable: call to __host__ function from __device__ function
2692 | __func__(int __isinfl(const long double a))
| ^
In file included from <built-in>:1:
In file included from C:/winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1/mingw64/lib/clang/18/include/__clang_cuda_runtime_wrapper.h:349:
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6/include/crt/math_functions.hpp:444:62: error: functions that
differ only in their return type cannot be overloaded
444 | static __inline__ __host__ __device__ __cudart_builtin__ int isinf(const double a)
| ~~~ ^
C:/Downloads/winlibs-x86_64-posix-seh-gcc-14.2.0-llvm-18.1.8-mingw-w64ucrt-12.0.0-r1/mingw64/lib/clang/18/include/__clang_cuda_math_forward_declares.h:99:17: note:
previous declaration is here
99 | __DEVICE__ bool isinf(double);
| ~~~~ ^
fatal error: too many errors emitted, stopping now [-ferror-limit=]
20 errors generated when compiling for sm_89.
```
>From f3af2cc29fb5109416c3f7a847c097e644e8eeea Mon Sep 17 00:00:00 2001
From: fenodem <fenodem at protonmail.com>
Date: Fri, 20 Mar 2026 09:34:28 +0000
Subject: [PATCH 1/4] Update __clang_cuda_math_forward_declares.h
---
.../__clang_cuda_math_forward_declares.h | 65 ++++++++++++-------
1 file changed, 40 insertions(+), 25 deletions(-)
diff --git a/clang/lib/Headers/__clang_cuda_math_forward_declares.h b/clang/lib/Headers/__clang_cuda_math_forward_declares.h
index 45fe1e5b1772d..d8091ea77ff49 100644
--- a/clang/lib/Headers/__clang_cuda_math_forward_declares.h
+++ b/clang/lib/Headers/__clang_cuda_math_forward_declares.h
@@ -12,12 +12,23 @@
#error "This file is for CUDA/HIP compilation only."
#endif
-// This file forward-declares of some math functions we (or the CUDA headers)
-// will define later. We need to do this, and do it before cmath is included,
-// because the standard library may have constexpr math functions. In the
-// absence of a prior __device__ decl, those constexpr functions may become
-// implicitly host+device. host+device functions can't be overloaded, so that
-// would preclude the use of our own __device__ overloads for these functions.
+// PURPOSE: Forward-declare __device__ math functions before <cmath> is included.
+// Prevents standard library constexpr functions from becoming implicit
+// __host__ __device__, which would clash with our __device__ overloads.
+
+// ---------------------------------------------------------------------------
+// Return Type: CUDA headers return 'bool' on MSVC, but 'int' on POSIX.
+// Mismatches here cause "functions differ only in return type" errors.
+// ---------------------------------------------------------------------------
+// CORRECTED: Force 'int' for all CUDA compilations to match CUDA SDK headers
+// (math_functions.hpp), which define these as returning int regardless of host.
+#if defined(__CUDA__)
+#define __CUDA_CLASSIFIER_RET_TYPE int
+#elif defined(__OPENMP_NVPTX__)
+#define __CUDA_CLASSIFIER_RET_TYPE int
+#else
+#define __CUDA_CLASSIFIER_RET_TYPE int
+#endif
#pragma push_macro("__DEVICE__")
#define __DEVICE__ \
@@ -89,31 +100,38 @@ __DEVICE__ double hypot(double, double);
__DEVICE__ float hypot(float, float);
__DEVICE__ int ilogb(double);
__DEVICE__ int ilogb(float);
-#ifdef _MSC_VER
-__DEVICE__ bool isfinite(long double);
+
+// ---------------------------------------------------------------------------
+// Classification Functions
+// ---------------------------------------------------------------------------
+// Note: We declare long double versions here if not MSVC to match
+// __clang_cuda_cmath.h logic, but they require implementations in
+// __clang_cuda_device_functions.h to avoid link errors.
+#if !defined(_MSC_VER)
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isfinite(long double);
#endif
-__DEVICE__ bool isfinite(double);
-__DEVICE__ bool isfinite(float);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isfinite(double);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isfinite(float);
__DEVICE__ bool isgreater(double, double);
__DEVICE__ bool isgreaterequal(double, double);
__DEVICE__ bool isgreaterequal(float, float);
__DEVICE__ bool isgreater(float, float);
-#ifdef _MSC_VER
-__DEVICE__ bool isinf(long double);
+#if !defined(_MSC_VER)
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isinf(long double);
#endif
-__DEVICE__ bool isinf(double);
-__DEVICE__ bool isinf(float);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isinf(double);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isinf(float);
__DEVICE__ bool isless(double, double);
__DEVICE__ bool islessequal(double, double);
__DEVICE__ bool islessequal(float, float);
__DEVICE__ bool isless(float, float);
__DEVICE__ bool islessgreater(double, double);
__DEVICE__ bool islessgreater(float, float);
-#ifdef _MSC_VER
-__DEVICE__ bool isnan(long double);
+#if !defined(_MSC_VER)
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isnan(long double);
#endif
-__DEVICE__ bool isnan(double);
-__DEVICE__ bool isnan(float);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isnan(double);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isnan(float);
__DEVICE__ bool isnormal(double);
__DEVICE__ bool isnormal(float);
__DEVICE__ bool isunordered(double, double);
@@ -165,11 +183,11 @@ __DEVICE__ double scalbln(double, long);
__DEVICE__ float scalbln(float, long);
__DEVICE__ double scalbn(double, int);
__DEVICE__ float scalbn(float, int);
-#ifdef _MSC_VER
-__DEVICE__ bool signbit(long double);
+#if !defined(_MSC_VER)
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE signbit(long double);
#endif
-__DEVICE__ bool signbit(double);
-__DEVICE__ bool signbit(float);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE signbit(double);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE signbit(float);
__DEVICE__ double sin(double);
__DEVICE__ float sin(float);
__DEVICE__ double sinh(double);
@@ -185,9 +203,6 @@ __DEVICE__ float tgamma(float);
__DEVICE__ double trunc(double);
__DEVICE__ float trunc(float);
-// Notably missing above is nexttoward, which we don't define on
-// the device side because libdevice doesn't give us an implementation, and we
-// don't want to be in the business of writing one ourselves.
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
>From a291032af6d170b921e028555c6f257ba47eece9 Mon Sep 17 00:00:00 2001
From: fenodem <fenodem at protonmail.com>
Date: Fri, 20 Mar 2026 09:48:57 +0000
Subject: [PATCH 2/4] Update __clang_cuda_device_functions.h
---
.../Headers/__clang_cuda_device_functions.h | 79 ++++++++++++++-----
1 file changed, 61 insertions(+), 18 deletions(-)
diff --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h
index 0226fe95abab6..4658c92d37206 100644
--- a/clang/lib/Headers/__clang_cuda_device_functions.h
+++ b/clang/lib/Headers/__clang_cuda_device_functions.h
@@ -223,11 +223,65 @@ __DEVICE__ float __fdividef(float __a, float __b) {
}
__DEVICE__ int __ffs(int __a) { return __nv_ffs(__a); }
__DEVICE__ int __ffsll(long long __a) { return __nv_ffsll(__a); }
-__DEVICE__ int __finite(double __a) { return __nv_isfinited(__a); }
-__DEVICE__ int __finitef(float __a) { return __nv_finitef(__a); }
-#ifdef _MSC_VER
-__DEVICE__ int __finitel(long double __a);
-#endif
+
+// ---------------------------------------------------------------------------
+// Classification Function Internal Names
+// ---------------------------------------------------------------------------
+// WARNING: Do NOT consolidate these functions. CUDA's math_functions.hpp calls
+// distinct names (e.g., __signbit vs __signbitd). Removing one causes
+// "no matching function" errors.
+//
+// Note: We use __inline__ without static. This provides external linkage
+// semantics which matches the expectations of CUDA headers declaring these
+// as 'extern' for the GCC/MinGW environment, while still allowing inlining.
+// ---------------------------------------------------------------------------
+
+// Float implementations
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __finitef(float __a) { return __builtin_isfinite(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isinff(float __a) { return __builtin_isinf(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isnanf(float __a) { return __builtin_isnan(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __signbitf(float __a) { return __builtin_signbit(__a); }
+
+// Double implementations
+// Note: Both __finite and __isfinited are defined because CUDA headers
+// reference distinct names in different contexts (similar to __signbit/__signbitd).
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __finite(double __a) { return __builtin_isfinite(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isfinited(double __a) { return __builtin_isfinite(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isinf(double __a) { return __builtin_isinf(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isnan(double __a) { return __builtin_isnan(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __signbit(double __a) { return __builtin_signbit(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __signbitd(double __a) { return __builtin_signbit(__a); }
+
+// Long double implementations (UNGUARDED - intentional)
+// IMPORTANT: Do NOT cast to double. Clang's builtins natively support long double.
+// Casting causes incorrect results on MinGW/Linux where long double has higher
+// precision than double (e.g. finite values that overflow double).
+// NOTE: Clang does NOT support __builtin_isfinitel. Using suffixed builtins
+// will fail. The generic builtin preserves precision for 80-bit long double
+// on MinGW hosts and handles double demotion on devices automatically.
+// NOTE: Do NOT add #if !defined(_MSC_VER) here. Unlike wrappers, these
+// are __inline__ with distinct names (__finitel vs __finite).
+// They have no linker visibility and are optimized away if unused.
+// CUDA headers may call these on any platform - define unconditionally.
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __finitel(long double __a) { return __builtin_isfinite(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isinfl(long double __a) { return __builtin_isinf(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isnanl(long double __a) { return __builtin_isnan(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __signbitl(long double __a) { return __builtin_signbit(__a); }
+
__DEVICE__ int __float2int_rd(float __a) { return __nv_float2int_rd(__a); }
__DEVICE__ int __float2int_rn(float __a) { return __nv_float2int_rn(__a); }
__DEVICE__ int __float2int_ru(float __a) { return __nv_float2int_ru(__a); }
@@ -433,17 +487,7 @@ __DEVICE__ float __int2float_rn(int __a) { return __nv_int2float_rn(__a); }
__DEVICE__ float __int2float_ru(int __a) { return __nv_int2float_ru(__a); }
__DEVICE__ float __int2float_rz(int __a) { return __nv_int2float_rz(__a); }
__DEVICE__ float __int_as_float(int __a) { return __nv_int_as_float(__a); }
-__DEVICE__ int __isfinited(double __a) { return __nv_isfinited(__a); }
-__DEVICE__ int __isinf(double __a) { return __nv_isinfd(__a); }
-__DEVICE__ int __isinff(float __a) { return __nv_isinff(__a); }
-#ifdef _MSC_VER
-__DEVICE__ int __isinfl(long double __a);
-#endif
-__DEVICE__ int __isnan(double __a) { return __nv_isnand(__a); }
-__DEVICE__ int __isnanf(float __a) { return __nv_isnanf(__a); }
-#ifdef _MSC_VER
-__DEVICE__ int __isnanl(long double __a);
-#endif
+
__DEVICE__ double __ll2double_rd(long long __a) {
return __nv_ll2double_rd(__a);
}
@@ -515,8 +559,7 @@ __DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) {
return __nv_sad(__a, __b, __c);
}
__DEVICE__ float __saturatef(float __a) { return __nv_saturatef(__a); }
-__DEVICE__ int __signbitd(double __a) { return __nv_signbitd(__a); }
-__DEVICE__ int __signbitf(float __a) { return __nv_signbitf(__a); }
+
__DEVICE__ void __sincosf(float __a, float *__s, float *__c) {
return __nv_fast_sincosf(__a, __s, __c);
}
>From eaa6e8296429c55b7d59f425a781a4b27a906b11 Mon Sep 17 00:00:00 2001
From: fenodem <fenodem at protonmail.com>
Date: Fri, 20 Mar 2026 10:01:39 +0000
Subject: [PATCH 3/4] Update __clang_cuda_cmath.h
---
clang/lib/Headers/__clang_cuda_cmath.h | 81 +++++++++++++-------------
1 file changed, 41 insertions(+), 40 deletions(-)
diff --git a/clang/lib/Headers/__clang_cuda_cmath.h b/clang/lib/Headers/__clang_cuda_cmath.h
index 5bbb59a93b9e5..b78cad5f94544 100644
--- a/clang/lib/Headers/__clang_cuda_cmath.h
+++ b/clang/lib/Headers/__clang_cuda_cmath.h
@@ -65,54 +65,57 @@ __DEVICE__ float frexp(float __arg, int *__exp) {
return ::frexpf(__arg, __exp);
}
-// For inscrutable reasons, the CUDA headers define these functions for us on
-// Windows.
-#if !defined(_MSC_VER) || defined(__OPENMP_NVPTX__)
-
-// For OpenMP we work around some old system headers that have non-conforming
-// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
-// this by providing two versions of these functions, differing only in the
-// return type. To avoid conflicting definitions we disable implicit base
-// function generation. That means we will end up with two specializations, one
-// per type, but only one has a base function defined by the system header.
+// ---------------------------------------------------------------------------
+// Standard Classification Functions
+// ---------------------------------------------------------------------------
+// OpenMP variants return 'int' (legacy compatibility).
+// Base functions return __CUDA_CLASSIFIER_RET_TYPE (bool/int per ABI).
+// ---------------------------------------------------------------------------
+
#if defined(__OPENMP_NVPTX__)
#pragma omp begin declare variant match( \
implementation = {extension(disable_implicit_base)})
-// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
-// add a suffix. This means we would clash with the names of the variants
-// (note that we do not create implicit base functions here). To avoid
-// this clash we add a new trait to some of them that is always true
-// (this is LLVM after all ;)). It will only influence the mangled name
-// of the variants inside the inner region and avoid the clash.
#pragma omp begin declare variant match(implementation = {vendor(llvm)})
-__DEVICE__ int isinf(float __x) { return ::__isinff(__x); }
-__DEVICE__ int isinf(double __x) { return ::__isinf(__x); }
-__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); }
-__DEVICE__ int isfinite(double __x) { return ::__isfinited(__x); }
-__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); }
-__DEVICE__ int isnan(double __x) { return ::__isnan(__x); }
+// OpenMP path: Return 'int' for legacy compatibility.
+static __host__ __device__ int isinf(float __x) { return ::__isinff(__x); }
+static __host__ __device__ int isinf(double __x) { return ::__isinf(__x); }
+static __host__ __device__ int isfinite(float __x) { return ::__finitef(__x); }
+static __host__ __device__ int isfinite(double __x) { return ::__isfinited(__x); }
+static __host__ __device__ int isnan(float __x) { return ::__isnanf(__x); }
+static __host__ __device__ int isnan(double __x) { return ::__isnan(__x); }
+static __host__ __device__ int signbit(float __x) { return ::__signbitf(__x); }
+static __host__ __device__ int signbit(double __x) { return ::__signbitd(__x); }
#pragma omp end declare variant
-#endif
-
-__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
-__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
-__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
-// For inscrutable reasons, __finite(), the double-precision version of
-// __finitef, does not exist when compiling for MacOS. __isfinited is available
-// everywhere and is just as good.
-__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
-__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
-__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
-
-#if defined(__OPENMP_NVPTX__)
#pragma omp end declare variant
-#endif
+#else // !__OPENMP_NVPTX__
+
+// Base path (CUDA): Return type matches __CUDA_CLASSIFIER_RET_TYPE.
+// 'int' for MinGW, 'bool' for MSVC.
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isinf(float __x) { return ::__isinff(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isinf(double __x) { return ::__isinf(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isfinite(float __x) { return ::__finitef(__x); }
+// MacOS: __finite is unavailable; __isfinited works everywhere.
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isfinite(double __x) { return ::__isfinited(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isnan(float __x) { return ::__isnanf(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isnan(double __x) { return ::__isnan(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE signbit(float __x) { return ::__signbitf(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE signbit(double __x) { return ::__signbitd(__x); }
+
+// Long double support (MinGW/Linux only).
+// Long double wrappers (MSVC-guarded - intentional)
+// On MSVC, long double == double, causing overload conflicts.
+#if !defined(_MSC_VER)
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isfinite(long double __x) { return ::__finitel(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isinf(long double __x) { return ::__isinfl(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isnan(long double __x) { return ::__isnanl(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE signbit(long double __x) { return ::__signbitl(__x); }
+#endif // !_MSC_VER
-#endif
+#endif // __OPENMP_NVPTX__
__DEVICE__ bool isgreater(float __x, float __y) {
return __builtin_isgreater(__x, __y);
@@ -167,8 +170,6 @@ __DEVICE__ float pow(float __base, int __iexp) {
__DEVICE__ double pow(double __base, int __iexp) {
return ::powi(__base, __iexp);
}
-__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
-__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
__DEVICE__ float sin(float __x) { return ::sinf(__x); }
__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
@@ -289,7 +290,7 @@ __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint);
-__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
>From ff13ec335dace51851cb803b13180505deb52c4e Mon Sep 17 00:00:00 2001
From: fenodem <fenodem at protonmail.com>
Date: Fri, 20 Mar 2026 12:26:24 +0000
Subject: [PATCH 4/4] Update __clang_cuda_runtime_wrapper.h
---
.../Headers/__clang_cuda_runtime_wrapper.h | 65 ++++++++++++++++++-
1 file changed, 63 insertions(+), 2 deletions(-)
diff --git a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
index 295f4191f9927..ee1313092d4c6 100644
--- a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -39,12 +39,45 @@
// Include some standard headers to avoid CUDA headers including them
// while some required macros (like __THROW) are in a weird state.
#include <climits>
+
+// ---------------------------------------------------------------------------
+// MinGW (GCC) Compatibility Fix
+// ---------------------------------------------------------------------------
+// MinGW's math.h declares internal names like __isnanf, __signbitf as
+// __host__ functions. This conflicts with our __host__ __device__ definitions.
+// We rename them out of the way before including <cmath>.
+// ---------------------------------------------------------------------------
+#define __isnanf __mingw_hidden_isnanf
+#define __isinf __mingw_hidden_isinf
+#define __isinff __mingw_hidden_isinff
+#define __finite __mingw_hidden_finite
+#define __finitef __mingw_hidden_finitef
+#define __signbit __mingw_hidden_signbit
+#define __signbitf __mingw_hidden_signbitf
+#define __isnanl __mingw_hidden_isnanl
+#define __isinfl __mingw_hidden_isinfl
+#define __finitel __mingw_hidden_finitel
+#define __signbitl __mingw_hidden_signbitl
+
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#include <string.h>
#undef __CUDACC__
+// Restore the names so we can use them for our own definitions.
+#undef __isnanf
+#undef __isinf
+#undef __isinff
+#undef __finite
+#undef __finitef
+#undef __signbit
+#undef __signbitf
+#undef __isnanl
+#undef __isinfl
+#undef __finitel
+#undef __signbitl
+
// math_functions.h from CUDA 13.2+ defines _NV_RSQRT_SPECIFIER.
// Clang does not include it, so we need to define it ourselves.
#if defined(__GNUC__) && defined(__GLIBC_PREREQ)
@@ -218,12 +251,31 @@ inline __host__ double __signbitd(double x) {
#define __USE_FAST_MATH__ 1
#endif
+// ---------------------------------------------------------------------------
+// Macro Poisoning - Universal (NOT Platform-Specific)
+// ---------------------------------------------------------------------------
+// Poison the standard names to prevent ODR violations or incorrect overloads
+// from CUDA headers.
+// ---------------------------------------------------------------------------
+#define isfinite __cuda_disabled_isfinite
+#define isinf __cuda_disabled_isinf
+#define isnan __cuda_disabled_isnan
+#define signbit __cuda_disabled_signbit
+
#if CUDA_VERSION >= 9000
#include "crt/math_functions.hpp"
#else
#include "math_functions.hpp"
#endif
+// ---------------------------------------------------------------------------
+// Macro Restoration
+// ---------------------------------------------------------------------------
+#undef isfinite
+#undef isinf
+#undef isnan
+#undef signbit
+
#pragma pop_macro("__USE_FAST_MATH__")
#if CUDA_VERSION < 9000
@@ -342,7 +394,11 @@ __DEVICE__ unsigned int __isLocal(const void *p) {
// conditional on __GNUC__. :)
#pragma push_macro("signbit")
#pragma push_macro("__GNUC__")
-#undef __GNUC__
+#ifndef __GNUC__
+#define __GNUC__ 4
+#define __CLANG_CUDA_DEFINED_GNUC
+#endif
+
#define signbit __ignored_cuda_signbit
// CUDA-9 omits device-side definitions of some math functions if it sees
@@ -365,6 +421,12 @@ __DEVICE__ unsigned int __isLocal(const void *p) {
#endif
#pragma pop_macro("_GLIBCXX_MATH_H")
#pragma pop_macro("_LIBCPP_VERSION")
+
+// Restore original __GNUC__ state
+#ifdef __CLANG_CUDA_DEFINED_GNUC
+#undef __GNUC__
+#undef __CLANG_CUDA_DEFINED_GNUC
+#endif
#pragma pop_macro("__GNUC__")
#pragma pop_macro("signbit")
@@ -505,7 +567,6 @@ __device__ inline __cuda_builtin_gridDim_t::operator uint3() const {
#include "curand_mtgp32_kernel.h"
#pragma pop_macro("dim3")
#pragma pop_macro("uint3")
-#pragma pop_macro("__USE_FAST_MATH__")
#pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
// CUDA runtime uses this undocumented function to access kernel launch
More information about the cfe-commits
mailing list