[clang] Implement macro poisoning for foreign CUDA headers from Nvidia Toolkit (PR #187696)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Mar 20 06:10:16 PDT 2026
https://github.com/fenodem updated https://github.com/llvm/llvm-project/pull/187696
>From f3af2cc29fb5109416c3f7a847c097e644e8eeea Mon Sep 17 00:00:00 2001
From: fenodem <fenodem at protonmail.com>
Date: Fri, 20 Mar 2026 09:34:28 +0000
Subject: [PATCH 1/4] Update __clang_cuda_math_forward_declares.h
---
.../__clang_cuda_math_forward_declares.h | 65 ++++++++++++-------
1 file changed, 40 insertions(+), 25 deletions(-)
diff --git a/clang/lib/Headers/__clang_cuda_math_forward_declares.h b/clang/lib/Headers/__clang_cuda_math_forward_declares.h
index 45fe1e5b1772d..d8091ea77ff49 100644
--- a/clang/lib/Headers/__clang_cuda_math_forward_declares.h
+++ b/clang/lib/Headers/__clang_cuda_math_forward_declares.h
@@ -12,12 +12,23 @@
#error "This file is for CUDA/HIP compilation only."
#endif
-// This file forward-declares of some math functions we (or the CUDA headers)
-// will define later. We need to do this, and do it before cmath is included,
-// because the standard library may have constexpr math functions. In the
-// absence of a prior __device__ decl, those constexpr functions may become
-// implicitly host+device. host+device functions can't be overloaded, so that
-// would preclude the use of our own __device__ overloads for these functions.
+// PURPOSE: Forward-declare __device__ math functions before <cmath> is included.
+// Prevents standard library constexpr functions from becoming implicit
+// __host__ __device__, which would clash with our __device__ overloads.
+
+// ---------------------------------------------------------------------------
+// Return Type: CUDA headers return 'bool' on MSVC, but 'int' on POSIX.
+// Mismatches here cause "functions differ only in return type" errors.
+// ---------------------------------------------------------------------------
+// CORRECTED: Force 'int' for all CUDA compilations to match CUDA SDK headers
+// (math_functions.hpp), which define these as returning int regardless of host.
+// ---------------------------------------------------------------------------
+// Every compilation mode (__CUDA__, __OPENMP_NVPTX__, and the fallback) uses
+// the same return type, so define it unconditionally; the macro is kept so a
+// future per-ABI divergence only needs to touch this one place.
+// ---------------------------------------------------------------------------
+#define __CUDA_CLASSIFIER_RET_TYPE int
+
#pragma push_macro("__DEVICE__")
#define __DEVICE__ \
@@ -89,31 +100,38 @@ __DEVICE__ double hypot(double, double);
__DEVICE__ float hypot(float, float);
__DEVICE__ int ilogb(double);
__DEVICE__ int ilogb(float);
-#ifdef _MSC_VER
-__DEVICE__ bool isfinite(long double);
+
+// ---------------------------------------------------------------------------
+// Classification Functions
+// ---------------------------------------------------------------------------
+// Note: We declare long double versions here if not MSVC to match
+// __clang_cuda_cmath.h logic, but they require implementations in
+// __clang_cuda_device_functions.h to avoid link errors.
+#if !defined(_MSC_VER)
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isfinite(long double);
#endif
-__DEVICE__ bool isfinite(double);
-__DEVICE__ bool isfinite(float);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isfinite(double);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isfinite(float);
__DEVICE__ bool isgreater(double, double);
__DEVICE__ bool isgreaterequal(double, double);
__DEVICE__ bool isgreaterequal(float, float);
__DEVICE__ bool isgreater(float, float);
-#ifdef _MSC_VER
-__DEVICE__ bool isinf(long double);
+#if !defined(_MSC_VER)
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isinf(long double);
#endif
-__DEVICE__ bool isinf(double);
-__DEVICE__ bool isinf(float);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isinf(double);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isinf(float);
__DEVICE__ bool isless(double, double);
__DEVICE__ bool islessequal(double, double);
__DEVICE__ bool islessequal(float, float);
__DEVICE__ bool isless(float, float);
__DEVICE__ bool islessgreater(double, double);
__DEVICE__ bool islessgreater(float, float);
-#ifdef _MSC_VER
-__DEVICE__ bool isnan(long double);
+#if !defined(_MSC_VER)
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isnan(long double);
#endif
-__DEVICE__ bool isnan(double);
-__DEVICE__ bool isnan(float);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isnan(double);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE isnan(float);
__DEVICE__ bool isnormal(double);
__DEVICE__ bool isnormal(float);
__DEVICE__ bool isunordered(double, double);
@@ -165,11 +183,11 @@ __DEVICE__ double scalbln(double, long);
__DEVICE__ float scalbln(float, long);
__DEVICE__ double scalbn(double, int);
__DEVICE__ float scalbn(float, int);
-#ifdef _MSC_VER
-__DEVICE__ bool signbit(long double);
+#if !defined(_MSC_VER)
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE signbit(long double);
#endif
-__DEVICE__ bool signbit(double);
-__DEVICE__ bool signbit(float);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE signbit(double);
+__DEVICE__ __CUDA_CLASSIFIER_RET_TYPE signbit(float);
__DEVICE__ double sin(double);
__DEVICE__ float sin(float);
__DEVICE__ double sinh(double);
@@ -185,9 +203,6 @@ __DEVICE__ float tgamma(float);
__DEVICE__ double trunc(double);
__DEVICE__ float trunc(float);
-// Notably missing above is nexttoward, which we don't define on
-// the device side because libdevice doesn't give us an implementation, and we
-// don't want to be in the business of writing one ourselves.
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
>From a291032af6d170b921e028555c6f257ba47eece9 Mon Sep 17 00:00:00 2001
From: fenodem <fenodem at protonmail.com>
Date: Fri, 20 Mar 2026 09:48:57 +0000
Subject: [PATCH 2/4] Update __clang_cuda_device_functions.h
---
.../Headers/__clang_cuda_device_functions.h | 79 ++++++++++++++-----
1 file changed, 61 insertions(+), 18 deletions(-)
diff --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h
index 0226fe95abab6..4658c92d37206 100644
--- a/clang/lib/Headers/__clang_cuda_device_functions.h
+++ b/clang/lib/Headers/__clang_cuda_device_functions.h
@@ -223,11 +223,65 @@ __DEVICE__ float __fdividef(float __a, float __b) {
}
__DEVICE__ int __ffs(int __a) { return __nv_ffs(__a); }
__DEVICE__ int __ffsll(long long __a) { return __nv_ffsll(__a); }
-__DEVICE__ int __finite(double __a) { return __nv_isfinited(__a); }
-__DEVICE__ int __finitef(float __a) { return __nv_finitef(__a); }
-#ifdef _MSC_VER
-__DEVICE__ int __finitel(long double __a);
-#endif
+
+// ---------------------------------------------------------------------------
+// Classification Function Internal Names
+// ---------------------------------------------------------------------------
+// WARNING: Do NOT consolidate these functions. CUDA's math_functions.hpp calls
+// distinct names (e.g., __signbit vs __signbitd). Removing one causes
+// "no matching function" errors.
+//
+// Note: We use __inline__ without 'static'. In C++ this gives the functions
+// inline (externally visible, ODR-merged) linkage, matching CUDA headers that
+// declare them 'extern' on GCC/MinGW, while still allowing inlining.
+// ---------------------------------------------------------------------------
+
+// Float implementations
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __finitef(float __a) { return __builtin_isfinite(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isinff(float __a) { return __builtin_isinf(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isnanf(float __a) { return __builtin_isnan(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __signbitf(float __a) { return __builtin_signbit(__a); }
+
+// Double implementations
+// Note: Both __finite and __isfinited are defined because CUDA headers
+// reference distinct names in different contexts (similar to __signbit/__signbitd).
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __finite(double __a) { return __builtin_isfinite(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isfinited(double __a) { return __builtin_isfinite(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isinf(double __a) { return __builtin_isinf(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isnan(double __a) { return __builtin_isnan(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __signbit(double __a) { return __builtin_signbit(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __signbitd(double __a) { return __builtin_signbit(__a); }
+
+// Long double implementations (UNGUARDED - intentional)
+// IMPORTANT: Do NOT cast to double. Clang's builtins natively support long double.
+// Casting causes incorrect results on MinGW/Linux where long double has higher
+// precision than double (e.g. finite values that overflow double).
+// NOTE: Clang does NOT support __builtin_isfinitel. Using suffixed builtins
+// will fail. The generic builtin preserves precision for 80-bit long double
+// on MinGW hosts and handles double demotion on devices automatically.
+// NOTE: Do NOT add #if !defined(_MSC_VER) here. Unlike wrappers, these
+// are __inline__ with distinct names (__finitel vs __finite).
+// They have no linker visibility and are optimized away if unused.
+// CUDA headers may call these on any platform - define unconditionally.
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __finitel(long double __a) { return __builtin_isfinite(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isinfl(long double __a) { return __builtin_isinf(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __isnanl(long double __a) { return __builtin_isnan(__a); }
+__inline__ __host__ __device__ __attribute__((always_inline))
+int __signbitl(long double __a) { return __builtin_signbit(__a); }
+
__DEVICE__ int __float2int_rd(float __a) { return __nv_float2int_rd(__a); }
__DEVICE__ int __float2int_rn(float __a) { return __nv_float2int_rn(__a); }
__DEVICE__ int __float2int_ru(float __a) { return __nv_float2int_ru(__a); }
@@ -433,17 +487,7 @@ __DEVICE__ float __int2float_rn(int __a) { return __nv_int2float_rn(__a); }
__DEVICE__ float __int2float_ru(int __a) { return __nv_int2float_ru(__a); }
__DEVICE__ float __int2float_rz(int __a) { return __nv_int2float_rz(__a); }
__DEVICE__ float __int_as_float(int __a) { return __nv_int_as_float(__a); }
-__DEVICE__ int __isfinited(double __a) { return __nv_isfinited(__a); }
-__DEVICE__ int __isinf(double __a) { return __nv_isinfd(__a); }
-__DEVICE__ int __isinff(float __a) { return __nv_isinff(__a); }
-#ifdef _MSC_VER
-__DEVICE__ int __isinfl(long double __a);
-#endif
-__DEVICE__ int __isnan(double __a) { return __nv_isnand(__a); }
-__DEVICE__ int __isnanf(float __a) { return __nv_isnanf(__a); }
-#ifdef _MSC_VER
-__DEVICE__ int __isnanl(long double __a);
-#endif
+
__DEVICE__ double __ll2double_rd(long long __a) {
return __nv_ll2double_rd(__a);
}
@@ -515,8 +559,7 @@ __DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) {
return __nv_sad(__a, __b, __c);
}
__DEVICE__ float __saturatef(float __a) { return __nv_saturatef(__a); }
-__DEVICE__ int __signbitd(double __a) { return __nv_signbitd(__a); }
-__DEVICE__ int __signbitf(float __a) { return __nv_signbitf(__a); }
+
__DEVICE__ void __sincosf(float __a, float *__s, float *__c) {
return __nv_fast_sincosf(__a, __s, __c);
}
>From eaa6e8296429c55b7d59f425a781a4b27a906b11 Mon Sep 17 00:00:00 2001
From: fenodem <fenodem at protonmail.com>
Date: Fri, 20 Mar 2026 10:01:39 +0000
Subject: [PATCH 3/4] Update __clang_cuda_cmath.h
---
clang/lib/Headers/__clang_cuda_cmath.h | 81 +++++++++++++-------------
1 file changed, 41 insertions(+), 40 deletions(-)
diff --git a/clang/lib/Headers/__clang_cuda_cmath.h b/clang/lib/Headers/__clang_cuda_cmath.h
index 5bbb59a93b9e5..b78cad5f94544 100644
--- a/clang/lib/Headers/__clang_cuda_cmath.h
+++ b/clang/lib/Headers/__clang_cuda_cmath.h
@@ -65,54 +65,57 @@ __DEVICE__ float frexp(float __arg, int *__exp) {
return ::frexpf(__arg, __exp);
}
-// For inscrutable reasons, the CUDA headers define these functions for us on
-// Windows.
-#if !defined(_MSC_VER) || defined(__OPENMP_NVPTX__)
-
-// For OpenMP we work around some old system headers that have non-conforming
-// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
-// this by providing two versions of these functions, differing only in the
-// return type. To avoid conflicting definitions we disable implicit base
-// function generation. That means we will end up with two specializations, one
-// per type, but only one has a base function defined by the system header.
+// ---------------------------------------------------------------------------
+// Standard Classification Functions
+// ---------------------------------------------------------------------------
+// OpenMP variants return 'int' (legacy compatibility).
+// Base functions return __CUDA_CLASSIFIER_RET_TYPE (currently 'int' everywhere).
+// ---------------------------------------------------------------------------
+
#if defined(__OPENMP_NVPTX__)
#pragma omp begin declare variant match( \
implementation = {extension(disable_implicit_base)})
-// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
-// add a suffix. This means we would clash with the names of the variants
-// (note that we do not create implicit base functions here). To avoid
-// this clash we add a new trait to some of them that is always true
-// (this is LLVM after all ;)). It will only influence the mangled name
-// of the variants inside the inner region and avoid the clash.
#pragma omp begin declare variant match(implementation = {vendor(llvm)})
-__DEVICE__ int isinf(float __x) { return ::__isinff(__x); }
-__DEVICE__ int isinf(double __x) { return ::__isinf(__x); }
-__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); }
-__DEVICE__ int isfinite(double __x) { return ::__isfinited(__x); }
-__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); }
-__DEVICE__ int isnan(double __x) { return ::__isnan(__x); }
+// OpenMP path: Return 'int' for legacy compatibility.
+static __host__ __device__ int isinf(float __x) { return ::__isinff(__x); }
+static __host__ __device__ int isinf(double __x) { return ::__isinf(__x); }
+static __host__ __device__ int isfinite(float __x) { return ::__finitef(__x); }
+static __host__ __device__ int isfinite(double __x) { return ::__isfinited(__x); }
+static __host__ __device__ int isnan(float __x) { return ::__isnanf(__x); }
+static __host__ __device__ int isnan(double __x) { return ::__isnan(__x); }
+static __host__ __device__ int signbit(float __x) { return ::__signbitf(__x); }
+static __host__ __device__ int signbit(double __x) { return ::__signbitd(__x); }
#pragma omp end declare variant
-#endif
-
-__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
-__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
-__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
-// For inscrutable reasons, __finite(), the double-precision version of
-// __finitef, does not exist when compiling for MacOS. __isfinited is available
-// everywhere and is just as good.
-__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
-__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
-__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
-
-#if defined(__OPENMP_NVPTX__)
#pragma omp end declare variant
-#endif
+#else // !__OPENMP_NVPTX__
+
+// Base path (CUDA): Return type matches __CUDA_CLASSIFIER_RET_TYPE, which is
+// currently 'int' on every platform (see __clang_cuda_math_forward_declares.h).
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isinf(float __x) { return ::__isinff(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isinf(double __x) { return ::__isinf(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isfinite(float __x) { return ::__finitef(__x); }
+// MacOS: __finite is unavailable; __isfinited works everywhere.
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isfinite(double __x) { return ::__isfinited(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isnan(float __x) { return ::__isnanf(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isnan(double __x) { return ::__isnan(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE signbit(float __x) { return ::__signbitf(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE signbit(double __x) { return ::__signbitd(__x); }
+
+// Long double support (MinGW/Linux only).
+// Long double wrappers (MSVC-guarded - intentional)
+// On MSVC, long double == double, causing overload conflicts.
+#if !defined(_MSC_VER)
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isfinite(long double __x) { return ::__finitel(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isinf(long double __x) { return ::__isinfl(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE isnan(long double __x) { return ::__isnanl(__x); }
+static __host__ __device__ __CUDA_CLASSIFIER_RET_TYPE signbit(long double __x) { return ::__signbitl(__x); }
+#endif // !_MSC_VER
-#endif
+#endif // __OPENMP_NVPTX__
__DEVICE__ bool isgreater(float __x, float __y) {
return __builtin_isgreater(__x, __y);
@@ -167,8 +170,6 @@ __DEVICE__ float pow(float __base, int __iexp) {
__DEVICE__ double pow(double __base, int __iexp) {
return ::powi(__base, __iexp);
}
-__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
-__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
__DEVICE__ float sin(float __x) { return ::sinf(__x); }
__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
@@ -289,7 +290,7 @@ __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint);
-__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
>From ff13ec335dace51851cb803b13180505deb52c4e Mon Sep 17 00:00:00 2001
From: fenodem <fenodem at protonmail.com>
Date: Fri, 20 Mar 2026 12:26:24 +0000
Subject: [PATCH 4/4] Update __clang_cuda_runtime_wrapper.h
---
.../Headers/__clang_cuda_runtime_wrapper.h | 65 ++++++++++++++++++-
1 file changed, 63 insertions(+), 2 deletions(-)
diff --git a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
index 295f4191f9927..ee1313092d4c6 100644
--- a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -39,12 +39,45 @@
// Include some standard headers to avoid CUDA headers including them
// while some required macros (like __THROW) are in a weird state.
#include <climits>
+
+// ---------------------------------------------------------------------------
+// MinGW (GCC) Compatibility Fix
+// ---------------------------------------------------------------------------
+// MinGW's math.h declares internal names like __isnanf, __signbitf as
+// __host__ functions. This conflicts with our __host__ __device__ definitions.
+// We rename them out of the way before including <cmath>.
+// ---------------------------------------------------------------------------
+#define __isnanf __mingw_hidden_isnanf
+#define __isinf __mingw_hidden_isinf
+#define __isinff __mingw_hidden_isinff
+#define __finite __mingw_hidden_finite
+#define __finitef __mingw_hidden_finitef
+#define __signbit __mingw_hidden_signbit
+#define __signbitf __mingw_hidden_signbitf
+#define __isnanl __mingw_hidden_isnanl
+#define __isinfl __mingw_hidden_isinfl
+#define __finitel __mingw_hidden_finitel
+#define __signbitl __mingw_hidden_signbitl
+
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#include <string.h>
#undef __CUDACC__
+// Restore the names so we can use them for our own definitions.
+#undef __isnanf
+#undef __isinf
+#undef __isinff
+#undef __finite
+#undef __finitef
+#undef __signbit
+#undef __signbitf
+#undef __isnanl
+#undef __isinfl
+#undef __finitel
+#undef __signbitl
+
// math_functions.h from CUDA 13.2+ defines _NV_RSQRT_SPECIFIER.
// Clang does not include it, so we need to define it ourselves.
#if defined(__GNUC__) && defined(__GLIBC_PREREQ)
@@ -218,12 +251,31 @@ inline __host__ double __signbitd(double x) {
#define __USE_FAST_MATH__ 1
#endif
+// ---------------------------------------------------------------------------
+// Macro Poisoning - Universal (NOT Platform-Specific)
+// ---------------------------------------------------------------------------
+// Poison the standard names to prevent ODR violations or incorrect overloads
+// from CUDA headers.
+// ---------------------------------------------------------------------------
+#define isfinite __cuda_disabled_isfinite
+#define isinf __cuda_disabled_isinf
+#define isnan __cuda_disabled_isnan
+#define signbit __cuda_disabled_signbit
+
#if CUDA_VERSION >= 9000
#include "crt/math_functions.hpp"
#else
#include "math_functions.hpp"
#endif
+// ---------------------------------------------------------------------------
+// Macro Restoration
+// ---------------------------------------------------------------------------
+#undef isfinite
+#undef isinf
+#undef isnan
+#undef signbit
+
#pragma pop_macro("__USE_FAST_MATH__")
#if CUDA_VERSION < 9000
@@ -342,7 +394,11 @@ __DEVICE__ unsigned int __isLocal(const void *p) {
// conditional on __GNUC__. :)
#pragma push_macro("signbit")
#pragma push_macro("__GNUC__")
-#undef __GNUC__
+#ifndef __GNUC__
+#define __GNUC__ 4
+#define __CLANG_CUDA_DEFINED_GNUC
+#endif
+
#define signbit __ignored_cuda_signbit
// CUDA-9 omits device-side definitions of some math functions if it sees
@@ -365,6 +421,12 @@ __DEVICE__ unsigned int __isLocal(const void *p) {
#endif
#pragma pop_macro("_GLIBCXX_MATH_H")
#pragma pop_macro("_LIBCPP_VERSION")
+
+// Restore original __GNUC__ state
+#ifdef __CLANG_CUDA_DEFINED_GNUC
+#undef __GNUC__
+#undef __CLANG_CUDA_DEFINED_GNUC
+#endif
#pragma pop_macro("__GNUC__")
#pragma pop_macro("signbit")
@@ -505,7 +567,6 @@ __device__ inline __cuda_builtin_gridDim_t::operator uint3() const {
#include "curand_mtgp32_kernel.h"
#pragma pop_macro("dim3")
#pragma pop_macro("uint3")
-#pragma pop_macro("__USE_FAST_MATH__")
#pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
// CUDA runtime uses this undocumented function to access kernel launch
More information about the cfe-commits
mailing list