[clang] ca39a02 - [Clang] Fix 'gpuintrin.h' match when included with no arch set (#129927)

Thu Mar 6 17:37:42 PST 2025

Author: Joseph Huber
Date: 2025-03-06T19:37:39-06:00
New Revision: ca39a029b66e3f6d8b70fb3fe437d5c024643d6d

URL: https://github.com/llvm/llvm-project/commit/ca39a029b66e3f6d8b70fb3fe437d5c024643d6d
DIFF: https://github.com/llvm/llvm-project/commit/ca39a029b66e3f6d8b70fb3fe437d5c024643d6d.diff

LOG: [Clang] Fix 'gpuintrin.h' match when included with no arch set (#129927)

Summary:
The `__nvvm_match_*_sync` builtins require `+ptx` features to be set even
though their uses are guarded by an `__nvvm_reflect` check. Rather than
figure out how to work around that with the `target` attribute, I'm just
going to disable them for 'generic' builds (no architecture set) and use
the slow fallback version for now.

Added: 
    

Modified: 
    clang/lib/Headers/nvptxintrin.h

Removed: 
    


################################################################################
diff  --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 7af22baccb511..73eb0af8b5926 100644

--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -180,8 +180,9 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
   // Newer targets can use the dedicated CUDA support.
-  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
-    return __nvvm_match_any_sync_i32(__lane_mask, __x);
+#if __CUDA_ARCH__ >= 700
+  return __nvvm_match_any_sync_i32(__lane_mask, __x);
+#endif
 
   uint32_t __match_mask = 0;
   bool __done = 0;
@@ -201,8 +202,9 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
   // Newer targets can use the dedicated CUDA support.
-  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
-    return __nvvm_match_any_sync_i64(__lane_mask, __x);
+#if __CUDA_ARCH__ >= 700
+  return __nvvm_match_any_sync_i64(__lane_mask, __x);
+#endif
 
   uint64_t __match_mask = 0;
 
@@ -224,9 +226,10 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
   // Newer targets can use the dedicated CUDA support.
+#if __CUDA_ARCH__ >= 700
   int predicate;
-  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
-    return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
+  return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
+#endif
 
   uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
   uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
@@ -237,9 +240,10 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
   // Newer targets can use the dedicated CUDA support.
+#if __CUDA_ARCH__ >= 700
   int predicate;
-  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
-    return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
+  return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
+#endif
 
   uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
   uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);