[clang] [Clang] Fix 'gpuintrin.h' match when included with no arch set (PR #129927)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Mar 5 12:07:34 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang
@llvm/pr-subscribers-backend-x86
Author: Joseph Huber (jhuber6)
<details>
<summary>Changes</summary>
Summary:
The `__nvvm_match_*_sync` builtins used by the `__gpu_match_*` functions
require `+ptx` features to be set, even though the calls are guarded by
`__nvvm_reflect`. Rather than figuring out how to hack around that with the
`target` attribute, I'm just going to disable them for 'generic' builds
(those with no arch set) and use the slow version for now.
---
Full diff: https://github.com/llvm/llvm-project/pull/129927.diff
1 File Affected:
- (modified) clang/lib/Headers/nvptxintrin.h (+9-1)
``````````diff
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 29d0adcabc82f..b2c3097a464fe 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -179,8 +179,10 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
// Newer targets can use the dedicated CUDA support.
- if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+#if __CUDA_ARCH__ >= 700
+ if (__nvvm_reflect("__CUDA_ARCH") >= 700)
return __nvvm_match_any_sync_i32(__lane_mask, __x);
+#endif
uint32_t __match_mask = 0;
bool __done = 0;
@@ -200,8 +202,10 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
// Newer targets can use the dedicated CUDA support.
+#if __CUDA_ARCH__ >= 700
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
return __nvvm_match_any_sync_i64(__lane_mask, __x);
+#endif
uint64_t __match_mask = 0;
@@ -223,9 +227,11 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
// Newer targets can use the dedicated CUDA support.
+#if __CUDA_ARCH__ >= 700
int predicate;
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
+#endif
uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
@@ -236,9 +242,11 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
// Newer targets can use the dedicated CUDA support.
+#if __CUDA_ARCH__ >= 700
int predicate;
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
+#endif
uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
``````````
</details>
https://github.com/llvm/llvm-project/pull/129927
More information about the cfe-commits
mailing list