[clang] 380ac9e - [NVPTX][clang] Ensure CLZ(0) is defined on NVPTX (#185630)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Mar 11 02:49:07 PDT 2026
Author: Lewis Crawford
Date: 2026-03-11T09:49:02Z
New Revision: 380ac9e301ec63052795101a087c704645d569a5
URL: https://github.com/llvm/llvm-project/commit/380ac9e301ec63052795101a087c704645d569a5
DIFF: https://github.com/llvm/llvm-project/commit/380ac9e301ec63052795101a087c704645d569a5.diff
LOG: [NVPTX][clang] Ensure CLZ(0) is defined on NVPTX (#185630)
CUDA semantics specify that clz(0) == bitwidth, so clang should emit the
llvm.ctlz / llvm.cttz intrinsics for NVPTX with the is_zero_poison argument set to false.
Added:
clang/test/CodeGenCUDA/builtin-count-zeros-nvptx.cu
Modified:
clang/lib/Basic/Targets/NVPTX.h
clang/test/Headers/gpuintrin.c
Removed:
################################################################################
diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h
index 6f8df323f379c..7921a042e9e9b 100644
--- a/clang/lib/Basic/Targets/NVPTX.h
+++ b/clang/lib/Basic/Targets/NVPTX.h
@@ -83,6 +83,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo {
bool useFP16ConversionIntrinsics() const override { return false; }
+ bool isCLZForZeroUndef() const override { return false; }
+
bool
initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
StringRef CPU,
diff --git a/clang/test/CodeGenCUDA/builtin-count-zeros-nvptx.cu b/clang/test/CodeGenCUDA/builtin-count-zeros-nvptx.cu
new file mode 100644
index 0000000000000..f003b32ca73b2
--- /dev/null
+++ b/clang/test/CodeGenCUDA/builtin-count-zeros-nvptx.cu
@@ -0,0 +1,12 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -x cuda -triple nvptx64-unknown-unknown -fcuda-is-device -emit-llvm %s -o - | FileCheck %s
+//
+// Ensure NVPTX uses isCLZForZeroUndef() = false (CUDA semantics: CLZ(i32 0) == 32).
+
+#include "Inputs/cuda.h"
+
+__device__ int f(int x) {
+ return __builtin_ctz(x) + __builtin_clz(x);
+}
+// CHECK: call i32 @llvm.cttz.i32({{.*}}, i1 false)
+// CHECK: call i32 @llvm.ctlz.i32({{.*}}, i1 false)
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 04b50acc4a049..c6a20dec210bb 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -1109,7 +1109,7 @@ __gpu_kernel void foo() {
// NVPTX-NEXT: br i1 [[TOBOOL]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
// NVPTX: [[COND_TRUE]]:
// NVPTX-NEXT: [[TMP3:%.*]] = load i64, ptr [[__BELOW]], align 8
-// NVPTX-NEXT: [[TMP4:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP3]], i1 true)
+// NVPTX-NEXT: [[TMP4:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP3]], i1 false)
// NVPTX-NEXT: [[CAST:%.*]] = trunc i64 [[TMP4]] to i32
// NVPTX-NEXT: [[SUB2:%.*]] = sub nsw i32 63, [[CAST]]
// NVPTX-NEXT: br label %[[COND_END:.*]]
More information about the cfe-commits mailing list