[clang] clang/AMDGPU: Add __builtin_amdgcn_inverse_ballot_w{32,64} (PR #155724)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Aug 27 16:46:30 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Nicolai Hähnle (nhaehnle)
<details>
<summary>Changes</summary>
Add builtins that expose the underlying llvm.amdgcn.inverse.ballot intrinsic that we've had for a while.
This allows more explicitly writing code that selects or branches in terms of lane masks, which can lead to better code quality.
---
Full diff: https://github.com/llvm/llvm-project/pull/155724.diff
5 Files Affected:
- (modified) clang/docs/LanguageExtensions.rst (+17)
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+3)
- (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+6)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl (+7)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl (+7)
``````````diff
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index a13e0a5952fe4..2ce60de05fff2 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -5162,6 +5162,23 @@ If no address spaces names are provided, all address spaces are fenced.
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local")
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global")
+__builtin_amdgcn_ballot_w{32,64}
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``__builtin_amdgcn_ballot_w{32,64}`` returns a bitmask that contains its
+boolean argument as a bit for every lane of the current wave that is currently
+active (i.e., that is converged with the executing thread), and a 0 bit for
+every lane that is not active.
+
+The result is uniform, i.e. it is the same in every active thread of the wave.
+
+__builtin_amdgcn_inverse_ballot_w{32,64}
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Given a wave-uniform bitmask, ``__builtin_amdgcn_inverse_ballot_w{32,64}(mask)``
+returns the bit at the position of the current lane. It is almost equivalent to
+``(mask & (1 << lane_id)) != 0``, except that its behavior is only defined if
+the given mask has the same value for all active lanes of the current wave.
ARM/AArch64 Language Extensions
-------------------------------
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f8f55772db8fe..6f5d1e024b91d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -183,6 +183,9 @@ TARGET_BUILTIN(__builtin_amdgcn_struct_ptr_buffer_load_lds, "vQbv*3IUiiiiIiIi",
TARGET_BUILTIN(__builtin_amdgcn_ballot_w32, "ZUib", "nc", "wavefrontsize32")
BUILTIN(__builtin_amdgcn_ballot_w64, "WUib", "nc")
+TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w32, "bZUi", "nc", "wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w64, "bWUi", "nc", "wavefrontsize64")
+
// Deprecated intrinsics in favor of __builtin_amdgn_ballot_{w32|w64}
BUILTIN(__builtin_amdgcn_uicmp, "WUiUiUiIi", "nc")
BUILTIN(__builtin_amdgcn_uicmpl, "WUiWUiWUiIi", "nc")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index dad1f95ac710d..ac674f3cd59e8 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -504,6 +504,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
return Builder.CreateCall(F, { Src });
}
+ case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32:
+ case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: {
+ llvm::Value *Src = EmitScalarExpr(E->getArg(0));
+ Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_inverse_ballot, { Src->getType() });
+ return Builder.CreateCall(F, { Src });
+ }
case AMDGPU::BI__builtin_amdgcn_tanhf:
case AMDGPU::BI__builtin_amdgcn_tanhh:
case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
index 5e587cb87e073..d390418523694 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
@@ -24,6 +24,13 @@ void test_ballot_wave32_target_attr(global uint* out, int a, int b)
*out = __builtin_amdgcn_ballot_w32(a == b);
}
+// CHECK-LABEL: @test_inverse_ballot_wave32(
+// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %{{.+}})
+void test_inverse_ballot_wave32(global bool* out, int a)
+{
+ *out = __builtin_amdgcn_inverse_ballot_w32(a);
+}
+
// CHECK-LABEL: @test_read_exec(
// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true)
void test_read_exec(global uint* out) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
index 1fc2ac0d3141e..d851ec7e6734f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
@@ -23,6 +23,13 @@ void test_ballot_wave64_target_attr(global ulong* out, int a, int b)
*out = __builtin_amdgcn_ballot_w64(a == b);
}
+// CHECK-LABEL: @test_inverse_ballot_wave64(
+// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %{{.+}})
+void test_inverse_ballot_wave64(global bool* out, ulong a)
+{
+ *out = __builtin_amdgcn_inverse_ballot_w64(a);
+}
+
// CHECK-LABEL: @test_read_exec(
// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true)
void test_read_exec(global ulong* out) {
``````````
</details>
https://github.com/llvm/llvm-project/pull/155724
More information about the cfe-commits
mailing list