[clang] clang/AMDGPU: Add __builtin_amdgcn_inverse_ballot_w{32,64} (PR #155724)

Wed Aug 27 16:46:30 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Nicolai Hähnle (nhaehnle)

<details>
<summary>Changes</summary>

Add builtins that expose the underlying llvm.amdgcn.inverse.ballot intrinsic that we've had for a while.

This allows more explicitly writing code that selects or branches in terms of lane masks, which can lead to better code quality.

---
Full diff: https://github.com/llvm/llvm-project/pull/155724.diff


5 Files Affected:

- (modified) clang/docs/LanguageExtensions.rst (+17) 
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+3) 
- (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+6) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl (+7) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl (+7) 


``````````diff

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index a13e0a5952fe4..2ce60de05fff2 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -5162,6 +5162,23 @@ If no address spaces names are provided, all address spaces are fenced.
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local")
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global")
 
+__builtin_amdgcn_ballot_w{32,64}
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``__builtin_amdgcn_ballot_w{32,64}`` returns a bitmask that contains its
+boolean argument as a bit for every lane of the current wave that is currently
+active (i.e., that is converged with the executing thread), and a 0 bit for
+every lane that is not active.
+
+The result is uniform, i.e. it is the same in every active thread of the wave.
+
+__builtin_amdgcn_inverse_ballot_w{32,64}
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Given a wave-uniform bitmask, ``__builtin_amdgcn_inverse_ballot_w{32,64}(mask)``
+returns the bit at the position of the current lane. It is almost equivalent to
+``((mask >> lane_id) & 1) != 0``, except that its behavior is only defined if
+the given mask has the same value for all active lanes of the current wave.
 
 ARM/AArch64 Language Extensions
 -------------------------------
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f8f55772db8fe..6f5d1e024b91d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -183,6 +183,9 @@ TARGET_BUILTIN(__builtin_amdgcn_struct_ptr_buffer_load_lds, "vQbv*3IUiiiiIiIi",
 TARGET_BUILTIN(__builtin_amdgcn_ballot_w32, "ZUib", "nc", "wavefrontsize32")
 BUILTIN(__builtin_amdgcn_ballot_w64, "WUib", "nc")
 
+TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w32, "bZUi", "nc", "wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w64, "bWUi", "nc", "wavefrontsize64")
+
 // Deprecated intrinsics in favor of __builtin_amdgcn_ballot_{w32|w64}
 BUILTIN(__builtin_amdgcn_uicmp, "WUiUiUiIi", "nc")
 BUILTIN(__builtin_amdgcn_uicmpl, "WUiWUiWUiIi", "nc")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index dad1f95ac710d..ac674f3cd59e8 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -504,6 +504,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
     return Builder.CreateCall(F, { Src });
   }
+  case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32:
+  case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: {
+    llvm::Value *Src = EmitScalarExpr(E->getArg(0));
+    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_inverse_ballot, { Src->getType() });
+    return Builder.CreateCall(F, { Src });
+  }
   case AMDGPU::BI__builtin_amdgcn_tanhf:
   case AMDGPU::BI__builtin_amdgcn_tanhh:
   case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
index 5e587cb87e073..d390418523694 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
@@ -24,6 +24,13 @@ void test_ballot_wave32_target_attr(global uint* out, int a, int b)
   *out = __builtin_amdgcn_ballot_w32(a == b);
 }
 
+// CHECK-LABEL: @test_inverse_ballot_wave32(
+// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %{{.+}})
+void test_inverse_ballot_wave32(global bool* out, int a)
+{
+  *out = __builtin_amdgcn_inverse_ballot_w32(a);
+}
+
 // CHECK-LABEL: @test_read_exec(
 // CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true)
 void test_read_exec(global uint* out) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
index 1fc2ac0d3141e..d851ec7e6734f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
@@ -23,6 +23,13 @@ void test_ballot_wave64_target_attr(global ulong* out, int a, int b)
   *out = __builtin_amdgcn_ballot_w64(a == b);
 }
 
+// CHECK-LABEL: @test_inverse_ballot_wave64(
+// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %{{.+}})
+void test_inverse_ballot_wave64(global bool* out, ulong a)
+{
+  *out = __builtin_amdgcn_inverse_ballot_w64(a);
+}
+
 // CHECK-LABEL: @test_read_exec(
 // CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true)
 void test_read_exec(global ulong* out) {

``````````

</details>


https://github.com/llvm/llvm-project/pull/155724