[llvm] [InstCombine] Optimize AMDGPU ballot + assume uniformity patterns (PR #160670)

via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 25 02:28:33 PDT 2025


llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

Author: Teja Alaghari (TejaX-Alaghari)


## Summary

This PR implements an InstCombine optimization that recognizes when AMDGPU ballot intrinsics are used with assumptions about uniformity, specifically the pattern `assume(ballot(cmp) == -1)`.

## Problem

In AMDGPU code, developers often use ballot intrinsics to test uniformity of conditions across a wavefront:

```llvm
%cmp = icmp eq i32 %x, 0
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
%uniform = icmp eq i64 %ballot, -1
call void @llvm.assume(i1 %uniform)
br i1 %cmp, label %then, label %else
```

When `ballot(cmp) == -1`, every active lane evaluated `cmp` to true, so the condition is uniform across the wavefront. However, existing optimizations did not recognize this pattern, leaving behind an expensive ballot call and control flow that must still be treated as potentially divergent.
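
For context, the pattern typically originates from device code along these lines. This is a minimal sketch assuming HIP's `__ballot` builtin and Clang's `__builtin_assume`; the kernel and variable names are illustrative:

```cpp
#include <hip/hip_runtime.h>

// Illustrative HIP kernel that gives rise to the IR pattern above.
__global__ void kernel(int *out, int x) {
  bool cmp = (x == 0);
  // __ballot returns a per-lane mask. Comparing it against an all-ones
  // mask and feeding the result to __builtin_assume emits
  // assume(icmp eq (llvm.amdgcn.ballot.i64(cmp), -1)) in the IR.
  __builtin_assume(__ballot(cmp) == ~0ULL);
  if (cmp)        // the branch this optimization lets later passes fold
    out[0] = 1;
  else
    out[0] = 2;
}
```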

## Solution
This optimization adds pattern matching in InstCombine's `visitCallInst` for `llvm.assume` intrinsics to detect:
- `assume(icmp eq (ballot(cmp), -1))` patterns
- Both `i32` and `i64` ballot variants (`@llvm.amdgcn.ballot.i32` and `@llvm.amdgcn.ballot.i64`)

When the pattern is detected, it:
1. Replaces the ballot condition `cmp` with `ConstantInt::getTrue()`
2. Simplifies the assumption itself to `true`
3. Enables subsequent passes (such as SimplifyCFG) to eliminate the now-dead branch

The core of the match is condensed in the sketch below.
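
This sketch assumes LLVM's PatternMatch helpers; the actual change (in the diff below) performs the same match inline and additionally guards the condition's type and rejects constant conditions:

```cpp
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true and binds Cond when AssumeOp has the shape
// icmp eq (llvm.amdgcn.ballot(Cond), -1).
static bool matchUniformBallotAssume(Value *AssumeOp, Value *&Cond) {
  return match(AssumeOp,
               m_SpecificICmp(ICmpInst::ICMP_EQ,
                              m_Intrinsic<Intrinsic::amdgcn_ballot>(
                                  m_Value(Cond)),
                              m_AllOnes()));
}
```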

## Example Transformation

**Before:**
```llvm
%cmp = icmp eq i32 %x, 0
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
%uniform = icmp eq i64 %ballot, -1
call void @llvm.assume(i1 %uniform)
br i1 %cmp, label %then, label %else
```

**After InstCombine:**
```llvm
br i1 true, label %then, label %else
```

**After SimplifyCFG:**
```llvm
br label %then    ; %else is unreachable and has been eliminated
```

---
Full diff: https://github.com/llvm/llvm-project/pull/160670.diff


2 Files Affected:

- (modified) llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp (+33) 
- (added) llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll (+108) 


``````````diff
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 6ad493772d170..c23a4e3dfbaf3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3519,6 +3519,39 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
+    // Optimize AMDGPU ballot uniformity assumptions:
+    // assume(icmp eq (ballot(cmp), -1)) implies that cmp is uniform and true,
+    // which allows us to optimize away the ballot and replace cmp with true.
+    Value *BallotInst;
+    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst),
+                                        m_AllOnes()))) {
+      // Check if this is an AMDGPU ballot intrinsic
+      if (auto *BallotCall = dyn_cast<IntrinsicInst>(BallotInst)) {
+        if (BallotCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
+          Value *BallotCondition = BallotCall->getArgOperand(0);
+
+          // If ballot(cmp) == -1, then cmp is uniform across all lanes and
+          // evaluates to true. We can safely replace BallotCondition with
+          // true, since ballot == -1 implies all lanes are true.
+          if (BallotCondition->getType()->isIntOrIntVectorTy(1) &&
+              !isa<Constant>(BallotCondition)) {
+
+            // Add the condition to the worklist for further optimization
+            Worklist.pushValue(BallotCondition);
+
+            // Replace BallotCondition with true
+            BallotCondition->replaceAllUsesWith(
+                ConstantInt::getTrue(BallotCondition->getType()));
+
+            // The assumption is now always true, so we can simplify it
+            replaceUse(II->getOperandUse(0),
+                       ConstantInt::getTrue(II->getContext()));
+            return II;
+          }
+        }
+      }
+    }
+
     // If there is a dominating assume with the same condition as this one,
     // then this one is redundant, and should be removed.
     KnownBits Known(1);
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
new file mode 100644
index 0000000000000..3bf3b317b0771
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
@@ -0,0 +1,108 @@
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; Test case for optimizing AMDGPU ballot + assume patterns
+; When we assume that ballot(cmp) == -1, we know that cmp is uniform
+; This allows us to optimize away the ballot and directly branch
+
+define void @test_assume_ballot_uniform(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_uniform(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Test case with partial optimization - only ballot removal without branch optimization
+define void @test_assume_ballot_partial(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_partial(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Negative test - ballot not compared to -1
+define void @test_assume_ballot_not_uniform(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_not_uniform(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[SOME:%.*]] = icmp ne i64 [[BALLOT]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[SOME]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %some = icmp ne i64 %ballot, 0
+  call void @llvm.assume(i1 %some)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Test with 32-bit ballot
+define void @test_assume_ballot_uniform_i32(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_uniform_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+  %all = icmp eq i32 %ballot, -1  
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare void @llvm.assume(i1)

``````````



https://github.com/llvm/llvm-project/pull/160670

