[llvm] [WIP][Uniformity Analysis][Assume] Optimize AMDGPU ballot + assume uniformity patterns (PR #160670)

Tue Sep 30 04:03:22 PDT 2025

https://github.com/TejaX-Alaghari updated https://github.com/llvm/llvm-project/pull/160670

>From 6ce6a55c5b8397996f9680564310ddf855b8fd68 Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Thu, 25 Sep 2025 14:27:36 +0530
Subject: [PATCH 1/3] [InstCombine] Optimize AMDGPU ballot + assume uniformity
 patterns

When we encounter assume(ballot(cmp) == -1), we know that cmp is uniform
across all lanes and evaluates to true. This optimization recognizes this
pattern and replaces the condition with a constant true, allowing
subsequent passes to eliminate dead code and optimize control flow.

The optimization handles both i32 and i64 ballot intrinsics and only
applies when the ballot result is compared against -1 (all lanes active).
This is a conservative approach that ensures correctness while enabling
significant optimizations for uniform control flow patterns.
---
 .../InstCombine/InstCombineCalls.cpp          |  33 ++++++
 .../amdgpu-assume-ballot-uniform.ll           | 108 ++++++++++++++++++
 2 files changed, 141 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index cf6d0ecab4f69..58aa3982a279b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3540,6 +3540,39 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
+    // Optimize AMDGPU ballot uniformity assumptions:
+    // assume(icmp eq (ballot(cmp), -1)) implies that cmp is uniform and true
+    // This allows us to optimize away the ballot and replace cmp with true
+    Value *BallotInst;
+    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst),
+                                        m_AllOnes()))) {
+      // Check if this is an AMDGPU ballot intrinsic
+      if (auto *BallotCall = dyn_cast<IntrinsicInst>(BallotInst)) {
+        if (BallotCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
+          Value *BallotCondition = BallotCall->getArgOperand(0);
+
+          // If ballot(cmp) == -1, then cmp is uniform across all lanes and
+          // evaluates to true We can safely replace BallotCondition with true
+          // since ballot == -1 implies all lanes are true
+          if (BallotCondition->getType()->isIntOrIntVectorTy(1) &&
+              !isa<Constant>(BallotCondition)) {
+
+            // Add the condition to the worklist for further optimization
+            Worklist.pushValue(BallotCondition);
+
+            // Replace BallotCondition with true
+            BallotCondition->replaceAllUsesWith(
+                ConstantInt::getTrue(BallotCondition->getType()));
+
+            // The assumption is now always true, so we can simplify it
+            replaceUse(II->getOperandUse(0),
+                       ConstantInt::getTrue(II->getContext()));
+            return II;
+          }
+        }
+      }
+    }
+
     // If there is a dominating assume with the same condition as this one,
     // then this one is redundant, and should be removed.
     KnownBits Known(1);
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
new file mode 100644
index 0000000000000..3bf3b317b0771
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
@@ -0,0 +1,108 @@
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; Test case for optimizing AMDGPU ballot + assume patterns
+; When we assume that ballot(cmp) == -1, we know that cmp is uniform
+; This allows us to optimize away the ballot and directly branch
+
+define void @test_assume_ballot_uniform(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_uniform(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Test case with partial optimization - only ballot removal without branch optimization
+define void @test_assume_ballot_partial(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_partial(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Negative test - ballot not compared to -1
+define void @test_assume_ballot_not_uniform(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_not_uniform(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[SOME:%.*]] = icmp ne i64 [[BALLOT]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[SOME]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %some = icmp ne i64 %ballot, 0
+  call void @llvm.assume(i1 %some)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Test with 32-bit ballot
+define void @test_assume_ballot_uniform_i32(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_uniform_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+  %all = icmp eq i32 %ballot, -1  
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare void @llvm.assume(i1)

>From 4b007d523b91104515cbd65ade0e5e82c88baa4f Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Mon, 29 Sep 2025 12:50:10 +0530
Subject: [PATCH 2/3] [InstCombine] Add constant folding for AMDGPU ballot
 intrinsics

Address reviewer feedback by implementing free-form ballot intrinsic optimization
instead of assume-dependent patterns. This approach:

1. Optimizes ballot(constant) directly as a standard intrinsic optimization
2. Allows uniformity analysis to handle assumes through proper channels
3. Follows established AMDGPU intrinsic patterns (amdgcn_cos, amdgcn_sin)
4. Enables broader optimization opportunities beyond assume contexts

Optimizations:
- ballot(true) -> -1 (all lanes active)
- ballot(false) -> 0 (no lanes active)

This addresses the core reviewer concern about performing optimization
in assume context rather than as a free-form pattern, and lets the
uniformity analysis framework handle assumes as intended.

Test cases focus on constant folding rather than assume-specific patterns,
demonstrating the more general applicability of this approach.
---
 .github/copilot-instructions.md               |  78 ++++++++++++-
 .../InstCombine/InstCombineCalls.cpp          |  51 +++-----
 .../InstCombine/InstCombineInternal.h         |   2 +
 .../amdgpu-assume-ballot-uniform.ll           | 108 -----------------
 .../amdgpu-ballot-constant-fold.ll            | 109 ++++++++++++++++++
 5 files changed, 204 insertions(+), 144 deletions(-)
 delete mode 100644 llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
 create mode 100644 llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index 03748938700e3..922584f7bc9aa 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -1,4 +1,74 @@
-When performing a code review, pay close attention to code modifying a function's
-control flow. Could the change result in the corruption of performance profile
-data? Could the change result in invalid debug information, in particular for
-branches and calls?
+# LLVM Project AI Coding Agent Instructions
+
+## Architecture Overview
+
+LLVM is a compiler infrastructure with modular components:
+- **Core LLVM** (`llvm/`): IR processing, optimizations, code generation
+- **Clang** (`clang/`): C/C++/Objective-C frontend 
+- **LLD** (`lld/`): Linker
+- **libc++** (`libcxx/`): C++ standard library
+- **Target backends** (`llvm/lib/Target/{AMDGPU,X86,ARM,...}/`): Architecture-specific code generation
+
+## Essential Development Workflows
+
+### Build System (CMake + Ninja)
+```bash
+# Configure with common options for development
+cmake -G Ninja -S llvm-project/llvm -B build \
+  -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+  -DLLVM_ENABLE_PROJECTS="clang;lld" \
+  -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" \
+  -DLLVM_ENABLE_ASSERTIONS=ON
+
+# Build and install
+cmake --build build
+cmake --install build --prefix install/
+```
+
+### Testing with LIT
+- Use `opt < file.ll -passes=instcombine -S | FileCheck %s` pattern for IR transforms
+- Test files go in `llvm/test/Transforms/{PassName}/` with `.ll` extension
+- Always include both positive and negative test cases
+- Use `CHECK-LABEL:` for function boundaries, `CHECK-NEXT:` for strict sequence
+
+### Key Patterns for Transforms
+
+**InstCombine Pattern** (`llvm/lib/Transforms/InstCombine/`):
+- Implement in `InstCombine*.cpp` using visitor pattern (`visitCallInst`, `visitBinaryOperator`)
+- Use `PatternMatch.h` matchers: `match(V, m_Add(m_Value(X), m_ConstantInt()))`
+- Return `nullptr` for no change, modified instruction, or replacement
+- Add to worklist with `Worklist.pushValue()` for dependent values
+
+**Target-Specific Intrinsics**:
+- AMDGPU: `@llvm.amdgcn.*` intrinsics in `llvm/include/llvm/IR/IntrinsicsAMDGPU.td`
+- Pattern: `if (II->getIntrinsicID() == Intrinsic::amdgcn_ballot)`
+
+## Code Quality Standards
+
+### Control Flow & Debug Info
+When modifying control flow, ensure changes don't corrupt:
+- Performance profiling data (branch weights, call counts)
+- Debug information for branches and calls
+- Exception handling unwind information
+
+### Target-Specific Considerations
+- **AMDGPU**: Wavefront uniformity analysis affects ballot intrinsics
+- **X86**: Vector width and ISA feature dependencies
+- Use `TargetTransformInfo` for cost models and capability queries
+
+### Testing Requirements
+- Every optimization needs regression tests showing before/after IR
+- Include edge cases: constants, undef, poison values
+- Test target-specific intrinsics with appropriate triple
+- Use `; RUN: opt < %s -passes=... -S | FileCheck %s` format
+
+## Common Development Pitfalls
+- Don't assume instruction operand order without checking `isCommutative()`
+- Verify type compatibility before creating new instructions
+- Consider poison/undef propagation in optimizations
+- Check for side effects before eliminating instructions
+
+## Pass Pipeline Context
+- InstCombine runs early and multiple times in the pipeline
+- Subsequent passes like SimplifyCFG will clean up control flow
+- Use `replaceAllUsesWith()` carefully to maintain SSA form
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 58aa3982a279b..292f7816b6bf9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -85,6 +85,8 @@ using namespace PatternMatch;
 
 STATISTIC(NumSimplified, "Number of library calls simplified");
 
+
+
 static cl::opt<unsigned> GuardWideningWindow(
     "instcombine-guard-widening-window",
     cl::init(3),
@@ -2987,6 +2989,20 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
+  case Intrinsic::amdgcn_ballot: {
+    // Optimize ballot intrinsics when the condition is known to be uniform
+    Value *Condition = II->getArgOperand(0);
+    
+    // If the condition is a constant, we can evaluate the ballot directly
+    if (auto *ConstCond = dyn_cast<ConstantInt>(Condition)) {
+      // ballot(true) -> -1 (all lanes active)
+      // ballot(false) -> 0 (no lanes active)
+      uint64_t Result = ConstCond->isOne() ? ~0ULL : 0ULL;
+      return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
+    }
+    
+    break;
+  }
   case Intrinsic::ldexp: {
     // ldexp(ldexp(x, a), b) -> ldexp(x, a + b)
     //
@@ -3540,38 +3556,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
-    // Optimize AMDGPU ballot uniformity assumptions:
-    // assume(icmp eq (ballot(cmp), -1)) implies that cmp is uniform and true
-    // This allows us to optimize away the ballot and replace cmp with true
-    Value *BallotInst;
-    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst),
-                                        m_AllOnes()))) {
-      // Check if this is an AMDGPU ballot intrinsic
-      if (auto *BallotCall = dyn_cast<IntrinsicInst>(BallotInst)) {
-        if (BallotCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
-          Value *BallotCondition = BallotCall->getArgOperand(0);
-
-          // If ballot(cmp) == -1, then cmp is uniform across all lanes and
-          // evaluates to true We can safely replace BallotCondition with true
-          // since ballot == -1 implies all lanes are true
-          if (BallotCondition->getType()->isIntOrIntVectorTy(1) &&
-              !isa<Constant>(BallotCondition)) {
-
-            // Add the condition to the worklist for further optimization
-            Worklist.pushValue(BallotCondition);
-
-            // Replace BallotCondition with true
-            BallotCondition->replaceAllUsesWith(
-                ConstantInt::getTrue(BallotCondition->getType()));
-
-            // The assumption is now always true, so we can simplify it
-            replaceUse(II->getOperandUse(0),
-                       ConstantInt::getTrue(II->getContext()));
-            return II;
-          }
-        }
-      }
-    }
+
 
     // If there is a dominating assume with the same condition as this one,
     // then this one is redundant, and should be removed.
@@ -3586,6 +3571,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       return eraseInstFromFunction(*II);
     }
 
+
+
     // Update the cache of affected values for this assumption (we might be
     // here because we just simplified the condition).
     AC.updateAffectedValues(cast<AssumeInst>(II));
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 4f94aa2d38541..ab98c86f95306 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -124,6 +124,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
       BinaryOperator &I);
   Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract(
       BinaryOperator &OldAShr);
+  
+
   Instruction *visitAShr(BinaryOperator &I);
   Instruction *visitLShr(BinaryOperator &I);
   Instruction *commonShiftTransforms(BinaryOperator &I);
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
deleted file mode 100644
index 3bf3b317b0771..0000000000000
--- a/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
+++ /dev/null
@@ -1,108 +0,0 @@
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-
-; Test case for optimizing AMDGPU ballot + assume patterns
-; When we assume that ballot(cmp) == -1, we know that cmp is uniform
-; This allows us to optimize away the ballot and directly branch
-
-define void @test_assume_ballot_uniform(i32 %x) {
-; CHECK-LABEL: @test_assume_ballot_uniform(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-; Test case with partial optimization - only ballot removal without branch optimization
-define void @test_assume_ballot_partial(i32 %x) {
-; CHECK-LABEL: @test_assume_ballot_partial(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-; Negative test - ballot not compared to -1
-define void @test_assume_ballot_not_uniform(i32 %x) {
-; CHECK-LABEL: @test_assume_ballot_not_uniform(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[SOME:%.*]] = icmp ne i64 [[BALLOT]], 0
-; CHECK-NEXT:    call void @llvm.assume(i1 [[SOME]])
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %some = icmp ne i64 %ballot, 0
-  call void @llvm.assume(i1 %some)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-; Test with 32-bit ballot
-define void @test_assume_ballot_uniform_i32(i32 %x) {
-; CHECK-LABEL: @test_assume_ballot_uniform_i32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
-  %all = icmp eq i32 %ballot, -1  
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-declare i64 @llvm.amdgcn.ballot.i64(i1)
-declare i32 @llvm.amdgcn.ballot.i32(i1)
-declare void @llvm.assume(i1)
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
new file mode 100644
index 0000000000000..6180760f7d511
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
@@ -0,0 +1,109 @@
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; Test cases for optimizing AMDGPU ballot intrinsics
+; Focus on constant folding ballot(true) -> -1 and ballot(false) -> 0
+
+define void @test_ballot_constant_true() {
+; CHECK-LABEL: @test_ballot_constant_true(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 -1, -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 true, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+define void @test_ballot_constant_false() {
+; CHECK-LABEL: @test_ballot_constant_false(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONE:%.*]] = icmp ne i64 0, 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[NONE]])
+; CHECK-NEXT:    br i1 false, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 false)
+  %none = icmp ne i64 %ballot, 0
+  call void @llvm.assume(i1 %none)
+  br i1 false, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Test with 32-bit ballot constants
+define void @test_ballot_i32_constant_true() {
+; CHECK-LABEL: @test_ballot_i32_constant_true(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i32 -1, -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 true)
+  %all = icmp eq i32 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 true, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Negative test - variable condition should not be optimized
+define void @test_ballot_variable_condition(i32 %x) {
+; CHECK-LABEL: @test_ballot_variable_condition(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare void @llvm.assume(i1)

>From 13c34000d29edc638e13a6a179d3788a2c54c1e0 Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Mon, 29 Sep 2025 21:24:37 +0530
Subject: [PATCH 3/3] [InstCombine] Implement generic assume-based uniformity
 optimization

Implement a comprehensive generic optimization for assume intrinsics that extracts
uniformity information and optimizes dominated uses. The optimization recognizes
multiple patterns that establish value uniformity and replaces dominated uses with
uniform constants.

Addresses uniformity analysis optimization opportunities identified in
AMDGPU ballot/readfirstlane + assume patterns for improved code generation
through constant propagation.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     |  16 ++-
 .../InstCombine/InstCombineCalls.cpp          | 133 ++++++++++++++++--
 .../InstCombine/InstCombineInternal.h         |   3 +
 .../amdgpu-ballot-constant-fold.ll            | 117 +++++----------
 llvm/test/Transforms/InstCombine/assume.ll    |   2 +-
 5 files changed, 164 insertions(+), 107 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 4fe5d00679436..fc4e64fcd52a1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1322,12 +1322,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     if (isa<PoisonValue>(Arg))
       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
 
-    if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
-      if (Src->isZero()) {
-        // amdgcn.ballot(i1 0) is zero.
-        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
-      }
-    }
+    // For Wave32 targets, convert i64 ballot to i32 ballot + zext
     if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
       // %b64 = call i64 ballot.i64(...)
       // =>
@@ -1341,6 +1336,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       Call->takeName(&II);
       return IC.replaceInstUsesWith(II, Call);
     }
+
+    if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
+      if (Src->isZero()) {
+        // amdgcn.ballot(i1 0) is zero.
+        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
+      }
+      // Note: ballot(true) is NOT constant folded because the result depends
+      // on the active lanes in the wavefront, not just the condition value.
+    }
     break;
   }
   case Intrinsic::amdgcn_wavefrontsize: {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 292f7816b6bf9..f9ff666c49ffd 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2989,20 +2989,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
-  case Intrinsic::amdgcn_ballot: {
-    // Optimize ballot intrinsics when the condition is known to be uniform
-    Value *Condition = II->getArgOperand(0);
-    
-    // If the condition is a constant, we can evaluate the ballot directly
-    if (auto *ConstCond = dyn_cast<ConstantInt>(Condition)) {
-      // ballot(true) -> -1 (all lanes active)
-      // ballot(false) -> 0 (no lanes active)
-      uint64_t Result = ConstCond->isOne() ? ~0ULL : 0ULL;
-      return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
-    }
-    
-    break;
-  }
+
   case Intrinsic::ldexp: {
     // ldexp(ldexp(x, a), b) -> ldexp(x, a + b)
     //
@@ -3571,7 +3558,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       return eraseInstFromFunction(*II);
     }
 
-
+    // Try to extract uniformity information from the assume and optimize
+    // dominated uses of any variables that are established as uniform.
+    optimizeAssumedUniformValues(cast<AssumeInst>(II));
 
     // Update the cache of affected values for this assumption (we might be
     // here because we just simplified the condition).
@@ -5026,3 +5015,117 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call,
   Call.setCalledFunction(FTy, NestF);
   return &Call;
 }
+
+/// Extract uniformity information from assume and optimize dominated uses.
+/// This works with any assume pattern that establishes value uniformity.
+void InstCombinerImpl::optimizeAssumedUniformValues(AssumeInst *Assume) {
+  Value *AssumedCondition = Assume->getArgOperand(0);
+  
+  // Map of uniform values to their uniform constants
+  SmallDenseMap<Value *, Constant *> UniformValues;
+  
+  // Pattern 1: assume(icmp eq (X, C)) -> X is uniform and equals C
+  if (auto *ICmp = dyn_cast<ICmpInst>(AssumedCondition)) {
+    if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) {
+      Value *LHS = ICmp->getOperand(0);
+      Value *RHS = ICmp->getOperand(1);
+      
+      // X == constant -> X is uniform and equals constant
+      if (auto *C = dyn_cast<Constant>(RHS)) {
+        UniformValues[LHS] = C;
+      } else if (auto *C = dyn_cast<Constant>(LHS)) {
+        UniformValues[RHS] = C;
+      }
+      
+      // Handle intrinsic patterns in equality comparisons
+      // Pattern: assume(ballot(cmp) == -1) -> cmp is uniform and true
+      if (auto *IntrinsicCall = dyn_cast<IntrinsicInst>(LHS)) {
+        if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
+          if (match(RHS, m_AllOnes())) {
+            Value *BallotArg = IntrinsicCall->getArgOperand(0);
+            if (BallotArg->getType()->isIntegerTy(1)) {
+              UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType());
+              
+              // Special case: if BallotArg is an equality comparison, 
+              // we know the operands are equal
+              if (auto *CmpInst = dyn_cast<ICmpInst>(BallotArg)) {
+                if (CmpInst->getPredicate() == ICmpInst::ICMP_EQ) {
+                  Value *CmpLHS = CmpInst->getOperand(0);
+                  Value *CmpRHS = CmpInst->getOperand(1);
+                  
+                  // If one operand is constant, the other is uniform and equals that constant
+                  if (auto *C = dyn_cast<Constant>(CmpRHS)) {
+                    UniformValues[CmpLHS] = C;
+                  } else if (auto *C = dyn_cast<Constant>(CmpLHS)) {
+                    UniformValues[CmpRHS] = C;
+                  }
+                  // TODO: Handle case where both operands are variables
+                }
+              }
+            }
+          }
+        } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) {
+          // assume(readfirstlane(x) == c) -> x is uniform and equals c
+          if (auto *C = dyn_cast<Constant>(RHS)) {
+            Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0);
+            UniformValues[ReadFirstLaneArg] = C;
+          }
+        }
+      }
+      
+      // Handle the reverse case too
+      if (auto *IntrinsicCall = dyn_cast<IntrinsicInst>(RHS)) {
+        if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
+          if (match(LHS, m_AllOnes())) {
+            Value *BallotArg = IntrinsicCall->getArgOperand(0);
+            if (BallotArg->getType()->isIntegerTy(1)) {
+              UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType());
+            }
+          }
+        } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) {
+          if (auto *C = dyn_cast<Constant>(LHS)) {
+            Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0);
+            UniformValues[ReadFirstLaneArg] = C;
+          }
+        }
+      }
+    }
+  }
+  
+  // Pattern 2: assume(X) where X is i1 -> X is uniform and equals true  
+  if (AssumedCondition->getType()->isIntegerTy(1) && !isa<ICmpInst>(AssumedCondition)) {
+    UniformValues[AssumedCondition] = ConstantInt::getTrue(AssumedCondition->getType());
+  }
+  
+  // Now optimize dominated uses of all discovered uniform values
+  for (auto &[UniformValue, UniformConstant] : UniformValues) {
+    SmallVector<Use *, 8> DominatedUses;
+    
+    // Find all uses dominated by the assume
+    // Skip if the value doesn't have a use list (e.g., constants)
+    if (!UniformValue->hasUseList())
+      continue;
+      
+    for (Use &U : UniformValue->uses()) {
+      Instruction *UseInst = dyn_cast<Instruction>(U.getUser());
+      if (!UseInst || UseInst == Assume)
+        continue;
+        
+      // Critical: Check dominance using InstCombine's infrastructure  
+      if (isValidAssumeForContext(Assume, UseInst, &DT)) {
+        DominatedUses.push_back(&U);
+      }
+    }
+    
+    // Replace only dominated uses with the uniform constant
+    for (Use *U : DominatedUses) {
+      U->set(UniformConstant);
+      Worklist.pushValue(U->getUser());
+    }
+    
+    // Mark for further optimization if we made changes
+    if (!DominatedUses.empty()) {
+      Worklist.pushValue(UniformValue);
+    }
+  }
+}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index ab98c86f95306..b51479aa19da6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -231,6 +231,9 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
 private:
   bool annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI);
   bool isDesirableIntType(unsigned BitWidth) const;
+  
+  /// Optimize uses of variables that are established as uniform by assume intrinsics.
+  void optimizeAssumedUniformValues(AssumeInst *Assume);
   bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
   bool shouldChangeType(Type *From, Type *To) const;
   Value *dyn_castNegVal(Value *V) const;
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
index 6180760f7d511..b146487af9990 100644
--- a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
+++ b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
@@ -1,109 +1,56 @@
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S | FileCheck %s
 
 ; Test cases for optimizing AMDGPU ballot intrinsics
-; Focus on constant folding ballot(true) -> -1 and ballot(false) -> 0
+; Focus on constant folding ballot(false) -> 0 and poison handling
 
-define void @test_ballot_constant_true() {
-; CHECK-LABEL: @test_ballot_constant_true(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 -1, -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
+; Test ballot with constant false condition gets folded
+define i32 @test_ballot_constant_false() {
+; CHECK-LABEL: @test_ballot_constant_false(
+; CHECK-NEXT:    ret i32 0
 ;
-entry:
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 true, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 false)
+  ret i32 %ballot
 }
 
-define void @test_ballot_constant_false() {
-; CHECK-LABEL: @test_ballot_constant_false(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[NONE:%.*]] = icmp ne i64 0, 0
-; CHECK-NEXT:    call void @llvm.assume(i1 [[NONE]])
-; CHECK-NEXT:    br i1 false, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
+; Test ballot.i64 with constant false condition gets folded
+define i64 @test_ballot_i64_constant_false() {
+; CHECK-LABEL: @test_ballot_i64_constant_false(
+; CHECK-NEXT:    ret i64 0
 ;
-entry:
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 false)
-  %none = icmp ne i64 %ballot, 0
-  call void @llvm.assume(i1 %none)
-  br i1 false, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
+  ret i64 %ballot
 }
 
-; Test with 32-bit ballot constants
-define void @test_ballot_i32_constant_true() {
-; CHECK-LABEL: @test_ballot_i32_constant_true(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i32 -1, -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
+; Test ballot with poison condition gets folded to poison
+define i64 @test_ballot_poison() {
+; CHECK-LABEL: @test_ballot_poison(
+; CHECK-NEXT:    ret i64 poison
 ;
-entry:
-  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 true)
-  %all = icmp eq i32 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 true, label %foo, label %bar
-
-foo:
-  ret void
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 poison)
+  ret i64 %ballot
+}
 
-bar:
-  ret void
+; Test that ballot(true) is NOT constant folded (depends on active lanes)
+define i64 @test_ballot_constant_true() {
+; CHECK-LABEL: @test_ballot_constant_true(
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; CHECK-NEXT:    ret i64 [[BALLOT]]
+;
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+  ret i64 %ballot
 }
 
-; Negative test - variable condition should not be optimized
-define void @test_ballot_variable_condition(i32 %x) {
+; Test that ballot with variable condition is not optimized
+define i64 @test_ballot_variable_condition(i32 %x) {
 ; CHECK-LABEL: @test_ballot_variable_condition(
-; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret i64 [[BALLOT]]
 ;
-entry:
   %cmp = icmp eq i32 %x, 0
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
+  ret i64 %ballot
 }
 
 declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare i32 @llvm.amdgcn.ballot.i32(i1)
-declare void @llvm.assume(i1)
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index 7b0b871513513..9f2dc4af2f1f8 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -82,7 +82,7 @@ define i32 @simple(i32 %a) #1 {
 ; CHECK-LABEL: @simple(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 4
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
-; CHECK-NEXT:    ret i32 [[A]]
+; CHECK-NEXT:    ret i32 4
 ;
   %cmp = icmp eq i32 %a, 4
   tail call void @llvm.assume(i1 %cmp)