[llvm] [WIP][InstCombine] Add assume-based optimizations for equality and AMDGPU ballot patterns (PR #160670)

Teja Alaghari via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 30 23:53:24 PDT 2025


https://github.com/TejaX-Alaghari updated https://github.com/llvm/llvm-project/pull/160670

>From 61c3a13a8dde47e8224b51429012ebd985af8520 Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Thu, 25 Sep 2025 14:27:36 +0530
Subject: [PATCH 1/8] [InstCombine] Optimize AMDGPU ballot + assume uniformity
 patterns

When we encounter assume(ballot(cmp) == -1), we know that cmp is uniform
across all lanes and evaluates to true. The combine recognizes this
pattern and replaces the condition with constant true, allowing
subsequent passes to eliminate dead code and optimize control flow.

The optimization handles both i32 and i64 ballot intrinsics and only
applies when the ballot result is compared against -1 (all lanes active).
This is a conservative approach that ensures correctness while enabling
significant optimizations for uniform control flow patterns.
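
As a rough IR sketch of the intended rewrite (this mirrors the test added
in this patch; block and value names are illustrative):

  ; before
  %cmp = icmp eq i32 %x, 0
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  %all = icmp eq i64 %ballot, -1
  call void @llvm.assume(i1 %all)
  br i1 %cmp, label %foo, label %bar

  ; after: %cmp is known true, so the branch condition folds
  br i1 true, label %foo, label %bar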
---
 .../InstCombine/InstCombineCalls.cpp          |  33 ++++++
 .../amdgpu-assume-ballot-uniform.ll           | 108 ++++++++++++++++++
 2 files changed, 141 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 92fca90ddb88a..06fb168233211 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3549,6 +3549,39 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
+    // Optimize AMDGPU ballot uniformity assumptions:
+    // assume(icmp eq (ballot(cmp), -1)) implies that cmp is uniform and true
+    // This allows us to optimize away the ballot and replace cmp with true
+    Value *BallotInst;
+    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst),
+                                        m_AllOnes()))) {
+      // Check if this is an AMDGPU ballot intrinsic
+      if (auto *BallotCall = dyn_cast<IntrinsicInst>(BallotInst)) {
+        if (BallotCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
+          Value *BallotCondition = BallotCall->getArgOperand(0);
+
+          // If ballot(cmp) == -1, then cmp is uniform across all lanes and
+          // evaluates to true. We can safely replace BallotCondition with true
+          // since ballot == -1 implies all lanes are true
+          if (BallotCondition->getType()->isIntOrIntVectorTy(1) &&
+              !isa<Constant>(BallotCondition)) {
+
+            // Add the condition to the worklist for further optimization
+            Worklist.pushValue(BallotCondition);
+
+            // Replace BallotCondition with true
+            BallotCondition->replaceAllUsesWith(
+                ConstantInt::getTrue(BallotCondition->getType()));
+
+            // The assumption is now always true, so we can simplify it
+            replaceUse(II->getOperandUse(0),
+                       ConstantInt::getTrue(II->getContext()));
+            return II;
+          }
+        }
+      }
+    }
+
     // If there is a dominating assume with the same condition as this one,
     // then this one is redundant, and should be removed.
     KnownBits Known(1);
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
new file mode 100644
index 0000000000000..3bf3b317b0771
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
@@ -0,0 +1,108 @@
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; Test case for optimizing AMDGPU ballot + assume patterns
+; When we assume that ballot(cmp) == -1, we know that cmp is uniform
+; This allows us to optimize away the ballot and directly branch
+
+define void @test_assume_ballot_uniform(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_uniform(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Test case with partial optimization - only ballot removal without branch optimization
+define void @test_assume_ballot_partial(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_partial(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Negative test - ballot not compared to -1
+define void @test_assume_ballot_not_uniform(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_not_uniform(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[SOME:%.*]] = icmp ne i64 [[BALLOT]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[SOME]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %some = icmp ne i64 %ballot, 0
+  call void @llvm.assume(i1 %some)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Test with 32-bit ballot
+define void @test_assume_ballot_uniform_i32(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_uniform_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+  %all = icmp eq i32 %ballot, -1  
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare void @llvm.assume(i1)

>From fd67e9f0da6e5497af476b15ac3c9500860623df Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Mon, 29 Sep 2025 12:50:10 +0530
Subject: [PATCH 2/8] [InstCombine] Add constant folding for AMDGPU ballot
 intrinsics

Address reviewer feedback by implementing a free-form ballot intrinsic
optimization instead of assume-dependent patterns. This approach:

1. Optimizes ballot(constant) directly as a standard intrinsic optimization
2. Allows uniformity analysis to handle assumes through proper channels
3. Follows established AMDGPU intrinsic patterns (amdgcn_cos, amdgcn_sin)
4. Enables broader optimization opportunities beyond assume contexts

Optimizations:
- ballot(true) -> -1 (all lanes active)
- ballot(false) -> 0 (no lanes active)

This addresses the core reviewer concern about performing the optimization
in an assume context rather than as a free-form pattern, and lets the
uniformity analysis framework handle assumes as intended.

Test cases focus on constant folding rather than assume-specific patterns,
demonstrating the more general applicability of this approach.
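
As a minimal IR sketch of the folds this patch performs (illustrative
only; it restates the two rules listed above):

  %b = call i64 @llvm.amdgcn.ballot.i64(i1 true)   ; folds to i64 -1
  %n = call i64 @llvm.amdgcn.ballot.i64(i1 false)  ; folds to i64 0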
---
 .../InstCombine/InstCombineCalls.cpp          |  51 +++-----
 .../InstCombine/InstCombineInternal.h         |   2 +
 .../amdgpu-assume-ballot-uniform.ll           | 108 -----------------
 .../amdgpu-ballot-constant-fold.ll            | 109 ++++++++++++++++++
 4 files changed, 130 insertions(+), 140 deletions(-)
 delete mode 100644 llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
 create mode 100644 llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 06fb168233211..a7cbb8899bd38 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -85,6 +85,8 @@ using namespace PatternMatch;
 
 STATISTIC(NumSimplified, "Number of library calls simplified");
 
+
+
 static cl::opt<unsigned> GuardWideningWindow(
     "instcombine-guard-widening-window",
     cl::init(3),
@@ -2996,6 +2998,20 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
+  case Intrinsic::amdgcn_ballot: {
+    // Optimize ballot intrinsics when the condition is known to be uniform
+    Value *Condition = II->getArgOperand(0);
+    
+    // If the condition is a constant, we can evaluate the ballot directly
+    if (auto *ConstCond = dyn_cast<ConstantInt>(Condition)) {
+      // ballot(true) -> -1 (all lanes active)
+      // ballot(false) -> 0 (no lanes active)
+      uint64_t Result = ConstCond->isOne() ? ~0ULL : 0ULL;
+      return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
+    }
+    
+    break;
+  }
   case Intrinsic::ldexp: {
     // ldexp(ldexp(x, a), b) -> ldexp(x, a + b)
     //
@@ -3549,38 +3565,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
-    // Optimize AMDGPU ballot uniformity assumptions:
-    // assume(icmp eq (ballot(cmp), -1)) implies that cmp is uniform and true
-    // This allows us to optimize away the ballot and replace cmp with true
-    Value *BallotInst;
-    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst),
-                                        m_AllOnes()))) {
-      // Check if this is an AMDGPU ballot intrinsic
-      if (auto *BallotCall = dyn_cast<IntrinsicInst>(BallotInst)) {
-        if (BallotCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
-          Value *BallotCondition = BallotCall->getArgOperand(0);
-
-          // If ballot(cmp) == -1, then cmp is uniform across all lanes and
-          // evaluates to true. We can safely replace BallotCondition with true
-          // since ballot == -1 implies all lanes are true
-          if (BallotCondition->getType()->isIntOrIntVectorTy(1) &&
-              !isa<Constant>(BallotCondition)) {
-
-            // Add the condition to the worklist for further optimization
-            Worklist.pushValue(BallotCondition);
-
-            // Replace BallotCondition with true
-            BallotCondition->replaceAllUsesWith(
-                ConstantInt::getTrue(BallotCondition->getType()));
-
-            // The assumption is now always true, so we can simplify it
-            replaceUse(II->getOperandUse(0),
-                       ConstantInt::getTrue(II->getContext()));
-            return II;
-          }
-        }
-      }
-    }
+
 
     // If there is a dominating assume with the same condition as this one,
     // then this one is redundant, and should be removed.
@@ -3595,6 +3580,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       return eraseInstFromFunction(*II);
     }
 
+
+
     // Update the cache of affected values for this assumption (we might be
     // here because we just simplified the condition).
     AC.updateAffectedValues(cast<AssumeInst>(II));
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index d85e4f7590197..5f8fdf25d5bbb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -124,6 +124,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
       BinaryOperator &I);
   Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract(
       BinaryOperator &OldAShr);
+  
+
   Instruction *visitAShr(BinaryOperator &I);
   Instruction *visitLShr(BinaryOperator &I);
   Instruction *commonShiftTransforms(BinaryOperator &I);
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
deleted file mode 100644
index 3bf3b317b0771..0000000000000
--- a/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
+++ /dev/null
@@ -1,108 +0,0 @@
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-
-; Test case for optimizing AMDGPU ballot + assume patterns
-; When we assume that ballot(cmp) == -1, we know that cmp is uniform
-; This allows us to optimize away the ballot and directly branch
-
-define void @test_assume_ballot_uniform(i32 %x) {
-; CHECK-LABEL: @test_assume_ballot_uniform(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-; Test case with partial optimization - only ballot removal without branch optimization
-define void @test_assume_ballot_partial(i32 %x) {
-; CHECK-LABEL: @test_assume_ballot_partial(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-; Negative test - ballot not compared to -1
-define void @test_assume_ballot_not_uniform(i32 %x) {
-; CHECK-LABEL: @test_assume_ballot_not_uniform(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[SOME:%.*]] = icmp ne i64 [[BALLOT]], 0
-; CHECK-NEXT:    call void @llvm.assume(i1 [[SOME]])
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %some = icmp ne i64 %ballot, 0
-  call void @llvm.assume(i1 %some)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-; Test with 32-bit ballot
-define void @test_assume_ballot_uniform_i32(i32 %x) {
-; CHECK-LABEL: @test_assume_ballot_uniform_i32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
-  %all = icmp eq i32 %ballot, -1  
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-declare i64 @llvm.amdgcn.ballot.i64(i1)
-declare i32 @llvm.amdgcn.ballot.i32(i1)
-declare void @llvm.assume(i1)
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
new file mode 100644
index 0000000000000..6180760f7d511
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
@@ -0,0 +1,109 @@
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; Test cases for optimizing AMDGPU ballot intrinsics
+; Focus on constant folding ballot(true) -> -1 and ballot(false) -> 0
+
+define void @test_ballot_constant_true() {
+; CHECK-LABEL: @test_ballot_constant_true(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 -1, -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 true, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+define void @test_ballot_constant_false() {
+; CHECK-LABEL: @test_ballot_constant_false(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONE:%.*]] = icmp ne i64 0, 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[NONE]])
+; CHECK-NEXT:    br i1 false, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 false)
+  %none = icmp ne i64 %ballot, 0
+  call void @llvm.assume(i1 %none)
+  br i1 false, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Test with 32-bit ballot constants
+define void @test_ballot_i32_constant_true() {
+; CHECK-LABEL: @test_ballot_i32_constant_true(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i32 -1, -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 true)
+  %all = icmp eq i32 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 true, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Negative test - variable condition should not be optimized
+define void @test_ballot_variable_condition(i32 %x) {
+; CHECK-LABEL: @test_ballot_variable_condition(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare void @llvm.assume(i1)

>From 67b97a3eec1421b8abf0fc4475713157e495110d Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Mon, 29 Sep 2025 21:24:37 +0530
Subject: [PATCH 3/8] [InstCombine] Implement generic assume-based uniformity
 optimization

Implement a comprehensive generic optimization for assume intrinsics that extracts
uniformity information and optimizes dominated uses. The optimization recognizes
multiple patterns that establish value uniformity and replaces dominated uses with
uniform constants.

This addresses uniformity-analysis optimization opportunities identified in
AMDGPU ballot/readfirstlane + assume patterns, improving code generation
through constant propagation.
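
As an illustrative IR sketch of one recognized pattern (the readfirstlane
case; the values and the constant are made up, and the overloaded
intrinsic name is assumed):

  %first = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %x)
  %eq = icmp eq i32 %first, 42
  call void @llvm.assume(i1 %eq)
  ; uses of %x dominated by the assume may be rewritten to the constant 42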
---
 .github/copilot-instructions.md               |   4 +
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     |  16 ++-
 .../InstCombine/InstCombineCalls.cpp          | 136 +++++++++++++++---
 .../InstCombine/InstCombineInternal.h         |   5 +-
 .../amdgpu-ballot-constant-fold.ll            | 117 +++++----------
 llvm/test/Transforms/InstCombine/assume.ll    |  30 +++-
 6 files changed, 195 insertions(+), 113 deletions(-)
 create mode 100644 .github/copilot-instructions.md

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
new file mode 100644
index 0000000000000..03748938700e3
--- /dev/null
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,4 @@
+When performing a code review, pay close attention to code modifying a function's
+control flow. Could the change result in the corruption of performance profile
+data? Could the change result in invalid debug information, in particular for
+branches and calls?
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 4fe5d00679436..fc4e64fcd52a1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1322,12 +1322,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     if (isa<PoisonValue>(Arg))
       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
 
-    if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
-      if (Src->isZero()) {
-        // amdgcn.ballot(i1 0) is zero.
-        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
-      }
-    }
+    // For Wave32 targets, convert i64 ballot to i32 ballot + zext
     if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
       // %b64 = call i64 ballot.i64(...)
       // =>
@@ -1341,6 +1336,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       Call->takeName(&II);
       return IC.replaceInstUsesWith(II, Call);
     }
+
+    if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
+      if (Src->isZero()) {
+        // amdgcn.ballot(i1 0) is zero.
+        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
+      }
+      // Note: ballot(true) is NOT constant folded because the result depends
+      // on the active lanes in the wavefront, not just the condition value.
+    }
     break;
   }
   case Intrinsic::amdgcn_wavefrontsize: {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index a7cbb8899bd38..663df634f4633 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -85,8 +85,6 @@ using namespace PatternMatch;
 
 STATISTIC(NumSimplified, "Number of library calls simplified");
 
-
-
 static cl::opt<unsigned> GuardWideningWindow(
     "instcombine-guard-widening-window",
     cl::init(3),
@@ -2998,20 +2996,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
-  case Intrinsic::amdgcn_ballot: {
-    // Optimize ballot intrinsics when the condition is known to be uniform
-    Value *Condition = II->getArgOperand(0);
-    
-    // If the condition is a constant, we can evaluate the ballot directly
-    if (auto *ConstCond = dyn_cast<ConstantInt>(Condition)) {
-      // ballot(true) -> -1 (all lanes active)
-      // ballot(false) -> 0 (no lanes active)
-      uint64_t Result = ConstCond->isOne() ? ~0ULL : 0ULL;
-      return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
-    }
-    
-    break;
-  }
   case Intrinsic::ldexp: {
     // ldexp(ldexp(x, a), b) -> ldexp(x, a + b)
     //
@@ -3565,8 +3549,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
-
-
     // If there is a dominating assume with the same condition as this one,
     // then this one is redundant, and should be removed.
     KnownBits Known(1);
@@ -3580,7 +3562,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       return eraseInstFromFunction(*II);
     }
 
-
+    // Try to extract uniformity information from the assume and optimize
+    // dominated uses of any variables that are established as uniform.
+    optimizeAssumedUniformValues(cast<AssumeInst>(II));
 
     // Update the cache of affected values for this assumption (we might be
     // here because we just simplified the condition).
@@ -5046,3 +5030,117 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call,
   Call.setCalledFunction(FTy, NestF);
   return &Call;
 }
+
+/// Extract uniformity information from assume and optimize dominated uses.
+/// This works with any assume pattern that establishes value uniformity.
+void InstCombinerImpl::optimizeAssumedUniformValues(AssumeInst *Assume) {
+  Value *AssumedCondition = Assume->getArgOperand(0);
+  
+  // Map of uniform values to their uniform constants
+  SmallDenseMap<Value *, Constant *> UniformValues;
+  
+  // Pattern 1: assume(icmp eq (X, C)) -> X is uniform and equals C
+  if (auto *ICmp = dyn_cast<ICmpInst>(AssumedCondition)) {
+    if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) {
+      Value *LHS = ICmp->getOperand(0);
+      Value *RHS = ICmp->getOperand(1);
+      
+      // X == constant -> X is uniform and equals constant
+      if (auto *C = dyn_cast<Constant>(RHS)) {
+        UniformValues[LHS] = C;
+      } else if (auto *C = dyn_cast<Constant>(LHS)) {
+        UniformValues[RHS] = C;
+      }
+      
+      // Handle intrinsic patterns in equality comparisons
+      // Pattern: assume(ballot(cmp) == -1) -> cmp is uniform and true
+      if (auto *IntrinsicCall = dyn_cast<IntrinsicInst>(LHS)) {
+        if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
+          if (match(RHS, m_AllOnes())) {
+            Value *BallotArg = IntrinsicCall->getArgOperand(0);
+            if (BallotArg->getType()->isIntegerTy(1)) {
+              UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType());
+              
+              // Special case: if BallotArg is an equality comparison, 
+              // we know the operands are equal
+              if (auto *CmpInst = dyn_cast<ICmpInst>(BallotArg)) {
+                if (CmpInst->getPredicate() == ICmpInst::ICMP_EQ) {
+                  Value *CmpLHS = CmpInst->getOperand(0);
+                  Value *CmpRHS = CmpInst->getOperand(1);
+                  
+                  // If one operand is constant, the other is uniform and equals that constant
+                  if (auto *C = dyn_cast<Constant>(CmpRHS)) {
+                    UniformValues[CmpLHS] = C;
+                  } else if (auto *C = dyn_cast<Constant>(CmpLHS)) {
+                    UniformValues[CmpRHS] = C;
+                  }
+                  // TODO: Handle case where both operands are variables
+                }
+              }
+            }
+          }
+        } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) {
+          // assume(readfirstlane(x) == c) -> x is uniform and equals c
+          if (auto *C = dyn_cast<Constant>(RHS)) {
+            Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0);
+            UniformValues[ReadFirstLaneArg] = C;
+          }
+        }
+      }
+      
+      // Handle the reverse case too
+      if (auto *IntrinsicCall = dyn_cast<IntrinsicInst>(RHS)) {
+        if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
+          if (match(LHS, m_AllOnes())) {
+            Value *BallotArg = IntrinsicCall->getArgOperand(0);
+            if (BallotArg->getType()->isIntegerTy(1)) {
+              UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType());
+            }
+          }
+        } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) {
+          if (auto *C = dyn_cast<Constant>(LHS)) {
+            Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0);
+            UniformValues[ReadFirstLaneArg] = C;
+          }
+        }
+      }
+    }
+  }
+  
+  // Pattern 2: assume(X) where X is i1 -> X is uniform and equals true  
+  if (AssumedCondition->getType()->isIntegerTy(1) && !isa<ICmpInst>(AssumedCondition)) {
+    UniformValues[AssumedCondition] = ConstantInt::getTrue(AssumedCondition->getType());
+  }
+  
+  // Now optimize dominated uses of all discovered uniform values
+  for (auto &[UniformValue, UniformConstant] : UniformValues) {
+    SmallVector<Use *, 8> DominatedUses;
+    
+    // Find all uses dominated by the assume
+    // Skip if the value doesn't have a use list (e.g., constants)
+    if (!UniformValue->hasUseList())
+      continue;
+      
+    for (Use &U : UniformValue->uses()) {
+      Instruction *UseInst = dyn_cast<Instruction>(U.getUser());
+      if (!UseInst || UseInst == Assume)
+        continue;
+        
+      // Critical: Check dominance using InstCombine's infrastructure  
+      if (isValidAssumeForContext(Assume, UseInst, &DT)) {
+        DominatedUses.push_back(&U);
+      }
+    }
+    
+    // Replace only dominated uses with the uniform constant
+    for (Use *U : DominatedUses) {
+      U->set(UniformConstant);
+      Worklist.pushValue(U->getUser());
+    }
+    
+    // Mark for further optimization if we made changes
+    if (!DominatedUses.empty()) {
+      Worklist.pushValue(UniformValue);
+    }
+  }
+}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 5f8fdf25d5bbb..220e832f190ca 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -124,8 +124,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
       BinaryOperator &I);
   Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract(
       BinaryOperator &OldAShr);
-  
-
   Instruction *visitAShr(BinaryOperator &I);
   Instruction *visitLShr(BinaryOperator &I);
   Instruction *commonShiftTransforms(BinaryOperator &I);
@@ -232,6 +230,9 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
 private:
   bool annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI);
   bool isDesirableIntType(unsigned BitWidth) const;
+  
+  /// Optimize uses of variables that are established as uniform by assume intrinsics.
+  void optimizeAssumedUniformValues(AssumeInst *Assume);
   bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
   bool shouldChangeType(Type *From, Type *To) const;
   Value *dyn_castNegVal(Value *V) const;
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
index 6180760f7d511..b146487af9990 100644
--- a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
+++ b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
@@ -1,109 +1,56 @@
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S | FileCheck %s
 
 ; Test cases for optimizing AMDGPU ballot intrinsics
-; Focus on constant folding ballot(true) -> -1 and ballot(false) -> 0
+; Focus on constant folding ballot(false) -> 0 and poison handling
 
-define void @test_ballot_constant_true() {
-; CHECK-LABEL: @test_ballot_constant_true(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 -1, -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
+; Test ballot with constant false condition gets folded
+define i32 @test_ballot_constant_false() {
+; CHECK-LABEL: @test_ballot_constant_false(
+; CHECK-NEXT:    ret i32 0
 ;
-entry:
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 true, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 false)
+  ret i32 %ballot
 }
 
-define void @test_ballot_constant_false() {
-; CHECK-LABEL: @test_ballot_constant_false(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[NONE:%.*]] = icmp ne i64 0, 0
-; CHECK-NEXT:    call void @llvm.assume(i1 [[NONE]])
-; CHECK-NEXT:    br i1 false, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
+; Test ballot.i64 with constant false condition gets folded
+define i64 @test_ballot_i64_constant_false() {
+; CHECK-LABEL: @test_ballot_i64_constant_false(
+; CHECK-NEXT:    ret i64 0
 ;
-entry:
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 false)
-  %none = icmp ne i64 %ballot, 0
-  call void @llvm.assume(i1 %none)
-  br i1 false, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
+  ret i64 %ballot
 }
 
-; Test with 32-bit ballot constants
-define void @test_ballot_i32_constant_true() {
-; CHECK-LABEL: @test_ballot_i32_constant_true(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i32 -1, -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
+; Test ballot with poison condition gets folded to poison
+define i64 @test_ballot_poison() {
+; CHECK-LABEL: @test_ballot_poison(
+; CHECK-NEXT:    ret i64 poison
 ;
-entry:
-  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 true)
-  %all = icmp eq i32 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 true, label %foo, label %bar
-
-foo:
-  ret void
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 poison)
+  ret i64 %ballot
+}
 
-bar:
-  ret void
+; Test that ballot(true) is NOT constant folded (depends on active lanes)
+define i64 @test_ballot_constant_true() {
+; CHECK-LABEL: @test_ballot_constant_true(
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; CHECK-NEXT:    ret i64 [[BALLOT]]
+;
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+  ret i64 %ballot
 }
 
-; Negative test - variable condition should not be optimized
-define void @test_ballot_variable_condition(i32 %x) {
+; Test that ballot with variable condition is not optimized
+define i64 @test_ballot_variable_condition(i32 %x) {
 ; CHECK-LABEL: @test_ballot_variable_condition(
-; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret i64 [[BALLOT]]
 ;
-entry:
   %cmp = icmp eq i32 %x, 0
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
+  ret i64 %ballot
 }
 
 declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare i32 @llvm.amdgcn.ballot.i32(i1)
-declare void @llvm.assume(i1)
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index cc87d6542fa12..299f2f1625797 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -104,7 +104,7 @@ define i32 @simple(i32 %a) #1 {
 ; CHECK-LABEL: @simple(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 4
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
-; CHECK-NEXT:    ret i32 [[A]]
+; CHECK-NEXT:    ret i32 4
 ;
   %cmp = icmp eq i32 %a, 4
   tail call void @llvm.assume(i1 %cmp)
@@ -1056,6 +1056,34 @@ define i1 @neg_assume_trunc_eq_one(i8 %x) {
   ret i1 %q
 }
 
+; Test AMDGPU ballot uniformity pattern optimization  
+; This demonstrates that assume(ballot(cmp) == -1) enables the optimization
+; of cmp to true, which then optimizes the branch condition
+define void @assume_ballot_uniform(i32 %x) {
+; CHECK-LABEL: @assume_ballot_uniform(
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare void @use(i1)
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 

>From 32cc62e00938bfbb41af145776aefe99e4ec873c Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Thu, 2 Oct 2025 11:51:43 +0530
Subject: [PATCH 4/8] [InstCombine] Add focused assume-based optimizations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit implements two targeted optimizations for assume intrinsics:

1. Basic equality optimization: assume(x == c) replaces dominated uses of x with c
2. AMDGPU ballot optimization: assume(ballot(cmp) == -1) replaces dominated
   uses of cmp with true, since ballot == -1 means cmp is true on all active lanes

Key design principles:
- No uniformity analysis concepts; relies on simple mathematical facts
- Dominance-based replacement for correctness
- Clean pattern matching without a complex framework
- Addresses reviewer feedback to keep it simple and focused

Examples:
  assume(x == 42); use = add x, 1  →  use = 43
  assume(ballot(cmp) == -1); br cmp  →  br true

This enables better optimization of GPU code patterns while remaining
architecture-agnostic, relying only on the mathematical properties of the
operations.
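
A small IR sketch of the dominance requirement, expanding the example
above (illustrative; only the use after the assume is rewritten):

  %before = add i32 %x, 1          ; not dominated by the assume: unchanged
  %cmp = icmp eq i32 %x, 42
  call void @llvm.assume(i1 %cmp)
  %after = add i32 %x, 1           ; dominated: %x -> 42, folds to 43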
---
 .../InstCombine/InstCombineCalls.cpp          | 191 +++++++-----------
 .../InstCombine/InstCombineInternal.h         |   3 -
 llvm/test/Transforms/InstCombine/assume.ll    |   9 +-
 3 files changed, 79 insertions(+), 124 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 663df634f4633..e29b7294b3b02 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3549,6 +3549,79 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
+    // Basic assume equality optimization: assume(x == c) -> replace dominated uses of x with c
+    if (auto *ICmp = dyn_cast<ICmpInst>(IIOperand)) {
+      if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) {
+        Value *LHS = ICmp->getOperand(0);
+        Value *RHS = ICmp->getOperand(1);
+        Value *Variable = nullptr;
+        Constant *ConstantVal = nullptr;
+        
+        if (auto *C = dyn_cast<Constant>(RHS)) {
+          Variable = LHS;
+          ConstantVal = C;
+        } else if (auto *C = dyn_cast<Constant>(LHS)) {
+          Variable = RHS;
+          ConstantVal = C;
+        }
+        
+        if (Variable && ConstantVal && Variable->hasUseList()) {
+          SmallVector<Use *, 8> DominatedUses;
+          for (Use &U : Variable->uses()) {
+            if (auto *UseInst = dyn_cast<Instruction>(U.getUser())) {
+              if (UseInst != II && UseInst != ICmp &&
+                  isValidAssumeForContext(II, UseInst, &DT)) {
+                DominatedUses.push_back(&U);
+              }
+            }
+          }
+          
+          for (Use *U : DominatedUses) {
+            U->set(ConstantVal);
+            Worklist.pushValue(U->getUser());
+          }
+          
+          if (!DominatedUses.empty()) {
+            Worklist.pushValue(Variable);
+          }
+        }
+      }
+    }
+
+    // Optimize AMDGPU ballot patterns in assumes:
+    // assume(ballot(cmp) == -1) means cmp is true on all active lanes
+    // We can replace uses of cmp with true in dominated contexts
+    Value *BallotInst;
+    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst), m_AllOnes()))) {
+      if (auto *IntrCall = dyn_cast<IntrinsicInst>(BallotInst)) {
+        if (IntrCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
+          Value *BallotArg = IntrCall->getArgOperand(0);
+          if (BallotArg->getType()->isIntegerTy(1) && BallotArg->hasUseList()) {
+            // Find dominated uses and replace with true
+            SmallVector<Use *, 8> DominatedUses;
+            for (Use &U : BallotArg->uses()) {
+              if (auto *UseInst = dyn_cast<Instruction>(U.getUser())) {
+                if (UseInst != II && UseInst != IntrCall &&
+                    isValidAssumeForContext(II, UseInst, &DT)) {
+                  DominatedUses.push_back(&U);
+                }
+              }
+            }
+            
+            // Replace dominated uses with true
+            for (Use *U : DominatedUses) {
+              U->set(ConstantInt::getTrue(BallotArg->getType()));
+              Worklist.pushValue(U->getUser());
+            }
+            
+            if (!DominatedUses.empty()) {
+              Worklist.pushValue(BallotArg);
+            }
+          }
+        }
+      }
+    }
+
     // If there is a dominating assume with the same condition as this one,
     // then this one is redundant, and should be removed.
     KnownBits Known(1);
@@ -3562,10 +3635,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       return eraseInstFromFunction(*II);
     }
 
-    // Try to extract uniformity information from the assume and optimize
-    // dominated uses of any variables that are established as uniform.
-    optimizeAssumedUniformValues(cast<AssumeInst>(II));
-
     // Update the cache of affected values for this assumption (we might be
     // here because we just simplified the condition).
     AC.updateAffectedValues(cast<AssumeInst>(II));
@@ -5031,116 +5100,4 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call,
   return &Call;
 }
 
-/// Extract uniformity information from assume and optimize dominated uses.
-/// This works with any assume pattern that establishes value uniformity.
-void InstCombinerImpl::optimizeAssumedUniformValues(AssumeInst *Assume) {
-  Value *AssumedCondition = Assume->getArgOperand(0);
-  
-  // Map of uniform values to their uniform constants
-  SmallDenseMap<Value *, Constant *> UniformValues;
-  
-  // Pattern 1: assume(icmp eq (X, C)) -> X is uniform and equals C
-  if (auto *ICmp = dyn_cast<ICmpInst>(AssumedCondition)) {
-    if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) {
-      Value *LHS = ICmp->getOperand(0);
-      Value *RHS = ICmp->getOperand(1);
-      
-      // X == constant -> X is uniform and equals constant
-      if (auto *C = dyn_cast<Constant>(RHS)) {
-        UniformValues[LHS] = C;
-      } else if (auto *C = dyn_cast<Constant>(LHS)) {
-        UniformValues[RHS] = C;
-      }
-      
-      // Handle intrinsic patterns in equality comparisons
-      // Pattern: assume(ballot(cmp) == -1) -> cmp is uniform and true
-      if (auto *IntrinsicCall = dyn_cast<IntrinsicInst>(LHS)) {
-        if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
-          if (match(RHS, m_AllOnes())) {
-            Value *BallotArg = IntrinsicCall->getArgOperand(0);
-            if (BallotArg->getType()->isIntegerTy(1)) {
-              UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType());
-              
-              // Special case: if BallotArg is an equality comparison, 
-              // we know the operands are equal
-              if (auto *CmpInst = dyn_cast<ICmpInst>(BallotArg)) {
-                if (CmpInst->getPredicate() == ICmpInst::ICMP_EQ) {
-                  Value *CmpLHS = CmpInst->getOperand(0);
-                  Value *CmpRHS = CmpInst->getOperand(1);
-                  
-                  // If one operand is constant, the other is uniform and equals that constant
-                  if (auto *C = dyn_cast<Constant>(CmpRHS)) {
-                    UniformValues[CmpLHS] = C;
-                  } else if (auto *C = dyn_cast<Constant>(CmpLHS)) {
-                    UniformValues[CmpRHS] = C;
-                  }
-                  // TODO: Handle case where both operands are variables
-                }
-              }
-            }
-          }
-        } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) {
-          // assume(readfirstlane(x) == c) -> x is uniform and equals c
-          if (auto *C = dyn_cast<Constant>(RHS)) {
-            Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0);
-            UniformValues[ReadFirstLaneArg] = C;
-          }
-        }
-      }
-      
-      // Handle the reverse case too
-      if (auto *IntrinsicCall = dyn_cast<IntrinsicInst>(RHS)) {
-        if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
-          if (match(LHS, m_AllOnes())) {
-            Value *BallotArg = IntrinsicCall->getArgOperand(0);
-            if (BallotArg->getType()->isIntegerTy(1)) {
-              UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType());
-            }
-          }
-        } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) {
-          if (auto *C = dyn_cast<Constant>(LHS)) {
-            Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0);
-            UniformValues[ReadFirstLaneArg] = C;
-          }
-        }
-      }
-    }
-  }
-  
-  // Pattern 2: assume(X) where X is i1 -> X is uniform and equals true  
-  if (AssumedCondition->getType()->isIntegerTy(1) && !isa<ICmpInst>(AssumedCondition)) {
-    UniformValues[AssumedCondition] = ConstantInt::getTrue(AssumedCondition->getType());
-  }
-  
-  // Now optimize dominated uses of all discovered uniform values
-  for (auto &[UniformValue, UniformConstant] : UniformValues) {
-    SmallVector<Use *, 8> DominatedUses;
-    
-    // Find all uses dominated by the assume
-    // Skip if the value doesn't have a use list (e.g., constants)
-    if (!UniformValue->hasUseList())
-      continue;
-      
-    for (Use &U : UniformValue->uses()) {
-      Instruction *UseInst = dyn_cast<Instruction>(U.getUser());
-      if (!UseInst || UseInst == Assume)
-        continue;
-        
-      // Critical: Check dominance using InstCombine's infrastructure  
-      if (isValidAssumeForContext(Assume, UseInst, &DT)) {
-        DominatedUses.push_back(&U);
-      }
-    }
-    
-    // Replace only dominated uses with the uniform constant
-    for (Use *U : DominatedUses) {
-      U->set(UniformConstant);
-      Worklist.pushValue(U->getUser());
-    }
-    
-    // Mark for further optimization if we made changes
-    if (!DominatedUses.empty()) {
-      Worklist.pushValue(UniformValue);
-    }
-  }
-}
+
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 220e832f190ca..d85e4f7590197 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -230,9 +230,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
 private:
   bool annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI);
   bool isDesirableIntType(unsigned BitWidth) const;
-  
-  /// Optimize uses of variables that are established as uniform by assume intrinsics.
-  void optimizeAssumedUniformValues(AssumeInst *Assume);
   bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
   bool shouldChangeType(Type *From, Type *To) const;
   Value *dyn_castNegVal(Value *V) const;
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index 299f2f1625797..bc0dd4398b5d6 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -1056,12 +1056,13 @@ define i1 @neg_assume_trunc_eq_one(i8 %x) {
   ret i1 %q
 }
 
-; Test AMDGPU ballot uniformity pattern optimization  
-; This demonstrates that assume(ballot(cmp) == -1) enables the optimization
-; of cmp to true, which then optimizes the branch condition
+; Test AMDGPU ballot pattern optimization  
+; assume(ballot(cmp) == -1) means cmp is true on all active lanes
+; so dominated uses of cmp can be replaced with true
 define void @assume_ballot_uniform(i32 %x) {
 ; CHECK-LABEL: @assume_ballot_uniform(
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
 ; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
 ; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]

>From ab201b6ae15b2393141c97739759c7c393b8fd5b Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Sun, 5 Oct 2025 13:51:02 +0530
Subject: [PATCH 5/8] Address @ssahasra's review feedback

- Remove 'dominated' terminology from comments and variable names
  (SSA values always dominate their uses)
- Rename DominatedUses -> Uses throughout
- Remove redundant UseInst != II check in ICmp block
- Fix code formatting (clang-format)
- Split long comment lines
- Remove extra blank lines at EOF
---
 .../InstCombine/InstCombineCalls.cpp          | 46 +++++++++----------
 llvm/test/Transforms/InstCombine/assume.ll    |  4 +-
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index e29b7294b3b02..7d585f63383fd 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3549,14 +3549,15 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
-    // Basic assume equality optimization: assume(x == c) -> replace dominated uses of x with c
+    // Basic assume equality optimization: assume(x == c) -> replace uses of x
+    // with c
     if (auto *ICmp = dyn_cast<ICmpInst>(IIOperand)) {
       if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) {
         Value *LHS = ICmp->getOperand(0);
         Value *RHS = ICmp->getOperand(1);
         Value *Variable = nullptr;
         Constant *ConstantVal = nullptr;
-        
+
         if (auto *C = dyn_cast<Constant>(RHS)) {
           Variable = LHS;
           ConstantVal = C;
@@ -3564,24 +3565,24 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
           Variable = RHS;
           ConstantVal = C;
         }
-        
+
         if (Variable && ConstantVal && Variable->hasUseList()) {
-          SmallVector<Use *, 8> DominatedUses;
+          SmallVector<Use *, 8> Uses;
           for (Use &U : Variable->uses()) {
             if (auto *UseInst = dyn_cast<Instruction>(U.getUser())) {
-              if (UseInst != II && UseInst != ICmp &&
+              if (UseInst != ICmp &&
                   isValidAssumeForContext(II, UseInst, &DT)) {
-                DominatedUses.push_back(&U);
+                Uses.push_back(&U);
               }
             }
           }
-          
-          for (Use *U : DominatedUses) {
+
+          for (Use *U : Uses) {
             U->set(ConstantVal);
             Worklist.pushValue(U->getUser());
           }
-          
-          if (!DominatedUses.empty()) {
+
+          if (!Uses.empty()) {
             Worklist.pushValue(Variable);
           }
         }
@@ -3590,31 +3591,32 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
 
     // Optimize AMDGPU ballot patterns in assumes:
     // assume(ballot(cmp) == -1) means cmp is true on all active lanes
-    // We can replace uses of cmp with true in dominated contexts
+    // We can replace uses of cmp with true
     Value *BallotInst;
-    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst), m_AllOnes()))) {
+    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst),
+                                        m_AllOnes()))) {
       if (auto *IntrCall = dyn_cast<IntrinsicInst>(BallotInst)) {
         if (IntrCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
           Value *BallotArg = IntrCall->getArgOperand(0);
           if (BallotArg->getType()->isIntegerTy(1) && BallotArg->hasUseList()) {
-            // Find dominated uses and replace with true
-            SmallVector<Use *, 8> DominatedUses;
+            // Find uses and replace with true
+            SmallVector<Use *, 8> Uses;
             for (Use &U : BallotArg->uses()) {
               if (auto *UseInst = dyn_cast<Instruction>(U.getUser())) {
-                if (UseInst != II && UseInst != IntrCall &&
+                if (UseInst != IntrCall &&
                     isValidAssumeForContext(II, UseInst, &DT)) {
-                  DominatedUses.push_back(&U);
+                  Uses.push_back(&U);
                 }
               }
             }
-            
-            // Replace dominated uses with true
-            for (Use *U : DominatedUses) {
+
+            // Replace uses with true
+            for (Use *U : Uses) {
               U->set(ConstantInt::getTrue(BallotArg->getType()));
               Worklist.pushValue(U->getUser());
             }
-            
-            if (!DominatedUses.empty()) {
+
+            if (!Uses.empty()) {
               Worklist.pushValue(BallotArg);
             }
           }
@@ -5099,5 +5101,3 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call,
   Call.setCalledFunction(FTy, NestF);
   return &Call;
 }
-
-
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index bc0dd4398b5d6..3279986121e34 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -1056,9 +1056,9 @@ define i1 @neg_assume_trunc_eq_one(i8 %x) {
   ret i1 %q
 }
 
-; Test AMDGPU ballot pattern optimization  
+; Test AMDGPU ballot pattern optimization
 ; assume(ballot(cmp) == -1) means cmp is true on all active lanes
-; so dominated uses of cmp can be replaced with true
+; so uses of cmp can be replaced with true
 define void @assume_ballot_uniform(i32 %x) {
 ; CHECK-LABEL: @assume_ballot_uniform(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0

>From f4c3bcb5880fe5ec29de2ee25a9d516476774167 Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Tue, 7 Oct 2025 23:08:11 +0530
Subject: [PATCH 6/8] Address feedback on the location of the opt

- Remove redundant constant propagation (assume equality opt) from InstCombine.
- Move the assume(ballot(cmp) == -1) optimization from InstCombine to GVN.
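
A minimal IR sketch of the ballot pattern now expected to be handled in
GVN (illustrative; it mirrors the earlier InstCombine tests):

  %cmp = icmp eq i32 %x, 0
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  %all = icmp eq i64 %ballot, -1
  call void @llvm.assume(i1 %all)
  br i1 %cmp, label %foo, label %bar   ; GVN can fold %cmp here to true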
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     | 16 ++--
 .../InstCombine/InstCombineCalls.cpp          | 75 -------------------
 llvm/lib/Transforms/Scalar/GVN.cpp            |  1 +
 llvm/test/Transforms/GVN/assume-equal.ll      | 54 +++++++++++++
 .../amdgpu-ballot-constant-fold.ll            | 56 --------------
 llvm/test/Transforms/InstCombine/assume.ll    | 31 +-------
 6 files changed, 62 insertions(+), 171 deletions(-)
 delete mode 100644 llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index fc4e64fcd52a1..4fe5d00679436 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1322,7 +1322,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     if (isa<PoisonValue>(Arg))
       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
 
-    // For Wave32 targets, convert i64 ballot to i32 ballot + zext
+    if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
+      if (Src->isZero()) {
+        // amdgcn.ballot(i1 0) is zero.
+        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
+      }
+    }
     if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
       // %b64 = call i64 ballot.i64(...)
       // =>
@@ -1336,15 +1341,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       Call->takeName(&II);
       return IC.replaceInstUsesWith(II, Call);
     }
-
-    if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
-      if (Src->isZero()) {
-        // amdgcn.ballot(i1 0) is zero.
-        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
-      }
-      // Note: ballot(true) is NOT constant folded because the result depends
-      // on the active lanes in the wavefront, not just the condition value.
-    }
     break;
   }
   case Intrinsic::amdgcn_wavefrontsize: {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 7d585f63383fd..92fca90ddb88a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3549,81 +3549,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
-    // Basic assume equality optimization: assume(x == c) -> replace uses of x
-    // with c
-    if (auto *ICmp = dyn_cast<ICmpInst>(IIOperand)) {
-      if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) {
-        Value *LHS = ICmp->getOperand(0);
-        Value *RHS = ICmp->getOperand(1);
-        Value *Variable = nullptr;
-        Constant *ConstantVal = nullptr;
-
-        if (auto *C = dyn_cast<Constant>(RHS)) {
-          Variable = LHS;
-          ConstantVal = C;
-        } else if (auto *C = dyn_cast<Constant>(LHS)) {
-          Variable = RHS;
-          ConstantVal = C;
-        }
-
-        if (Variable && ConstantVal && Variable->hasUseList()) {
-          SmallVector<Use *, 8> Uses;
-          for (Use &U : Variable->uses()) {
-            if (auto *UseInst = dyn_cast<Instruction>(U.getUser())) {
-              if (UseInst != ICmp &&
-                  isValidAssumeForContext(II, UseInst, &DT)) {
-                Uses.push_back(&U);
-              }
-            }
-          }
-
-          for (Use *U : Uses) {
-            U->set(ConstantVal);
-            Worklist.pushValue(U->getUser());
-          }
-
-          if (!Uses.empty()) {
-            Worklist.pushValue(Variable);
-          }
-        }
-      }
-    }
-
-    // Optimize AMDGPU ballot patterns in assumes:
-    // assume(ballot(cmp) == -1) means cmp is true on all active lanes
-    // We can replace uses of cmp with true
-    Value *BallotInst;
-    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst),
-                                        m_AllOnes()))) {
-      if (auto *IntrCall = dyn_cast<IntrinsicInst>(BallotInst)) {
-        if (IntrCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
-          Value *BallotArg = IntrCall->getArgOperand(0);
-          if (BallotArg->getType()->isIntegerTy(1) && BallotArg->hasUseList()) {
-            // Find uses and replace with true
-            SmallVector<Use *, 8> Uses;
-            for (Use &U : BallotArg->uses()) {
-              if (auto *UseInst = dyn_cast<Instruction>(U.getUser())) {
-                if (UseInst != IntrCall &&
-                    isValidAssumeForContext(II, UseInst, &DT)) {
-                  Uses.push_back(&U);
-                }
-              }
-            }
-
-            // Replace uses with true
-            for (Use *U : Uses) {
-              U->set(ConstantInt::getTrue(BallotArg->getType()));
-              Worklist.pushValue(U->getUser());
-            }
-
-            if (!Uses.empty()) {
-              Worklist.pushValue(BallotArg);
-            }
-          }
-        }
-      }
-    }
-
     // If there is a dominating assume with the same condition as this one,
     // then this one is redundant, and should be removed.
     KnownBits Known(1);
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 72e1131a54a86..958826aba2699 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -54,6 +54,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
diff --git a/llvm/test/Transforms/GVN/assume-equal.ll b/llvm/test/Transforms/GVN/assume-equal.ll
index a38980169fc52..54e5267e573b3 100644
--- a/llvm/test/Transforms/GVN/assume-equal.ll
+++ b/llvm/test/Transforms/GVN/assume-equal.ll
@@ -462,6 +462,60 @@ define i8 @assume_ptr_eq_same_prov(ptr %p, i64 %x) {
   ret i8 %v
 }
 
+; Test AMDGPU ballot pattern optimization
+; assume(ballot(cmp) == -1) means cmp is true on all active lanes
+; so uses of cmp can be replaced with true
+define void @assume_ballot_const(i32 %x) {
+; CHECK-LABEL: @assume_ballot_const(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+define void @assume_ballot_exec_mask(i32 %x, i64 %exec_mask) {
+; CHECK-LABEL: @assume_ballot_exec_mask(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], %exec_mask
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, %exec_mask
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare noalias ptr @_Znwm(i64)
 declare void @_ZN1AC1Ev(ptr)
 declare void @llvm.assume(i1)
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
deleted file mode 100644
index b146487af9990..0000000000000
--- a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S | FileCheck %s
-
-; Test cases for optimizing AMDGPU ballot intrinsics
-; Focus on constant folding ballot(false) -> 0 and poison handling
-
-; Test ballot with constant false condition gets folded
-define i32 @test_ballot_constant_false() {
-; CHECK-LABEL: @test_ballot_constant_false(
-; CHECK-NEXT:    ret i32 0
-;
-  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 false)
-  ret i32 %ballot
-}
-
-; Test ballot.i64 with constant false condition gets folded
-define i64 @test_ballot_i64_constant_false() {
-; CHECK-LABEL: @test_ballot_i64_constant_false(
-; CHECK-NEXT:    ret i64 0
-;
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 false)
-  ret i64 %ballot
-}
-
-; Test ballot with poison condition gets folded to poison
-define i64 @test_ballot_poison() {
-; CHECK-LABEL: @test_ballot_poison(
-; CHECK-NEXT:    ret i64 poison
-;
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 poison)
-  ret i64 %ballot
-}
-
-; Test that ballot(true) is NOT constant folded (depends on active lanes)
-define i64 @test_ballot_constant_true() {
-; CHECK-LABEL: @test_ballot_constant_true(
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; CHECK-NEXT:    ret i64 [[BALLOT]]
-;
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-  ret i64 %ballot
-}
-
-; Test that ballot with variable condition is not optimized
-define i64 @test_ballot_variable_condition(i32 %x) {
-; CHECK-LABEL: @test_ballot_variable_condition(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    ret i64 [[BALLOT]]
-;
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  ret i64 %ballot
-}
-
-declare i64 @llvm.amdgcn.ballot.i64(i1)
-declare i32 @llvm.amdgcn.ballot.i32(i1)
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index 3279986121e34..cc87d6542fa12 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -104,7 +104,7 @@ define i32 @simple(i32 %a) #1 {
 ; CHECK-LABEL: @simple(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 4
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
-; CHECK-NEXT:    ret i32 4
+; CHECK-NEXT:    ret i32 [[A]]
 ;
   %cmp = icmp eq i32 %a, 4
   tail call void @llvm.assume(i1 %cmp)
@@ -1056,35 +1056,6 @@ define i1 @neg_assume_trunc_eq_one(i8 %x) {
   ret i1 %q
 }
 
-; Test AMDGPU ballot pattern optimization
-; assume(ballot(cmp) == -1) means cmp is true on all active lanes
-; so uses of cmp can be replaced with true
-define void @assume_ballot_uniform(i32 %x) {
-; CHECK-LABEL: @assume_ballot_uniform(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare void @use(i1)
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 

>From f4a87ff11762c2a56bf0bfa2bb66bccce37882fd Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Fri, 10 Oct 2025 18:35:00 +0530
Subject: [PATCH 7/8] Refactor the ballot optimization condition into the
 propagateEquality method
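
The mechanics, sketched against the IR from the tests (the seeding comes
from GVN's existing assume handling):

  %all = icmp eq i64 %ballot, -1
  call void @llvm.assume(i1 %all)
  ; From the assume, propagateEquality learns %ballot == -1; the new rule
  ; recognizes %ballot as amdgcn.ballot(%cmp) and pushes (%cmp, true) onto
  ; the same worklist, so dominated uses of %cmp are replaced with true.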

---
 llvm/lib/Transforms/Scalar/GVN.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 958826aba2699..4290d68d9cac6 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2540,6 +2540,21 @@ bool GVNPass::propagateEquality(
       }
     }
 
+    // If "ballot(cond) == -1" or "ballot(cond) == exec_mask" then cond is true
+    // on all active lanes, so cond can be replaced with true.
+    if (IntrinsicInst *IntrCall = dyn_cast<IntrinsicInst>(LHS)) {
+      if (IntrCall->getIntrinsicID() ==
+          Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
+        Value *BallotArg = IntrCall->getArgOperand(0);
+        if (BallotArg->getType()->isIntegerTy(1) &&
+            (match(RHS, m_AllOnes()) || !isa<Constant>(RHS))) {
+          Worklist.push_back(std::make_pair(
+              BallotArg, ConstantInt::getTrue(BallotArg->getType())));
+          continue;
+        }
+      }
+    }
+
     // Now try to deduce additional equalities from this one. For example, if
     // the known equality was "(A != B)" == "false" then it follows that A and B
     // are equal in the scope. Only boolean equalities with an explicit true or

>From e6930aa3dbc30197ae04cbf98bf0205449fd7e5e Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Fri, 31 Oct 2025 11:53:13 +0530
Subject: [PATCH 8/8] Implement reviewer's suggestions: 1. Add logic to
 handle swapped operands in icmp 2. Introduce preliminary logic for
 identifying an exec mask 3. Add a separate test file with
 comprehensive cases for ballot with assume
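
The exec-mask check is deliberately narrow for now: only ballot(true) is
treated as the current exec mask, since its result is exactly the set of
active lanes. A sketch of the accepted shape (the icmp operands may appear
in either order, which is why both sides are checked):

  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  %exec = call i64 @llvm.amdgcn.ballot.i64(i1 true)   ; current exec mask
  %all = icmp eq i64 %ballot, %exec
  call void @llvm.assume(i1 %all)                     ; implies %cmp is true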

---
 .github/copilot-instructions.md           |   4 -
 llvm/lib/Transforms/Scalar/GVN.cpp        |  58 ++-
 llvm/test/Transforms/GVN/assume-ballot.ll | 445 ++++++++++++++++++++++
 llvm/test/Transforms/GVN/assume-equal.ll  |  54 ---
 4 files changed, 495 insertions(+), 66 deletions(-)
 delete mode 100644 .github/copilot-instructions.md
 create mode 100644 llvm/test/Transforms/GVN/assume-ballot.ll

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
deleted file mode 100644
index 03748938700e3..0000000000000
--- a/.github/copilot-instructions.md
+++ /dev/null
@@ -1,4 +0,0 @@
-When performing a code review, pay close attention to code modifying a function's
-control flow. Could the change result in the corruption of performance profile
-data? Could the change result in invalid debug information, in particular for
-branches and calls?
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 4290d68d9cac6..e452dffd9035a 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2540,18 +2540,60 @@ bool GVNPass::propagateEquality(
       }
     }
 
-    // If "ballot(cond) == -1" or "ballot(cond) == exec_mask" then cond is true
-    // on all active lanes, so cond can be replaced with true.
-    if (IntrinsicInst *IntrCall = dyn_cast<IntrinsicInst>(LHS)) {
-      if (IntrCall->getIntrinsicID() ==
-          Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
-        Value *BallotArg = IntrCall->getArgOperand(0);
-        if (BallotArg->getType()->isIntegerTy(1) &&
-            (match(RHS, m_AllOnes()) || !isa<Constant>(RHS))) {
+    // Helper function to check if a value represents the current exec mask.
+    auto IsExecMask = [](Value *V) -> bool {
+      // Pattern 1: ballot(true)
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(V)) {
+        if (II->getIntrinsicID() == Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
+          // Check if argument is constant true
+          if (match(II->getArgOperand(0), m_One())) {
+            return true;
+          }
+        }
+      }
+
+      return false;
+    };
+
+    // Check if either of the operands is a ballot intrinsic.
+    IntrinsicInst *BallotCall = nullptr;
+    Value *CompareValue = nullptr;
+
+    // Check both LHS and RHS for ballot intrinsic and its value since GVN may
+    // swap the operands.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(LHS)) {
+      if (II->getIntrinsicID() == Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
+        BallotCall = II;
+        CompareValue = RHS;
+      }
+    }
+    if (!BallotCall && isa<IntrinsicInst>(RHS)) {
+      IntrinsicInst *II = cast<IntrinsicInst>(RHS);
+      if (II->getIntrinsicID() == Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
+        BallotCall = II;
+        CompareValue = LHS;
+      }
+    }
+
+    // If a ballot intrinsic is found, compute the truth value of the ballot
+    // argument from the other compare operand.
+    if (BallotCall) {
+      Value *BallotArg = BallotCall->getArgOperand(0);
+      if (BallotArg->getType()->isIntegerTy(1)) {
+        // Case 1: ballot(cond) == -1 or exec_mask: all active lanes true ->
+        // cond = true.
+        if (match(CompareValue, m_AllOnes()) || IsExecMask(CompareValue)) {
           Worklist.push_back(std::make_pair(
               BallotArg, ConstantInt::getTrue(BallotArg->getType())));
           continue;
         }
+        // Case 2: ballot(cond) == 0: all active lanes false ->
+        // cond = false.
+        if (match(CompareValue, m_Zero())) {
+          Worklist.push_back(std::make_pair(
+              BallotArg, ConstantInt::getFalse(BallotArg->getType())));
+          continue;
+        }
       }
     }
 
diff --git a/llvm/test/Transforms/GVN/assume-ballot.ll b/llvm/test/Transforms/GVN/assume-ballot.ll
new file mode 100644
index 0000000000000..eef98c3e81a2d
--- /dev/null
+++ b/llvm/test/Transforms/GVN/assume-ballot.ll
@@ -0,0 +1,445 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=gvn -S | FileCheck %s
+;
+; Tests for assume-based ballot optimizations
+; This optimization recognizes patterns like:
+;   assume(ballot(cmp) == -1) -> cmp is true on all lanes
+;   assume(ballot(cmp) == 0)  -> cmp is false on all lanes
+
+declare void @llvm.assume(i1)
+declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i32 @llvm.amdgcn.ballot.i32(i1)
+
+; ============================================================================
+; POSITIVE CASES
+; ============================================================================
+
+; Test 1: assume(ballot(cmp) == -1) -> cmp replaced with true
+define amdgpu_kernel void @assume_ballot_all_lanes_i64(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_all_lanes_i64(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  store i32 1, ptr addrspace(1) %out
+  ret void
+bar:
+  store i32 0, ptr addrspace(1) %out
+  ret void
+}
+
+; Test 2: assume(ballot(cmp) == 0) -> cmp replaced with false
+define amdgpu_kernel void @assume_ballot_no_lanes_i64(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_no_lanes_i64(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[NONE:%.*]] = icmp eq i64 [[BALLOT]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[NONE]])
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %none = icmp eq i64 %ballot, 0
+  call void @llvm.assume(i1 %none)
+  %sel = select i1 %cmp, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test 3: ballot(cmp) == ballot(true) (exec mask) -> cmp should fold to true (not yet folded; see CHECK lines)
+define amdgpu_kernel void @assume_ballot_exec_mask_ballot_true(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_exec_mask_ballot_true(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[EXEC:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], [[EXEC]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %exec = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+  %all = icmp eq i64 %ballot, %exec
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 4: assume(ballot(cmp) == -1) -> cmp replaced with true (wave32 variant with ballot.i32)
+define amdgpu_kernel void @assume_ballot_all_lanes_i32(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_all_lanes_i32(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i32 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+  %all = icmp eq i32 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 5: assume(ballot(cmp) == 0) -> cmp replaced with false (wave32 variant with ballot.i32)
+define amdgpu_kernel void @assume_ballot_no_lanes_i32(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_no_lanes_i32(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[CMP]])
+; CHECK-NEXT:    [[NONE:%.*]] = icmp eq i32 [[BALLOT]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[NONE]])
+; CHECK-NEXT:    br i1 false, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+  %none = icmp eq i32 %ballot, 0
+  call void @llvm.assume(i1 %none)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 6: ballot(cmp) == ballot(true) (exec mask, wave32 variant with ballot.i32) -> cmp should fold to true (not yet folded; see CHECK lines)
+define amdgpu_kernel void @assume_ballot_exec_mask_wave32(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_exec_mask_wave32(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[CMP]])
+; CHECK-NEXT:    [[EXEC:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 true)
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i32 [[BALLOT]], [[EXEC]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+  %exec = call i32 @llvm.amdgcn.ballot.i32(i1 true)
+  %all = icmp eq i32 %ballot, %exec
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 7: Dominance -> only dominated uses replaced with truth values
+define amdgpu_kernel void @assume_ballot_dominance(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_dominance(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[USE_BEFORE:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    store i32 [[USE_BEFORE]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[OUT2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %use_before = zext i1 %cmp to i32
+  store i32 %use_before, ptr addrspace(1) %out
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  %use_after = zext i1 %cmp to i32
+  %out2 = getelementptr i32, ptr addrspace(1) %out, i64 1
+  store i32 %use_after, ptr addrspace(1) %out2
+  ret void
+}
+
+; Test 8: Swapped operands in icmp -> cmp replaced with true
+define amdgpu_kernel void @assume_ballot_swapped(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 -1, [[BALLOT]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 -1, %ballot
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 9: Swapped operands in icmp (exec mask) -> cmp should fold to true (not yet folded; see CHECK lines)
+define amdgpu_kernel void @assume_ballot_exec_mask_swapped(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_exec_mask_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[EXEC:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[EXEC]], [[BALLOT]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %exec = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+  %all = icmp eq i64 %exec, %ballot
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 10: Multiple uses of cmp after assume -> uses 1 & 2 replaced with truth values
+define amdgpu_kernel void @assume_ballot_multiple_uses(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_multiple_uses(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NEXT:    store i32 10, ptr addrspace(1) [[OUT2]], align 4
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  %use1 = zext i1 %cmp to i32
+  store i32 %use1, ptr addrspace(1) %out
+  %use2 = select i1 %cmp, i32 10, i32 20
+  %out2 = getelementptr i32, ptr addrspace(1) %out, i64 1
+  store i32 %use2, ptr addrspace(1) %out2
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 11: Multiple uses of cmp after assume (exec mask) -> uses 1 & 2 should fold to true (not yet folded; see CHECK lines)
+define amdgpu_kernel void @assume_ballot_exec_mask_multiple_uses(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_exec_mask_multiple_uses(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[EXEC:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], [[EXEC]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    [[USE1:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    store i32 [[USE1]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[USE2:%.*]] = select i1 [[CMP]], i32 10, i32 20
+; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NEXT:    store i32 [[USE2]], ptr addrspace(1) [[OUT2]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %exec = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+  %all = icmp eq i64 %ballot, %exec
+  call void @llvm.assume(i1 %all)
+  %use1 = zext i1 %cmp to i32
+  store i32 %use1, ptr addrspace(1) %out
+  %use2 = select i1 %cmp, i32 10, i32 20
+  %out2 = getelementptr i32, ptr addrspace(1) %out, i64 1
+  store i32 %use2, ptr addrspace(1) %out2
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; ============================================================================
+; NEGATIVE CASES
+; ============================================================================
+
+; Test 1: Arbitrary mask -> cmp should not be transformed
+define amdgpu_kernel void @assume_ballot_arbitrary_mask(i32 %x, i64 %mask, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_arbitrary_mask(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i64 [[BALLOT]], [[MASK:%.*]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[MATCHES]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %matches = icmp eq i64 %ballot, %mask
+  call void @llvm.assume(i1 %matches)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 2: assume(ballot != -1) -> cmp should not be transformed
+define amdgpu_kernel void @assume_ballot_ne_negative(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_ne_negative(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[NOT_ALL:%.*]] = icmp ne i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[NOT_ALL]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %not_all = icmp ne i64 %ballot, -1
+  call void @llvm.assume(i1 %not_all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 3: assume(ballot != 0) -> cmp should not be transformed
+define amdgpu_kernel void @assume_ballot_ne_zero_negative(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_ne_zero_negative(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[SOME:%.*]] = icmp ne i64 [[BALLOT]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[SOME]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %some = icmp ne i64 %ballot, 0
+  call void @llvm.assume(i1 %some)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 4: Constant as mask value (other than -1 or 0) -> cmp should not be transformed
+define amdgpu_kernel void @assume_ballot_constant_mask(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_constant_mask(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i64 [[BALLOT]], 255
+; CHECK-NEXT:    call void @llvm.assume(i1 [[MATCHES]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %matches = icmp eq i64 %ballot, 255  ; partial mask
+  call void @llvm.assume(i1 %matches)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 5: ballot(cmp) == ballot(false) -> cmp should not be transformed (ballot(false) is not the exec mask)
+define amdgpu_kernel void @assume_ballot_not_exec_mask(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_not_exec_mask(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[NOT_EXEC:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 false)
+; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i64 [[BALLOT]], [[NOT_EXEC]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[MATCHES]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %not_exec = call i64 @llvm.amdgcn.ballot.i64(i1 false)
+  %matches = icmp eq i64 %ballot, %not_exec
+  call void @llvm.assume(i1 %matches)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
diff --git a/llvm/test/Transforms/GVN/assume-equal.ll b/llvm/test/Transforms/GVN/assume-equal.ll
index 54e5267e573b3..a38980169fc52 100644
--- a/llvm/test/Transforms/GVN/assume-equal.ll
+++ b/llvm/test/Transforms/GVN/assume-equal.ll
@@ -462,60 +462,6 @@ define i8 @assume_ptr_eq_same_prov(ptr %p, i64 %x) {
   ret i8 %v
 }
 
-; Test AMDGPU ballot pattern optimization
-; assume(ballot(cmp) == -1) means cmp is true on all active lanes
-; so uses of cmp can be replaced with true
-define void @assume_ballot_const(i32 %x) {
-; CHECK-LABEL: @assume_ballot_const(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-define void @assume_ballot_exec_mask(i32 %x, i64 %exec_mask) {
-; CHECK-LABEL: @assume_ballot_exec_mask(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], %exec_mask
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, %exec_mask
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare noalias ptr @_Znwm(i64)
 declare void @_ZN1AC1Ev(ptr)
 declare void @llvm.assume(i1)


