[llvm] [AMDGPU][InstCombine] Fold ballot intrinsic based on llvm.assume hints (PR #160670)

Teja Alaghari via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 10 08:53:11 PST 2025


https://github.com/TejaX-Alaghari updated https://github.com/llvm/llvm-project/pull/160670

>From 3e0532bdab99f9bd58ccf5a44856a76db50901bd Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Thu, 25 Sep 2025 14:27:36 +0530
Subject: [PATCH 1/9] [InstCombine] Optimize AMDGPU ballot + assume uniformity
 patterns

When we encounter assume(ballot(cmp) == -1), we know that cmp is uniform
across all lanes and evaluates to true. The optimization recognizes this
pattern and replaces the condition with constant true, allowing
subsequent passes to eliminate dead code and optimize control flow.

The optimization handles both i32 and i64 ballot intrinsics and only
applies when the ballot result is compared against -1 (all lanes active).
This is a conservative approach that ensures correctness while enabling
significant optimizations for uniform control flow patterns.
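
For example (a minimal IR sketch taken from the accompanying test; value
names are illustrative):

  %cmp = icmp eq i32 %x, 0
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  %all = icmp eq i64 %ballot, -1
  call void @llvm.assume(i1 %all)
  br i1 %cmp, label %foo, label %bar  ; becomes: br i1 true, ...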
---
 .../InstCombine/InstCombineCalls.cpp          |  33 ++++++
 .../amdgpu-assume-ballot-uniform.ll           | 108 ++++++++++++++++++
 2 files changed, 141 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 92fca90ddb88a..06fb168233211 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3549,6 +3549,39 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
+    // Optimize AMDGPU ballot uniformity assumptions:
+    // assume(icmp eq (ballot(cmp), -1)) implies that cmp is uniform and true
+    // This allows us to optimize away the ballot and replace cmp with true
+    Value *BallotInst;
+    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst),
+                                        m_AllOnes()))) {
+      // Check if this is an AMDGPU ballot intrinsic
+      if (auto *BallotCall = dyn_cast<IntrinsicInst>(BallotInst)) {
+        if (BallotCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
+          Value *BallotCondition = BallotCall->getArgOperand(0);
+
+          // If ballot(cmp) == -1, then cmp is uniform across all lanes and
+          // evaluates to true. We can safely replace BallotCondition with
+          // true, since ballot == -1 implies all lanes are true.
+          if (BallotCondition->getType()->isIntOrIntVectorTy(1) &&
+              !isa<Constant>(BallotCondition)) {
+
+            // Add the condition to the worklist for further optimization
+            Worklist.pushValue(BallotCondition);
+
+            // Replace BallotCondition with true
+            BallotCondition->replaceAllUsesWith(
+                ConstantInt::getTrue(BallotCondition->getType()));
+
+            // The assumption is now always true, so we can simplify it
+            replaceUse(II->getOperandUse(0),
+                       ConstantInt::getTrue(II->getContext()));
+            return II;
+          }
+        }
+      }
+    }
+
     // If there is a dominating assume with the same condition as this one,
     // then this one is redundant, and should be removed.
     KnownBits Known(1);
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
new file mode 100644
index 0000000000000..3bf3b317b0771
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
@@ -0,0 +1,108 @@
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; Test case for optimizing AMDGPU ballot + assume patterns
+; When we assume that ballot(cmp) == -1, we know that cmp is uniform
+; This allows us to optimize away the ballot and directly branch
+
+define void @test_assume_ballot_uniform(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_uniform(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Test case with partial optimization - only ballot removal without branch optimization
+define void @test_assume_ballot_partial(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_partial(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Negative test - ballot not compared to -1
+define void @test_assume_ballot_not_uniform(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_not_uniform(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[SOME:%.*]] = icmp ne i64 [[BALLOT]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[SOME]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %some = icmp ne i64 %ballot, 0
+  call void @llvm.assume(i1 %some)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Test with 32-bit ballot
+define void @test_assume_ballot_uniform_i32(i32 %x) {
+; CHECK-LABEL: @test_assume_ballot_uniform_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+  %all = icmp eq i32 %ballot, -1  
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare void @llvm.assume(i1)

>From 654e36532abbe87b0309269aa8374b5b47643108 Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Mon, 29 Sep 2025 12:50:10 +0530
Subject: [PATCH 2/9] [InstCombine] Add constant folding for AMDGPU ballot
 intrinsics

Address reviewer feedback by implementing a free-form ballot intrinsic
optimization instead of assume-dependent patterns. This approach:

1. Optimizes ballot(constant) directly as a standard intrinsic optimization
2. Allows uniformity analysis to handle assumes through proper channels
3. Follows established AMDGPU intrinsic patterns (amdgcn_cos, amdgcn_sin)
4. Enables broader optimization opportunities beyond assume contexts

Optimizations:
- ballot(true) -> -1 (all lanes active)
- ballot(false) -> 0 (no lanes active)

This addresses the core reviewer concern about performing the optimization
in an assume context rather than as a free-form pattern, and lets the
uniformity analysis framework handle assumes as intended.

Test cases focus on constant folding rather than assume-specific patterns,
demonstrating the more general applicability of this approach.
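
As a sketch, the folds this patch intends, in IR:

  %a = call i64 @llvm.amdgcn.ballot.i64(i1 true)   ; folds to i64 -1
  %n = call i64 @llvm.amdgcn.ballot.i64(i1 false)  ; folds to i64 0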
---
 .../InstCombine/InstCombineCalls.cpp          |  51 +++-----
 .../InstCombine/InstCombineInternal.h         |   2 +
 .../amdgpu-assume-ballot-uniform.ll           | 108 -----------------
 .../amdgpu-ballot-constant-fold.ll            | 109 ++++++++++++++++++
 4 files changed, 130 insertions(+), 140 deletions(-)
 delete mode 100644 llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
 create mode 100644 llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 06fb168233211..a7cbb8899bd38 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -85,6 +85,8 @@ using namespace PatternMatch;
 
 STATISTIC(NumSimplified, "Number of library calls simplified");
 
+
+
 static cl::opt<unsigned> GuardWideningWindow(
     "instcombine-guard-widening-window",
     cl::init(3),
@@ -2996,6 +2998,20 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
+  case Intrinsic::amdgcn_ballot: {
+    // Optimize ballot intrinsics when the condition is known to be uniform
+    Value *Condition = II->getArgOperand(0);
+    
+    // If the condition is a constant, we can evaluate the ballot directly
+    if (auto *ConstCond = dyn_cast<ConstantInt>(Condition)) {
+      // ballot(true) -> -1 (all lanes active)
+      // ballot(false) -> 0 (no lanes active)
+      uint64_t Result = ConstCond->isOne() ? ~0ULL : 0ULL;
+      return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
+    }
+    
+    break;
+  }
   case Intrinsic::ldexp: {
     // ldexp(ldexp(x, a), b) -> ldexp(x, a + b)
     //
@@ -3549,38 +3565,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
-    // Optimize AMDGPU ballot uniformity assumptions:
-    // assume(icmp eq (ballot(cmp), -1)) implies that cmp is uniform and true
-    // This allows us to optimize away the ballot and replace cmp with true
-    Value *BallotInst;
-    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst),
-                                        m_AllOnes()))) {
-      // Check if this is an AMDGPU ballot intrinsic
-      if (auto *BallotCall = dyn_cast<IntrinsicInst>(BallotInst)) {
-        if (BallotCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
-          Value *BallotCondition = BallotCall->getArgOperand(0);
-
-          // If ballot(cmp) == -1, then cmp is uniform across all lanes and
-          // evaluates to true. We can safely replace BallotCondition with
-          // true, since ballot == -1 implies all lanes are true.
-          if (BallotCondition->getType()->isIntOrIntVectorTy(1) &&
-              !isa<Constant>(BallotCondition)) {
-
-            // Add the condition to the worklist for further optimization
-            Worklist.pushValue(BallotCondition);
-
-            // Replace BallotCondition with true
-            BallotCondition->replaceAllUsesWith(
-                ConstantInt::getTrue(BallotCondition->getType()));
-
-            // The assumption is now always true, so we can simplify it
-            replaceUse(II->getOperandUse(0),
-                       ConstantInt::getTrue(II->getContext()));
-            return II;
-          }
-        }
-      }
-    }
+
 
     // If there is a dominating assume with the same condition as this one,
     // then this one is redundant, and should be removed.
@@ -3595,6 +3580,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       return eraseInstFromFunction(*II);
     }
 
+
+
     // Update the cache of affected values for this assumption (we might be
     // here because we just simplified the condition).
     AC.updateAffectedValues(cast<AssumeInst>(II));
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 9bdd8cb71f7f3..bdccc2e59f234 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -124,6 +124,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
       BinaryOperator &I);
   Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract(
       BinaryOperator &OldAShr);
+  
+
   Instruction *visitAShr(BinaryOperator &I);
   Instruction *visitLShr(BinaryOperator &I);
   Instruction *commonShiftTransforms(BinaryOperator &I);
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
deleted file mode 100644
index 3bf3b317b0771..0000000000000
--- a/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll
+++ /dev/null
@@ -1,108 +0,0 @@
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-
-; Test case for optimizing AMDGPU ballot + assume patterns
-; When we assume that ballot(cmp) == -1, we know that cmp is uniform
-; This allows us to optimize away the ballot and directly branch
-
-define void @test_assume_ballot_uniform(i32 %x) {
-; CHECK-LABEL: @test_assume_ballot_uniform(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-; Test case with partial optimization - only ballot removal without branch optimization
-define void @test_assume_ballot_partial(i32 %x) {
-; CHECK-LABEL: @test_assume_ballot_partial(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-; Negative test - ballot not compared to -1
-define void @test_assume_ballot_not_uniform(i32 %x) {
-; CHECK-LABEL: @test_assume_ballot_not_uniform(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[SOME:%.*]] = icmp ne i64 [[BALLOT]], 0
-; CHECK-NEXT:    call void @llvm.assume(i1 [[SOME]])
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %some = icmp ne i64 %ballot, 0
-  call void @llvm.assume(i1 %some)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-; Test with 32-bit ballot
-define void @test_assume_ballot_uniform_i32(i32 %x) {
-; CHECK-LABEL: @test_assume_ballot_uniform_i32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
-  %all = icmp eq i32 %ballot, -1  
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-declare i64 @llvm.amdgcn.ballot.i64(i1)
-declare i32 @llvm.amdgcn.ballot.i32(i1)
-declare void @llvm.assume(i1)
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
new file mode 100644
index 0000000000000..6180760f7d511
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
@@ -0,0 +1,109 @@
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; Test cases for optimizing AMDGPU ballot intrinsics
+; Focus on constant folding ballot(true) -> -1 and ballot(false) -> 0
+
+define void @test_ballot_constant_true() {
+; CHECK-LABEL: @test_ballot_constant_true(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 -1, -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 true, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+define void @test_ballot_constant_false() {
+; CHECK-LABEL: @test_ballot_constant_false(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONE:%.*]] = icmp ne i64 0, 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[NONE]])
+; CHECK-NEXT:    br i1 false, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 false)
+  %none = icmp ne i64 %ballot, 0
+  call void @llvm.assume(i1 %none)
+  br i1 false, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Test with 32-bit ballot constants
+define void @test_ballot_i32_constant_true() {
+; CHECK-LABEL: @test_ballot_i32_constant_true(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i32 -1, -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 true)
+  %all = icmp eq i32 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 true, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+; Negative test - variable condition should not be optimized
+define void @test_ballot_variable_condition(i32 %x) {
+; CHECK-LABEL: @test_ballot_variable_condition(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare void @llvm.assume(i1)

>From 2947714f589ef41a5613335ef1a478607d51636a Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Mon, 29 Sep 2025 21:24:37 +0530
Subject: [PATCH 3/9] [InstCombine] Implement generic assume-based uniformity
 optimization

Implement a generic optimization for assume intrinsics that extracts
uniformity information and optimizes dominated uses. The optimization
recognizes multiple patterns that establish value uniformity and replaces
dominated uses with uniform constants.

Addresses uniformity analysis optimization opportunities identified in
AMDGPU ballot/readfirstlane + assume patterns for improved code generation
through constant propagation.
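
A sketch of the recognized patterns (the overloaded readfirstlane intrinsic
name is assumed; value names are illustrative):

  ; assume(ballot(cmp) == -1) -> cmp is uniform and true
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  %all = icmp eq i64 %ballot, -1
  call void @llvm.assume(i1 %all)

  ; assume(readfirstlane(x) == c) -> x is uniform and equals c
  %first = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %x)
  %eq = icmp eq i32 %first, 7
  call void @llvm.assume(i1 %eq)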
---
 .github/copilot-instructions.md               |   4 +
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     |  16 ++-
 .../InstCombine/InstCombineCalls.cpp          | 136 +++++++++++++++---
 .../InstCombine/InstCombineInternal.h         |   5 +-
 .../amdgpu-ballot-constant-fold.ll            | 117 +++++----------
 llvm/test/Transforms/InstCombine/assume.ll    |  30 +++-
 6 files changed, 195 insertions(+), 113 deletions(-)
 create mode 100644 .github/copilot-instructions.md

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
new file mode 100644
index 0000000000000..03748938700e3
--- /dev/null
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,4 @@
+When performing a code review, pay close attention to code modifying a function's
+control flow. Could the change result in the corruption of performance profile
+data? Could the change result in invalid debug information, in particular for
+branches and calls?
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 4fe5d00679436..fc4e64fcd52a1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1322,12 +1322,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     if (isa<PoisonValue>(Arg))
       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
 
-    if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
-      if (Src->isZero()) {
-        // amdgcn.ballot(i1 0) is zero.
-        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
-      }
-    }
+    // For Wave32 targets, convert i64 ballot to i32 ballot + zext
     if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
       // %b64 = call i64 ballot.i64(...)
       // =>
@@ -1341,6 +1336,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       Call->takeName(&II);
       return IC.replaceInstUsesWith(II, Call);
     }
+
+    if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
+      if (Src->isZero()) {
+        // amdgcn.ballot(i1 0) is zero.
+        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
+      }
+      // Note: ballot(true) is NOT constant folded because the result depends
+      // on the active lanes in the wavefront, not just the condition value.
+    }
     break;
   }
   case Intrinsic::amdgcn_wavefrontsize: {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index a7cbb8899bd38..663df634f4633 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -85,8 +85,6 @@ using namespace PatternMatch;
 
 STATISTIC(NumSimplified, "Number of library calls simplified");
 
-
-
 static cl::opt<unsigned> GuardWideningWindow(
     "instcombine-guard-widening-window",
     cl::init(3),
@@ -2998,20 +2996,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
-  case Intrinsic::amdgcn_ballot: {
-    // Optimize ballot intrinsics when the condition is known to be uniform
-    Value *Condition = II->getArgOperand(0);
-    
-    // If the condition is a constant, we can evaluate the ballot directly
-    if (auto *ConstCond = dyn_cast<ConstantInt>(Condition)) {
-      // ballot(true) -> -1 (all lanes active)
-      // ballot(false) -> 0 (no lanes active)
-      uint64_t Result = ConstCond->isOne() ? ~0ULL : 0ULL;
-      return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
-    }
-    
-    break;
-  }
   case Intrinsic::ldexp: {
     // ldexp(ldexp(x, a), b) -> ldexp(x, a + b)
     //
@@ -3565,8 +3549,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
-
-
     // If there is a dominating assume with the same condition as this one,
     // then this one is redundant, and should be removed.
     KnownBits Known(1);
@@ -3580,7 +3562,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       return eraseInstFromFunction(*II);
     }
 
-
+    // Try to extract uniformity information from the assume and optimize
+    // dominated uses of any variables that are established as uniform.
+    optimizeAssumedUniformValues(cast<AssumeInst>(II));
 
     // Update the cache of affected values for this assumption (we might be
     // here because we just simplified the condition).
@@ -5046,3 +5030,117 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call,
   Call.setCalledFunction(FTy, NestF);
   return &Call;
 }
+
+/// Extract uniformity information from assume and optimize dominated uses.
+/// This works with any assume pattern that establishes value uniformity.
+void InstCombinerImpl::optimizeAssumedUniformValues(AssumeInst *Assume) {
+  Value *AssumedCondition = Assume->getArgOperand(0);
+  
+  // Map of uniform values to their uniform constants
+  SmallDenseMap<Value *, Constant *> UniformValues;
+  
+  // Pattern 1: assume(icmp eq (X, C)) -> X is uniform and equals C
+  if (auto *ICmp = dyn_cast<ICmpInst>(AssumedCondition)) {
+    if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) {
+      Value *LHS = ICmp->getOperand(0);
+      Value *RHS = ICmp->getOperand(1);
+      
+      // X == constant -> X is uniform and equals constant
+      if (auto *C = dyn_cast<Constant>(RHS)) {
+        UniformValues[LHS] = C;
+      } else if (auto *C = dyn_cast<Constant>(LHS)) {
+        UniformValues[RHS] = C;
+      }
+      
+      // Handle intrinsic patterns in equality comparisons
+      // Pattern: assume(ballot(cmp) == -1) -> cmp is uniform and true
+      if (auto *IntrinsicCall = dyn_cast<IntrinsicInst>(LHS)) {
+        if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
+          if (match(RHS, m_AllOnes())) {
+            Value *BallotArg = IntrinsicCall->getArgOperand(0);
+            if (BallotArg->getType()->isIntegerTy(1)) {
+              UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType());
+              
+              // Special case: if BallotArg is an equality comparison, 
+              // we know the operands are equal
+              if (auto *CmpInst = dyn_cast<ICmpInst>(BallotArg)) {
+                if (CmpInst->getPredicate() == ICmpInst::ICMP_EQ) {
+                  Value *CmpLHS = CmpInst->getOperand(0);
+                  Value *CmpRHS = CmpInst->getOperand(1);
+                  
+                  // If one operand is constant, the other is uniform and equals that constant
+                  if (auto *C = dyn_cast<Constant>(CmpRHS)) {
+                    UniformValues[CmpLHS] = C;
+                  } else if (auto *C = dyn_cast<Constant>(CmpLHS)) {
+                    UniformValues[CmpRHS] = C;
+                  }
+                  // TODO: Handle case where both operands are variables
+                }
+              }
+            }
+          }
+        } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) {
+          // assume(readfirstlane(x) == c) -> x is uniform and equals c
+          if (auto *C = dyn_cast<Constant>(RHS)) {
+            Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0);
+            UniformValues[ReadFirstLaneArg] = C;
+          }
+        }
+      }
+      
+      // Handle the reverse case too
+      if (auto *IntrinsicCall = dyn_cast<IntrinsicInst>(RHS)) {
+        if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
+          if (match(LHS, m_AllOnes())) {
+            Value *BallotArg = IntrinsicCall->getArgOperand(0);
+            if (BallotArg->getType()->isIntegerTy(1)) {
+              UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType());
+            }
+          }
+        } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) {
+          if (auto *C = dyn_cast<Constant>(LHS)) {
+            Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0);
+            UniformValues[ReadFirstLaneArg] = C;
+          }
+        }
+      }
+    }
+  }
+  
+  // Pattern 2: assume(X) where X is i1 -> X is uniform and equals true  
+  if (AssumedCondition->getType()->isIntegerTy(1) && !isa<ICmpInst>(AssumedCondition)) {
+    UniformValues[AssumedCondition] = ConstantInt::getTrue(AssumedCondition->getType());
+  }
+  
+  // Now optimize dominated uses of all discovered uniform values
+  for (auto &[UniformValue, UniformConstant] : UniformValues) {
+    SmallVector<Use *, 8> DominatedUses;
+    
+    // Find all uses dominated by the assume
+    // Skip if the value doesn't have a use list (e.g., constants)
+    if (!UniformValue->hasUseList())
+      continue;
+      
+    for (Use &U : UniformValue->uses()) {
+      Instruction *UseInst = dyn_cast<Instruction>(U.getUser());
+      if (!UseInst || UseInst == Assume)
+        continue;
+        
+      // Critical: Check dominance using InstCombine's infrastructure  
+      if (isValidAssumeForContext(Assume, UseInst, &DT)) {
+        DominatedUses.push_back(&U);
+      }
+    }
+    
+    // Replace only dominated uses with the uniform constant
+    for (Use *U : DominatedUses) {
+      U->set(UniformConstant);
+      Worklist.pushValue(U->getUser());
+    }
+    
+    // Mark for further optimization if we made changes
+    if (!DominatedUses.empty()) {
+      Worklist.pushValue(UniformValue);
+    }
+  }
+}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index bdccc2e59f234..17c0f0a46f87f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -124,8 +124,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
       BinaryOperator &I);
   Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract(
       BinaryOperator &OldAShr);
-  
-
   Instruction *visitAShr(BinaryOperator &I);
   Instruction *visitLShr(BinaryOperator &I);
   Instruction *commonShiftTransforms(BinaryOperator &I);
@@ -232,6 +230,9 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
 private:
   bool annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI);
   bool isDesirableIntType(unsigned BitWidth) const;
+  
+  /// Optimize uses of variables that are established as uniform by assume intrinsics.
+  void optimizeAssumedUniformValues(AssumeInst *Assume);
   bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
   bool shouldChangeType(Type *From, Type *To) const;
   Value *dyn_castNegVal(Value *V) const;
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
index 6180760f7d511..b146487af9990 100644
--- a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
+++ b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
@@ -1,109 +1,56 @@
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S | FileCheck %s
 
 ; Test cases for optimizing AMDGPU ballot intrinsics
-; Focus on constant folding ballot(true) -> -1 and ballot(false) -> 0
+; Focus on constant folding ballot(false) -> 0 and poison handling
 
-define void @test_ballot_constant_true() {
-; CHECK-LABEL: @test_ballot_constant_true(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 -1, -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
+; Test ballot with constant false condition gets folded
+define i32 @test_ballot_constant_false() {
+; CHECK-LABEL: @test_ballot_constant_false(
+; CHECK-NEXT:    ret i32 0
 ;
-entry:
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 true, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 false)
+  ret i32 %ballot
 }
 
-define void @test_ballot_constant_false() {
-; CHECK-LABEL: @test_ballot_constant_false(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[NONE:%.*]] = icmp ne i64 0, 0
-; CHECK-NEXT:    call void @llvm.assume(i1 [[NONE]])
-; CHECK-NEXT:    br i1 false, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
+; Test ballot.i64 with constant false condition gets folded
+define i64 @test_ballot_i64_constant_false() {
+; CHECK-LABEL: @test_ballot_i64_constant_false(
+; CHECK-NEXT:    ret i64 0
 ;
-entry:
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 false)
-  %none = icmp ne i64 %ballot, 0
-  call void @llvm.assume(i1 %none)
-  br i1 false, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
+  ret i64 %ballot
 }
 
-; Test with 32-bit ballot constants
-define void @test_ballot_i32_constant_true() {
-; CHECK-LABEL: @test_ballot_i32_constant_true(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i32 -1, -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
+; Test ballot with poison condition gets folded to poison
+define i64 @test_ballot_poison() {
+; CHECK-LABEL: @test_ballot_poison(
+; CHECK-NEXT:    ret i64 poison
 ;
-entry:
-  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 true)
-  %all = icmp eq i32 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 true, label %foo, label %bar
-
-foo:
-  ret void
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 poison)
+  ret i64 %ballot
+}
 
-bar:
-  ret void
+; Test that ballot(true) is NOT constant folded (depends on active lanes)
+define i64 @test_ballot_constant_true() {
+; CHECK-LABEL: @test_ballot_constant_true(
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; CHECK-NEXT:    ret i64 [[BALLOT]]
+;
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+  ret i64 %ballot
 }
 
-; Negative test - variable condition should not be optimized
-define void @test_ballot_variable_condition(i32 %x) {
+; Test that ballot with variable condition is not optimized
+define i64 @test_ballot_variable_condition(i32 %x) {
 ; CHECK-LABEL: @test_ballot_variable_condition(
-; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret i64 [[BALLOT]]
 ;
-entry:
   %cmp = icmp eq i32 %x, 0
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
+  ret i64 %ballot
 }
 
 declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare i32 @llvm.amdgcn.ballot.i32(i1)
-declare void @llvm.assume(i1)
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index cc87d6542fa12..299f2f1625797 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -104,7 +104,7 @@ define i32 @simple(i32 %a) #1 {
 ; CHECK-LABEL: @simple(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 4
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
-; CHECK-NEXT:    ret i32 [[A]]
+; CHECK-NEXT:    ret i32 4
 ;
   %cmp = icmp eq i32 %a, 4
   tail call void @llvm.assume(i1 %cmp)
@@ -1056,6 +1056,34 @@ define i1 @neg_assume_trunc_eq_one(i8 %x) {
   ret i1 %q
 }
 
+; Test AMDGPU ballot uniformity pattern optimization  
+; This demonstrates that assume(ballot(cmp) == -1) enables the optimization
+; of cmp to true, which then optimizes the branch condition
+define void @assume_ballot_uniform(i32 %x) {
+; CHECK-LABEL: @assume_ballot_uniform(
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare void @use(i1)
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 

>From d9d12cd23f5dc1124125223333a68add50ba4fb6 Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Thu, 2 Oct 2025 11:51:43 +0530
Subject: [PATCH 4/9] [InstCombine] Add focused assume-based optimizations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit implements two targeted optimizations for assume intrinsics:

1. Basic equality optimization: assume(x == c) replaces dominated uses of x with c
2. AMDGPU ballot optimization: assume(ballot(cmp) == -1) replaces dominated
   uses of cmp with true, since ballot == -1 means cmp is true on all active lanes

Key design principles:
- No uniformity analysis concepts - uses simple mathematical facts
- Dominance-based replacement for correctness
- Clean pattern matching without complex framework
- Addresses reviewer feedback to keep it simple and focused

Examples:
  assume(x == 42); use = add x, 1  →  use = 43
  assume(ballot(cmp) == -1); br cmp  →  br true

This enables better optimization of GPU code patterns while remaining
architecture-agnostic, since it relies only on the mathematical properties
of the operations.
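
In IR, the first example is roughly (a sketch; value names are illustrative):

  %c = icmp eq i32 %x, 42
  call void @llvm.assume(i1 %c)
  %r = add i32 %x, 1   ; the dominated use of %x becomes 42, so %r folds to 43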
---
 .../InstCombine/InstCombineCalls.cpp          | 191 +++++++-----------
 .../InstCombine/InstCombineInternal.h         |   3 -
 llvm/test/Transforms/InstCombine/assume.ll    |   9 +-
 3 files changed, 79 insertions(+), 124 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 663df634f4633..e29b7294b3b02 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3549,6 +3549,79 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
+    // Basic assume equality optimization: assume(x == c) -> replace dominated uses of x with c
+    if (auto *ICmp = dyn_cast<ICmpInst>(IIOperand)) {
+      if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) {
+        Value *LHS = ICmp->getOperand(0);
+        Value *RHS = ICmp->getOperand(1);
+        Value *Variable = nullptr;
+        Constant *ConstantVal = nullptr;
+        
+        if (auto *C = dyn_cast<Constant>(RHS)) {
+          Variable = LHS;
+          ConstantVal = C;
+        } else if (auto *C = dyn_cast<Constant>(LHS)) {
+          Variable = RHS;
+          ConstantVal = C;
+        }
+        
+        if (Variable && ConstantVal && Variable->hasUseList()) {
+          SmallVector<Use *, 8> DominatedUses;
+          for (Use &U : Variable->uses()) {
+            if (auto *UseInst = dyn_cast<Instruction>(U.getUser())) {
+              if (UseInst != II && UseInst != ICmp &&
+                  isValidAssumeForContext(II, UseInst, &DT)) {
+                DominatedUses.push_back(&U);
+              }
+            }
+          }
+          
+          for (Use *U : DominatedUses) {
+            U->set(ConstantVal);
+            Worklist.pushValue(U->getUser());
+          }
+          
+          if (!DominatedUses.empty()) {
+            Worklist.pushValue(Variable);
+          }
+        }
+      }
+    }
+
+    // Optimize AMDGPU ballot patterns in assumes:
+    // assume(ballot(cmp) == -1) means cmp is true on all active lanes
+    // We can replace uses of cmp with true in dominated contexts
+    Value *BallotInst;
+    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst), m_AllOnes()))) {
+      if (auto *IntrCall = dyn_cast<IntrinsicInst>(BallotInst)) {
+        if (IntrCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
+          Value *BallotArg = IntrCall->getArgOperand(0);
+          if (BallotArg->getType()->isIntegerTy(1) && BallotArg->hasUseList()) {
+            // Find dominated uses and replace with true
+            SmallVector<Use *, 8> DominatedUses;
+            for (Use &U : BallotArg->uses()) {
+              if (auto *UseInst = dyn_cast<Instruction>(U.getUser())) {
+                if (UseInst != II && UseInst != IntrCall &&
+                    isValidAssumeForContext(II, UseInst, &DT)) {
+                  DominatedUses.push_back(&U);
+                }
+              }
+            }
+            
+            // Replace dominated uses with true
+            for (Use *U : DominatedUses) {
+              U->set(ConstantInt::getTrue(BallotArg->getType()));
+              Worklist.pushValue(U->getUser());
+            }
+            
+            if (!DominatedUses.empty()) {
+              Worklist.pushValue(BallotArg);
+            }
+          }
+        }
+      }
+    }
+
     // If there is a dominating assume with the same condition as this one,
     // then this one is redundant, and should be removed.
     KnownBits Known(1);
@@ -3562,10 +3635,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       return eraseInstFromFunction(*II);
     }
 
-    // Try to extract uniformity information from the assume and optimize
-    // dominated uses of any variables that are established as uniform.
-    optimizeAssumedUniformValues(cast<AssumeInst>(II));
-
     // Update the cache of affected values for this assumption (we might be
     // here because we just simplified the condition).
     AC.updateAffectedValues(cast<AssumeInst>(II));
@@ -5031,116 +5100,4 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call,
   return &Call;
 }
 
-/// Extract uniformity information from assume and optimize dominated uses.
-/// This works with any assume pattern that establishes value uniformity.
-void InstCombinerImpl::optimizeAssumedUniformValues(AssumeInst *Assume) {
-  Value *AssumedCondition = Assume->getArgOperand(0);
-  
-  // Map of uniform values to their uniform constants
-  SmallDenseMap<Value *, Constant *> UniformValues;
-  
-  // Pattern 1: assume(icmp eq (X, C)) -> X is uniform and equals C
-  if (auto *ICmp = dyn_cast<ICmpInst>(AssumedCondition)) {
-    if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) {
-      Value *LHS = ICmp->getOperand(0);
-      Value *RHS = ICmp->getOperand(1);
-      
-      // X == constant -> X is uniform and equals constant
-      if (auto *C = dyn_cast<Constant>(RHS)) {
-        UniformValues[LHS] = C;
-      } else if (auto *C = dyn_cast<Constant>(LHS)) {
-        UniformValues[RHS] = C;
-      }
-      
-      // Handle intrinsic patterns in equality comparisons
-      // Pattern: assume(ballot(cmp) == -1) -> cmp is uniform and true
-      if (auto *IntrinsicCall = dyn_cast<IntrinsicInst>(LHS)) {
-        if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
-          if (match(RHS, m_AllOnes())) {
-            Value *BallotArg = IntrinsicCall->getArgOperand(0);
-            if (BallotArg->getType()->isIntegerTy(1)) {
-              UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType());
-              
-              // Special case: if BallotArg is an equality comparison, 
-              // we know the operands are equal
-              if (auto *CmpInst = dyn_cast<ICmpInst>(BallotArg)) {
-                if (CmpInst->getPredicate() == ICmpInst::ICMP_EQ) {
-                  Value *CmpLHS = CmpInst->getOperand(0);
-                  Value *CmpRHS = CmpInst->getOperand(1);
-                  
-                  // If one operand is constant, the other is uniform and equals that constant
-                  if (auto *C = dyn_cast<Constant>(CmpRHS)) {
-                    UniformValues[CmpLHS] = C;
-                  } else if (auto *C = dyn_cast<Constant>(CmpLHS)) {
-                    UniformValues[CmpRHS] = C;
-                  }
-                  // TODO: Handle case where both operands are variables
-                }
-              }
-            }
-          }
-        } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) {
-          // assume(readfirstlane(x) == c) -> x is uniform and equals c
-          if (auto *C = dyn_cast<Constant>(RHS)) {
-            Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0);
-            UniformValues[ReadFirstLaneArg] = C;
-          }
-        }
-      }
-      
-      // Handle the reverse case too
-      if (auto *IntrinsicCall = dyn_cast<IntrinsicInst>(RHS)) {
-        if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
-          if (match(LHS, m_AllOnes())) {
-            Value *BallotArg = IntrinsicCall->getArgOperand(0);
-            if (BallotArg->getType()->isIntegerTy(1)) {
-              UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType());
-            }
-          }
-        } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) {
-          if (auto *C = dyn_cast<Constant>(LHS)) {
-            Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0);
-            UniformValues[ReadFirstLaneArg] = C;
-          }
-        }
-      }
-    }
-  }
-  
-  // Pattern 2: assume(X) where X is i1 -> X is uniform and equals true  
-  if (AssumedCondition->getType()->isIntegerTy(1) && !isa<ICmpInst>(AssumedCondition)) {
-    UniformValues[AssumedCondition] = ConstantInt::getTrue(AssumedCondition->getType());
-  }
-  
-  // Now optimize dominated uses of all discovered uniform values
-  for (auto &[UniformValue, UniformConstant] : UniformValues) {
-    SmallVector<Use *, 8> DominatedUses;
-    
-    // Find all uses dominated by the assume
-    // Skip if the value doesn't have a use list (e.g., constants)
-    if (!UniformValue->hasUseList())
-      continue;
-      
-    for (Use &U : UniformValue->uses()) {
-      Instruction *UseInst = dyn_cast<Instruction>(U.getUser());
-      if (!UseInst || UseInst == Assume)
-        continue;
-        
-      // Critical: Check dominance using InstCombine's infrastructure  
-      if (isValidAssumeForContext(Assume, UseInst, &DT)) {
-        DominatedUses.push_back(&U);
-      }
-    }
-    
-    // Replace only dominated uses with the uniform constant
-    for (Use *U : DominatedUses) {
-      U->set(UniformConstant);
-      Worklist.pushValue(U->getUser());
-    }
-    
-    // Mark for further optimization if we made changes
-    if (!DominatedUses.empty()) {
-      Worklist.pushValue(UniformValue);
-    }
-  }
-}
+
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 17c0f0a46f87f..9bdd8cb71f7f3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -230,9 +230,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
 private:
   bool annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI);
   bool isDesirableIntType(unsigned BitWidth) const;
-  
-  /// Optimize uses of variables that are established as uniform by assume intrinsics.
-  void optimizeAssumedUniformValues(AssumeInst *Assume);
   bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
   bool shouldChangeType(Type *From, Type *To) const;
   Value *dyn_castNegVal(Value *V) const;
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index 299f2f1625797..bc0dd4398b5d6 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -1056,12 +1056,13 @@ define i1 @neg_assume_trunc_eq_one(i8 %x) {
   ret i1 %q
 }
 
-; Test AMDGPU ballot uniformity pattern optimization  
-; This demonstrates that assume(ballot(cmp) == -1) enables the optimization
-; of cmp to true, which then optimizes the branch condition
+; Test AMDGPU ballot pattern optimization  
+; assume(ballot(cmp) == -1) means cmp is true on all active lanes
+; so dominated uses of cmp can be replaced with true
 define void @assume_ballot_uniform(i32 %x) {
 ; CHECK-LABEL: @assume_ballot_uniform(
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
 ; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
 ; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]

>From 9e3b4ac23d3680255e38b285b380adbe1f07f5a6 Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Sun, 5 Oct 2025 13:51:02 +0530
Subject: [PATCH 5/9] Address @ssahasra's review feedback

- Remove 'dominated' terminology from comments and variable names
  (SSA values always dominate their uses)
- Rename DominatedUses -> Uses throughout
- Remove redundant UseInst != II check in ICmp block
- Fix code formatting (clang-format)
- Split long comment lines
- Remove extra blank lines at EOF
---
 .../InstCombine/InstCombineCalls.cpp          | 46 +++++++++----------
 llvm/test/Transforms/InstCombine/assume.ll    |  4 +-
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index e29b7294b3b02..7d585f63383fd 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3549,14 +3549,15 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
-    // Basic assume equality optimization: assume(x == c) -> replace dominated uses of x with c
+    // Basic assume equality optimization: assume(x == c) -> replace uses of x
+    // with c
     if (auto *ICmp = dyn_cast<ICmpInst>(IIOperand)) {
       if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) {
         Value *LHS = ICmp->getOperand(0);
         Value *RHS = ICmp->getOperand(1);
         Value *Variable = nullptr;
         Constant *ConstantVal = nullptr;
-        
+
         if (auto *C = dyn_cast<Constant>(RHS)) {
           Variable = LHS;
           ConstantVal = C;
@@ -3564,24 +3565,24 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
           Variable = RHS;
           ConstantVal = C;
         }
-        
+
         if (Variable && ConstantVal && Variable->hasUseList()) {
-          SmallVector<Use *, 8> DominatedUses;
+          SmallVector<Use *, 8> Uses;
           for (Use &U : Variable->uses()) {
             if (auto *UseInst = dyn_cast<Instruction>(U.getUser())) {
-              if (UseInst != II && UseInst != ICmp &&
+              if (UseInst != ICmp &&
                   isValidAssumeForContext(II, UseInst, &DT)) {
-                DominatedUses.push_back(&U);
+                Uses.push_back(&U);
               }
             }
           }
-          
-          for (Use *U : DominatedUses) {
+
+          for (Use *U : Uses) {
             U->set(ConstantVal);
             Worklist.pushValue(U->getUser());
           }
-          
-          if (!DominatedUses.empty()) {
+
+          if (!Uses.empty()) {
             Worklist.pushValue(Variable);
           }
         }
@@ -3590,31 +3591,32 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
 
     // Optimize AMDGPU ballot patterns in assumes:
     // assume(ballot(cmp) == -1) means cmp is true on all active lanes
-    // We can replace uses of cmp with true in dominated contexts
+    // We can replace uses of cmp with true
     Value *BallotInst;
-    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst), m_AllOnes()))) {
+    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst),
+                                        m_AllOnes()))) {
       if (auto *IntrCall = dyn_cast<IntrinsicInst>(BallotInst)) {
         if (IntrCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
           Value *BallotArg = IntrCall->getArgOperand(0);
           if (BallotArg->getType()->isIntegerTy(1) && BallotArg->hasUseList()) {
-            // Find dominated uses and replace with true
-            SmallVector<Use *, 8> DominatedUses;
+            // Find uses and replace with true
+            SmallVector<Use *, 8> Uses;
             for (Use &U : BallotArg->uses()) {
               if (auto *UseInst = dyn_cast<Instruction>(U.getUser())) {
-                if (UseInst != II && UseInst != IntrCall &&
+                if (UseInst != IntrCall &&
                     isValidAssumeForContext(II, UseInst, &DT)) {
-                  DominatedUses.push_back(&U);
+                  Uses.push_back(&U);
                 }
               }
             }
-            
-            // Replace dominated uses with true
-            for (Use *U : DominatedUses) {
+
+            // Replace uses with true
+            for (Use *U : Uses) {
               U->set(ConstantInt::getTrue(BallotArg->getType()));
               Worklist.pushValue(U->getUser());
             }
-            
-            if (!DominatedUses.empty()) {
+
+            if (!Uses.empty()) {
               Worklist.pushValue(BallotArg);
             }
           }
@@ -5099,5 +5101,3 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call,
   Call.setCalledFunction(FTy, NestF);
   return &Call;
 }
-
-
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index bc0dd4398b5d6..3279986121e34 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -1056,9 +1056,9 @@ define i1 @neg_assume_trunc_eq_one(i8 %x) {
   ret i1 %q
 }
 
-; Test AMDGPU ballot pattern optimization  
+; Test AMDGPU ballot pattern optimization
 ; assume(ballot(cmp) == -1) means cmp is true on all active lanes
-; so dominated uses of cmp can be replaced with true
+; so uses of cmp can be replaced with true
 define void @assume_ballot_uniform(i32 %x) {
 ; CHECK-LABEL: @assume_ballot_uniform(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0

>From e49679eeaf482ed98adb905503f40c90df37bf94 Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Tue, 7 Oct 2025 23:08:11 +0530
Subject: [PATCH 6/9] Address feedback on the location of the opt

- Remove redundant constant propagation (assume equality opt) from InstCombine.
- Moved assume(ballot(cmp) == -1) optimization from InstCombine to GVN.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     | 16 ++--
 .../InstCombine/InstCombineCalls.cpp          | 75 -------------------
 llvm/lib/Transforms/Scalar/GVN.cpp            |  1 +
 llvm/test/Transforms/GVN/assume-equal.ll      | 54 +++++++++++++
 .../amdgpu-ballot-constant-fold.ll            | 56 --------------
 llvm/test/Transforms/InstCombine/assume.ll    | 31 +-------
 6 files changed, 62 insertions(+), 171 deletions(-)
 delete mode 100644 llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index fc4e64fcd52a1..4fe5d00679436 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1322,7 +1322,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     if (isa<PoisonValue>(Arg))
       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
 
-    // For Wave32 targets, convert i64 ballot to i32 ballot + zext
+    if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
+      if (Src->isZero()) {
+        // amdgcn.ballot(i1 0) is zero.
+        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
+      }
+    }
     if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
       // %b64 = call i64 ballot.i64(...)
       // =>
@@ -1336,15 +1341,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       Call->takeName(&II);
       return IC.replaceInstUsesWith(II, Call);
     }
-
-    if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
-      if (Src->isZero()) {
-        // amdgcn.ballot(i1 0) is zero.
-        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
-      }
-      // Note: ballot(true) is NOT constant folded because the result depends
-      // on the active lanes in the wavefront, not just the condition value.
-    }
     break;
   }
   case Intrinsic::amdgcn_wavefrontsize: {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 7d585f63383fd..92fca90ddb88a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3549,81 +3549,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
 
-    // Basic assume equality optimization: assume(x == c) -> replace uses of x
-    // with c
-    if (auto *ICmp = dyn_cast<ICmpInst>(IIOperand)) {
-      if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) {
-        Value *LHS = ICmp->getOperand(0);
-        Value *RHS = ICmp->getOperand(1);
-        Value *Variable = nullptr;
-        Constant *ConstantVal = nullptr;
-
-        if (auto *C = dyn_cast<Constant>(RHS)) {
-          Variable = LHS;
-          ConstantVal = C;
-        } else if (auto *C = dyn_cast<Constant>(LHS)) {
-          Variable = RHS;
-          ConstantVal = C;
-        }
-
-        if (Variable && ConstantVal && Variable->hasUseList()) {
-          SmallVector<Use *, 8> Uses;
-          for (Use &U : Variable->uses()) {
-            if (auto *UseInst = dyn_cast<Instruction>(U.getUser())) {
-              if (UseInst != ICmp &&
-                  isValidAssumeForContext(II, UseInst, &DT)) {
-                Uses.push_back(&U);
-              }
-            }
-          }
-
-          for (Use *U : Uses) {
-            U->set(ConstantVal);
-            Worklist.pushValue(U->getUser());
-          }
-
-          if (!Uses.empty()) {
-            Worklist.pushValue(Variable);
-          }
-        }
-      }
-    }
-
-    // Optimize AMDGPU ballot patterns in assumes:
-    // assume(ballot(cmp) == -1) means cmp is true on all active lanes
-    // We can replace uses of cmp with true
-    Value *BallotInst;
-    if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst),
-                                        m_AllOnes()))) {
-      if (auto *IntrCall = dyn_cast<IntrinsicInst>(BallotInst)) {
-        if (IntrCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
-          Value *BallotArg = IntrCall->getArgOperand(0);
-          if (BallotArg->getType()->isIntegerTy(1) && BallotArg->hasUseList()) {
-            // Find uses and replace with true
-            SmallVector<Use *, 8> Uses;
-            for (Use &U : BallotArg->uses()) {
-              if (auto *UseInst = dyn_cast<Instruction>(U.getUser())) {
-                if (UseInst != IntrCall &&
-                    isValidAssumeForContext(II, UseInst, &DT)) {
-                  Uses.push_back(&U);
-                }
-              }
-            }
-
-            // Replace uses with true
-            for (Use *U : Uses) {
-              U->set(ConstantInt::getTrue(BallotArg->getType()));
-              Worklist.pushValue(U->getUser());
-            }
-
-            if (!Uses.empty()) {
-              Worklist.pushValue(BallotArg);
-            }
-          }
-        }
-      }
-    }
-
     // If there is a dominating assume with the same condition as this one,
     // then this one is redundant, and should be removed.
     KnownBits Known(1);
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 72e1131a54a86..958826aba2699 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -54,6 +54,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
diff --git a/llvm/test/Transforms/GVN/assume-equal.ll b/llvm/test/Transforms/GVN/assume-equal.ll
index a38980169fc52..54e5267e573b3 100644
--- a/llvm/test/Transforms/GVN/assume-equal.ll
+++ b/llvm/test/Transforms/GVN/assume-equal.ll
@@ -462,6 +462,60 @@ define i8 @assume_ptr_eq_same_prov(ptr %p, i64 %x) {
   ret i8 %v
 }
 
+; Test AMDGPU ballot pattern optimization
+; assume(ballot(cmp) == -1) means cmp is true on all active lanes
+; so uses of cmp can be replaced with true
+define void @assume_ballot_const(i32 %x) {
+; CHECK-LABEL: @assume_ballot_const(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+define void @assume_ballot_exec_mask(i32 %x, i64 %exec_mask) {
+; CHECK-LABEL: @assume_ballot_exec_mask(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], %exec_mask
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, %exec_mask
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare noalias ptr @_Znwm(i64)
 declare void @_ZN1AC1Ev(ptr)
 declare void @llvm.assume(i1)
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
deleted file mode 100644
index b146487af9990..0000000000000
--- a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S | FileCheck %s
-
-; Test cases for optimizing AMDGPU ballot intrinsics
-; Focus on constant folding ballot(false) -> 0 and poison handling
-
-; Test ballot with constant false condition gets folded
-define i32 @test_ballot_constant_false() {
-; CHECK-LABEL: @test_ballot_constant_false(
-; CHECK-NEXT:    ret i32 0
-;
-  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 false)
-  ret i32 %ballot
-}
-
-; Test ballot.i64 with constant false condition gets folded
-define i64 @test_ballot_i64_constant_false() {
-; CHECK-LABEL: @test_ballot_i64_constant_false(
-; CHECK-NEXT:    ret i64 0
-;
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 false)
-  ret i64 %ballot
-}
-
-; Test ballot with poison condition gets folded to poison
-define i64 @test_ballot_poison() {
-; CHECK-LABEL: @test_ballot_poison(
-; CHECK-NEXT:    ret i64 poison
-;
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 poison)
-  ret i64 %ballot
-}
-
-; Test that ballot(true) is NOT constant folded (depends on active lanes)
-define i64 @test_ballot_constant_true() {
-; CHECK-LABEL: @test_ballot_constant_true(
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; CHECK-NEXT:    ret i64 [[BALLOT]]
-;
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-  ret i64 %ballot
-}
-
-; Test that ballot with variable condition is not optimized
-define i64 @test_ballot_variable_condition(i32 %x) {
-; CHECK-LABEL: @test_ballot_variable_condition(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    ret i64 [[BALLOT]]
-;
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  ret i64 %ballot
-}
-
-declare i64 @llvm.amdgcn.ballot.i64(i1)
-declare i32 @llvm.amdgcn.ballot.i32(i1)
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index 3279986121e34..cc87d6542fa12 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -104,7 +104,7 @@ define i32 @simple(i32 %a) #1 {
 ; CHECK-LABEL: @simple(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 4
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
-; CHECK-NEXT:    ret i32 4
+; CHECK-NEXT:    ret i32 [[A]]
 ;
   %cmp = icmp eq i32 %a, 4
   tail call void @llvm.assume(i1 %cmp)
@@ -1056,35 +1056,6 @@ define i1 @neg_assume_trunc_eq_one(i8 %x) {
   ret i1 %q
 }
 
-; Test AMDGPU ballot pattern optimization
-; assume(ballot(cmp) == -1) means cmp is true on all active lanes
-; so uses of cmp can be replaced with true
-define void @assume_ballot_uniform(i32 %x) {
-; CHECK-LABEL: @assume_ballot_uniform(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare void @use(i1)
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 

>From ccf712b7c2f41e2c24843e57c194b6f499860ab8 Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Fri, 10 Oct 2025 18:35:00 +0530
Subject: [PATCH 7/9] Refactored the ballot optimization condition into the
 propagateEquality method
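
GVN's propagateEquality walks a worklist of (value, replacement) pairs; the
new check pushes (cmp, true) when the ballot result is compared for equality
against -1 or against a non-constant candidate exec mask. A minimal IR
sketch, mirroring the assume_ballot_exec_mask test:

  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  %all = icmp eq i64 %ballot, %exec_mask
  call void @llvm.assume(i1 %all)
  ; the hook queues %cmp == true, so uses of %cmp in scope fold to true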

---
 llvm/lib/Transforms/Scalar/GVN.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 958826aba2699..4290d68d9cac6 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2540,6 +2540,21 @@ bool GVNPass::propagateEquality(
       }
     }
 
+    // If "ballot(cond) == -1" or "ballot(cond) == exec_mask" then cond is true
+    // on all active lanes, so cond can be replaced with true.
+    if (IntrinsicInst *IntrCall = dyn_cast<IntrinsicInst>(LHS)) {
+      if (IntrCall->getIntrinsicID() ==
+          Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
+        Value *BallotArg = IntrCall->getArgOperand(0);
+        if (BallotArg->getType()->isIntegerTy(1) &&
+            (match(RHS, m_AllOnes()) || !isa<Constant>(RHS))) {
+          Worklist.push_back(std::make_pair(
+              BallotArg, ConstantInt::getTrue(BallotArg->getType())));
+          continue;
+        }
+      }
+    }
+
     // Now try to deduce additional equalities from this one. For example, if
     // the known equality was "(A != B)" == "false" then it follows that A and B
     // are equal in the scope. Only boolean equalities with an explicit true or

>From d3b966581cf38f028e04dd97ff2383171db991a9 Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Fri, 31 Oct 2025 11:53:13 +0530
Subject: [PATCH 8/9] Implement reviewer's suggestions: 1. Add logic to
 handle swapped operands in icmp. 2. Introduce preliminary logic for
 identifying an exec mask. 3. Add a separate test file with comprehensive
 cases for ballot with assume
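
The exec mask is identified, for now, only as ballot(i1 true), i.e. a ballot
over all currently active lanes. A minimal IR sketch of the newly recognized
form (the ballot(cmp) == 0 case folds uses of cmp to false in the same way):

  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  %exec = call i64 @llvm.amdgcn.ballot.i64(i1 true)
  %all = icmp eq i64 %ballot, %exec
  call void @llvm.assume(i1 %all)
  ; %cmp holds on every active lane, so its uses fold to true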

---
 .github/copilot-instructions.md           |   4 -
 llvm/lib/Transforms/Scalar/GVN.cpp        |  58 ++-
 llvm/test/Transforms/GVN/assume-ballot.ll | 445 ++++++++++++++++++++++
 llvm/test/Transforms/GVN/assume-equal.ll  |  54 ---
 4 files changed, 495 insertions(+), 66 deletions(-)
 delete mode 100644 .github/copilot-instructions.md
 create mode 100644 llvm/test/Transforms/GVN/assume-ballot.ll

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
deleted file mode 100644
index 03748938700e3..0000000000000
--- a/.github/copilot-instructions.md
+++ /dev/null
@@ -1,4 +0,0 @@
-When performing a code review, pay close attention to code modifying a function's
-control flow. Could the change result in the corruption of performance profile
-data? Could the change result in invalid debug information, in particular for
-branches and calls?
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 4290d68d9cac6..081f108cbe71a 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2540,18 +2540,60 @@ bool GVNPass::propagateEquality(
       }
     }
 
-    // If "ballot(cond) == -1" or "ballot(cond) == exec_mask" then cond is true
-    // on all active lanes, so cond can be replaced with true.
-    if (IntrinsicInst *IntrCall = dyn_cast<IntrinsicInst>(LHS)) {
-      if (IntrCall->getIntrinsicID() ==
-          Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
-        Value *BallotArg = IntrCall->getArgOperand(0);
-        if (BallotArg->getType()->isIntegerTy(1) &&
-            (match(RHS, m_AllOnes()) || !isa<Constant>(RHS))) {
+    // Helper function to check if a value represents the current exec mask.
+    auto IsExecMask = [](Value *V) -> bool {
+      // Pattern 1: ballot(true)
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(V)) {
+        if (II->getIntrinsicID() == Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
+          // Check if argument is constant true
+          if (match(II->getArgOperand(0), m_One())) {
+            return true;
+          }
+        }
+      }
+
+      return false;
+    };
+
+    // Check if either of the operands is a ballot intrinsic.
+    IntrinsicInst *BallotCall = nullptr;
+    Value *CompareValue = nullptr;
+
+    // Check both LHS and RHS for ballot intrinsic and its value since GVN may
+    // swap the operands.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(LHS)) {
+      if (II->getIntrinsicID() == Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
+        BallotCall = II;
+        CompareValue = RHS;
+      }
+    }
+    if (!BallotCall && isa<IntrinsicInst>(RHS)) {
+      IntrinsicInst *II = cast<IntrinsicInst>(RHS);
+      if (II->getIntrinsicID() == Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
+        BallotCall = II;
+        CompareValue = LHS;
+      }
+    }
+
+    // If a ballot intrinsic is found, calculate the truth value of the ballot
+    // argument based on the RHS.
+    if (BallotCall) {
+      Value *BallotArg = BallotCall->getArgOperand(0);
+      if (BallotArg->getType()->isIntegerTy(1)) {
+        // Case 1: ballot(cond) == -1: cond true in all lanes -> cond = true.
+        // Case 2: ballot(cond) == exec_mask: cond true in all active lanes ->
+        // cond = true.
+        if (match(CompareValue, m_AllOnes()) || IsExecMask(CompareValue)) {
           Worklist.push_back(std::make_pair(
               BallotArg, ConstantInt::getTrue(BallotArg->getType())));
           continue;
         }
+        // Case 3: ballot(cond) == 0: cond false in all lanes -> cond = false.
+        if (match(CompareValue, m_Zero())) {
+          Worklist.push_back(std::make_pair(
+              BallotArg, ConstantInt::getFalse(BallotArg->getType())));
+          continue;
+        }
       }
     }
 
diff --git a/llvm/test/Transforms/GVN/assume-ballot.ll b/llvm/test/Transforms/GVN/assume-ballot.ll
new file mode 100644
index 0000000000000..5bc605ce11a99
--- /dev/null
+++ b/llvm/test/Transforms/GVN/assume-ballot.ll
@@ -0,0 +1,445 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=gvn -S | FileCheck %s
+;
+; Tests for assume-based ballot optimizations
+; This optimization recognizes patterns like:
+;   assume(ballot(cmp) == -1) -> cmp is true on all lanes
+;   assume(ballot(cmp) == 0)  -> cmp is false on all lanes
+
+declare void @llvm.assume(i1)
+declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i32 @llvm.amdgcn.ballot.i32(i1)
+
+; ============================================================================
+; POSITIVE CASES
+; ============================================================================
+
+; Test 1: assume(ballot(cmp) == -1) -> cmp replaced with true
+define amdgpu_kernel void @assume_ballot_all_lanes_i64(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_all_lanes_i64(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  store i32 1, ptr addrspace(1) %out
+  ret void
+bar:
+  store i32 0, ptr addrspace(1) %out
+  ret void
+}
+
+; Test 2: assume(ballot(cmp) == 0) -> cmp replaced with false
+define amdgpu_kernel void @assume_ballot_no_lanes_i64(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_no_lanes_i64(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[NONE:%.*]] = icmp eq i64 [[BALLOT]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[NONE]])
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %none = icmp eq i64 %ballot, 0
+  call void @llvm.assume(i1 %none)
+  %sel = select i1 %cmp, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test 3: ballot(cmp) == ballot(true) -> cmp replaced with true (EXEC MASK)
+define amdgpu_kernel void @assume_ballot_exec_mask_ballot_true(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_exec_mask_ballot_true(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[EXEC:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], [[EXEC]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %exec = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+  %all = icmp eq i64 %ballot, %exec
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 4: assume(ballot(cmp) == -1) -> cmp replaced with true (wave32 variant with ballot.i32)
+define amdgpu_kernel void @assume_ballot_all_lanes_i32(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_all_lanes_i32(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i32 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+  %all = icmp eq i32 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 5: assume(ballot(cmp) == 0) -> cmp replaced with false (wave32 variant with ballot.i32)
+define amdgpu_kernel void @assume_ballot_no_lanes_i32(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_no_lanes_i32(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[CMP]])
+; CHECK-NEXT:    [[NONE:%.*]] = icmp eq i32 [[BALLOT]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[NONE]])
+; CHECK-NEXT:    br i1 false, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+  %none = icmp eq i32 %ballot, 0
+  call void @llvm.assume(i1 %none)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 6: assume(ballot(cmp) == -1) -> cmp replaced with true (EXEC MASK - wave32 variant with ballot.i32)
+define amdgpu_kernel void @assume_ballot_exec_mask_wave32(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_exec_mask_wave32(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[CMP]])
+; CHECK-NEXT:    [[EXEC:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 true)
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i32 [[BALLOT]], [[EXEC]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+  %exec = call i32 @llvm.amdgcn.ballot.i32(i1 true)
+  %all = icmp eq i32 %ballot, %exec
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 7: Dominance -> only dominated uses replaced with truth values
+define amdgpu_kernel void @assume_ballot_dominance(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_dominance(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[USE_BEFORE:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    store i32 [[USE_BEFORE]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[OUT2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %use_before = zext i1 %cmp to i32
+  store i32 %use_before, ptr addrspace(1) %out
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  %use_after = zext i1 %cmp to i32
+  %out2 = getelementptr i32, ptr addrspace(1) %out, i64 1
+  store i32 %use_after, ptr addrspace(1) %out2
+  ret void
+}
+
+; Test 8: Swapped operands in icmp -> cmp replaced with true
+define amdgpu_kernel void @assume_ballot_swapped(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 -1, [[BALLOT]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 -1, %ballot
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 9: Swapped operands in icmp -> cmp replaced with true (EXEC MASK)
+define amdgpu_kernel void @assume_ballot_exec_mask_swapped(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_exec_mask_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[EXEC:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[EXEC]], [[BALLOT]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %exec = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+  %all = icmp eq i64 %exec, %ballot
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 10: Multiple uses of cmp after assume -> uses 1 & 2 replaced with truth values
+define amdgpu_kernel void @assume_ballot_multiple_uses(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_multiple_uses(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NEXT:    store i32 10, ptr addrspace(1) [[OUT2]], align 4
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  %use1 = zext i1 %cmp to i32
+  store i32 %use1, ptr addrspace(1) %out
+  %use2 = select i1 %cmp, i32 10, i32 20
+  %out2 = getelementptr i32, ptr addrspace(1) %out, i64 1
+  store i32 %use2, ptr addrspace(1) %out2
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 11: Multiple uses of cmp after assume -> uses 1 & 2 replaced with truth values (EXEC MASK)
+define amdgpu_kernel void @assume_ballot_exec_mask_multiple_uses(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_exec_mask_multiple_uses(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[EXEC:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], [[EXEC]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT:    [[USE1:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    store i32 [[USE1]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[USE2:%.*]] = select i1 [[CMP]], i32 10, i32 20
+; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NEXT:    store i32 [[USE2]], ptr addrspace(1) [[OUT2]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %exec = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+  %all = icmp eq i64 %ballot, %exec
+  call void @llvm.assume(i1 %all)
+  %use1 = zext i1 %cmp to i32
+  store i32 %use1, ptr addrspace(1) %out
+  %use2 = select i1 %cmp, i32 10, i32 20
+  %out2 = getelementptr i32, ptr addrspace(1) %out, i64 1
+  store i32 %use2, ptr addrspace(1) %out2
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; ============================================================================
+; NEGATIVE CASES
+; ============================================================================
+
+; Test 1: assume(ballot != -1) -> cmp should not be transformed (cmp is false in at least one lane)
+define amdgpu_kernel void @assume_ballot_ne_negative(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_ne_negative(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[NOT_ALL:%.*]] = icmp ne i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[NOT_ALL]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %not_all = icmp ne i64 %ballot, -1
+  call void @llvm.assume(i1 %not_all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 2: assume(ballot != 0) -> cmp should not be transformed (cmp is true in at least one lane)
+define amdgpu_kernel void @assume_ballot_ne_zero_negative(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_ne_zero_negative(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[SOME:%.*]] = icmp ne i64 [[BALLOT]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[SOME]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %some = icmp ne i64 %ballot, 0
+  call void @llvm.assume(i1 %some)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 3: ballot(cmp) == ballot(false) -> cmp should not be transformed (RHS is not EXEC MASK)
+define amdgpu_kernel void @assume_ballot_not_exec_mask(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_not_exec_mask(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[NOT_EXEC:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 false)
+; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i64 [[BALLOT]], [[NOT_EXEC]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[MATCHES]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %not_exec = call i64 @llvm.amdgcn.ballot.i64(i1 false)
+  %matches = icmp eq i64 %ballot, %not_exec
+  call void @llvm.assume(i1 %matches)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 4: Constant as mask value (other than -1 or 0) -> cmp should not be transformed
+define amdgpu_kernel void @assume_ballot_constant_mask(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_constant_mask(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i64 [[BALLOT]], 255
+; CHECK-NEXT:    call void @llvm.assume(i1 [[MATCHES]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %matches = icmp eq i64 %ballot, 255  ; partial mask
+  call void @llvm.assume(i1 %matches)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
+
+; Test 5: Arbitrary mask -> cmp should not be transformed
+define amdgpu_kernel void @assume_ballot_arbitrary_mask(i32 %x, i64 %mask, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_arbitrary_mask(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i64 [[BALLOT]], [[MASK:%.*]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[MATCHES]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK:       foo:
+; CHECK-NEXT:    ret void
+; CHECK:       bar:
+; CHECK-NEXT:    ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %matches = icmp eq i64 %ballot, %mask
+  call void @llvm.assume(i1 %matches)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+bar:
+  ret void
+}
diff --git a/llvm/test/Transforms/GVN/assume-equal.ll b/llvm/test/Transforms/GVN/assume-equal.ll
index 54e5267e573b3..a38980169fc52 100644
--- a/llvm/test/Transforms/GVN/assume-equal.ll
+++ b/llvm/test/Transforms/GVN/assume-equal.ll
@@ -462,60 +462,6 @@ define i8 @assume_ptr_eq_same_prov(ptr %p, i64 %x) {
   ret i8 %v
 }
 
-; Test AMDGPU ballot pattern optimization
-; assume(ballot(cmp) == -1) means cmp is true on all active lanes
-; so uses of cmp can be replaced with true
-define void @assume_ballot_const(i32 %x) {
-; CHECK-LABEL: @assume_ballot_const(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-define void @assume_ballot_exec_mask(i32 %x, i64 %exec_mask) {
-; CHECK-LABEL: @assume_ballot_exec_mask(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], %exec_mask
-; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK:       foo:
-; CHECK-NEXT:    ret void
-; CHECK:       bar:
-; CHECK-NEXT:    ret void
-;
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, %exec_mask
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare noalias ptr @_Znwm(i64)
 declare void @_ZN1AC1Ev(ptr)
 declare void @llvm.assume(i1)

>From b037aa665dbc01bde1122f72a1a3da4a1a1e5dd4 Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari <Teja.Alaghari at amd.com>
Date: Mon, 10 Nov 2025 14:44:19 +0530
Subject: [PATCH 9/9] Moved the assume-based ballot folding logic to
 AMDGPUInstCombineIntrinsic.cpp
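
The InstCombine version queries the AssumptionCache for assumes affecting
the ballot call and rewrites only those uses of the condition that are
dominated by the assume. A minimal IR sketch, matching the dominance test
in this patch:

  %use_before = zext i1 %cmp to i32 ; not dominated by the assume: kept
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  %all = icmp eq i64 %ballot, -1
  call void @llvm.assume(i1 %all)
  %use_after = zext i1 %cmp to i32 ; dominated: %cmp folds to true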

---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     | 53 +++++++++++
 llvm/lib/Transforms/Scalar/GVN.cpp            | 58 ------------
 .../AMDGPU/llvm.amdgcn.ballot-assume.ll}      | 92 +++++++++----------
 3 files changed, 97 insertions(+), 106 deletions(-)
 rename llvm/test/Transforms/{GVN/assume-ballot.ll => InstCombine/AMDGPU/llvm.amdgcn.ballot-assume.ll} (88%)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 4fe5d00679436..fd8be2de4e4f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -18,6 +18,7 @@
 #include "AMDGPUTargetTransformInfo.h"
 #include "GCNSubtarget.h"
 #include "llvm/ADT/FloatingPointMode.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
@@ -1341,6 +1342,58 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       Call->takeName(&II);
       return IC.replaceInstUsesWith(II, Call);
     }
+
+    // Fold ballot intrinsic based on llvm.assume hint about the result.
+    //
+    // assume(ballot(x) == ballot(i1 true)) -> x = true
+    // assume(ballot(x) == -1)              -> x = true
+    // assume(ballot(x) == 0)               -> x = false
+    if (Arg->getType()->isIntegerTy(1)) {
+      for (auto &AssumeVH : IC.getAssumptionCache().assumptionsFor(&II)) {
+        if (!AssumeVH)
+          continue;
+
+        auto *Assume = cast<AssumeInst>(AssumeVH);
+        Value *Cond = Assume->getArgOperand(0);
+
+        // Check if assume condition is an equality comparison.
+        auto *ICI = dyn_cast<ICmpInst>(Cond);
+        if (!ICI || ICI->getPredicate() != ICmpInst::ICMP_EQ)
+          continue;
+
+        // Extract the ballot and the value being compared against it.
+        Value *LHS = ICI->getOperand(0), *RHS = ICI->getOperand(1);
+        Value *CompareValue = (LHS == &II) ? RHS : (RHS == &II) ? LHS : nullptr;
+        if (!CompareValue)
+          continue;
+
+        // Determine the constant value of the ballot's condition argument.
+        std::optional<bool> PropagatedBool;
+        if (match(CompareValue, m_AllOnes()) ||
+            match(CompareValue,
+                  m_Intrinsic<Intrinsic::amdgcn_ballot>(m_One()))) {
+          // ballot(x) == -1 or ballot(x) == ballot(true) means x is true.
+          PropagatedBool = true;
+        } else if (match(CompareValue, m_Zero())) {
+          // ballot(x) == 0 means x is false.
+          PropagatedBool = false;
+        }
+
+        if (!PropagatedBool)
+          continue;
+
+        Constant *PropagatedValue =
+            ConstantInt::getBool(Arg->getContext(), *PropagatedBool);
+
+        // Replace dominated uses of the ballot's condition argument with the
+        // propagated value.
+        Arg->replaceUsesWithIf(PropagatedValue, [&](Use &U) {
+          Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
+          return UserInst && IC.getDominatorTree().dominates(Assume, U);
+        });
+      }
+    }
+
     break;
   }
   case Intrinsic::amdgcn_wavefrontsize: {
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 081f108cbe71a..72e1131a54a86 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -54,7 +54,6 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
@@ -2540,63 +2539,6 @@ bool GVNPass::propagateEquality(
       }
     }
 
-    // Helper function to check if a value represents the current exec mask.
-    auto IsExecMask = [](Value *V) -> bool {
-      // Pattern 1: ballot(true)
-      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(V)) {
-        if (II->getIntrinsicID() == Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
-          // Check if argument is constant true
-          if (match(II->getArgOperand(0), m_One())) {
-            return true;
-          }
-        }
-      }
-
-      return false;
-    };
-
-    // Check if either of the operands is a ballot intrinsic.
-    IntrinsicInst *BallotCall = nullptr;
-    Value *CompareValue = nullptr;
-
-    // Check both LHS and RHS for ballot intrinsic and its value since GVN may
-    // swap the operands.
-    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(LHS)) {
-      if (II->getIntrinsicID() == Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
-        BallotCall = II;
-        CompareValue = RHS;
-      }
-    }
-    if (!BallotCall && isa<IntrinsicInst>(RHS)) {
-      IntrinsicInst *II = cast<IntrinsicInst>(RHS);
-      if (II->getIntrinsicID() == Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
-        BallotCall = II;
-        CompareValue = LHS;
-      }
-    }
-
-    // If a ballot intrinsic is found, calculate the truth value of the ballot
-    // argument based on the RHS.
-    if (BallotCall) {
-      Value *BallotArg = BallotCall->getArgOperand(0);
-      if (BallotArg->getType()->isIntegerTy(1)) {
-        // Case 1: ballot(cond) == -1: cond true in all lanes -> cond = true.
-        // Case 2: ballot(cond) == exec_mask: cond true in all active lanes ->
-        // cond = true.
-        if (match(CompareValue, m_AllOnes()) || IsExecMask(CompareValue)) {
-          Worklist.push_back(std::make_pair(
-              BallotArg, ConstantInt::getTrue(BallotArg->getType())));
-          continue;
-        }
-        // Case 3: ballot(cond) == 0: cond false in all lanes -> cond = false.
-        if (match(CompareValue, m_Zero())) {
-          Worklist.push_back(std::make_pair(
-              BallotArg, ConstantInt::getFalse(BallotArg->getType())));
-          continue;
-        }
-      }
-    }
-
     // Now try to deduce additional equalities from this one. For example, if
     // the known equality was "(A != B)" == "false" then it follows that A and B
     // are equal in the scope. Only boolean equalities with an explicit true or
diff --git a/llvm/test/Transforms/GVN/assume-ballot.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.ballot-assume.ll
similarity index 88%
rename from llvm/test/Transforms/GVN/assume-ballot.ll
rename to llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.ballot-assume.ll
index 5bc605ce11a99..b535d3ce8a07c 100644
--- a/llvm/test/Transforms/GVN/assume-ballot.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.ballot-assume.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=gvn -S | FileCheck %s
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S | FileCheck %s
 ;
-; Tests for assume-based ballot optimizations
-; This optimization recognizes patterns like:
-;   assume(ballot(cmp) == -1) -> cmp is true on all lanes
-;   assume(ballot(cmp) == 0)  -> cmp is false on all lanes
+; Tests for assume-based ballot optimizations for patterns like:
+;   assume(ballot(cmp) == -1)         -> replace uses of cmp with true
+;   assume(ballot(cmp) == 0)          -> replace uses of cmp with false
+;   assume(ballot(cmp) == ballot(1))  -> replace uses of cmp with true
 
 declare void @llvm.assume(i1)
 declare i64 @llvm.amdgcn.ballot.i64(i1)
@@ -26,7 +26,6 @@ define amdgpu_kernel void @assume_ballot_all_lanes_i64(i32 %x, ptr addrspace(1)
 ; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ; CHECK:       bar:
-; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %cmp = icmp eq i32 %x, 0
@@ -70,7 +69,7 @@ define amdgpu_kernel void @assume_ballot_exec_mask_ballot_true(i32 %x, ptr addrs
 ; CHECK-NEXT:    [[EXEC:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], [[EXEC]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
 ; CHECK:       foo:
 ; CHECK-NEXT:    ret void
 ; CHECK:       bar:
@@ -147,7 +146,7 @@ define amdgpu_kernel void @assume_ballot_exec_mask_wave32(i32 %x, ptr addrspace(
 ; CHECK-NEXT:    [[EXEC:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 true)
 ; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i32 [[BALLOT]], [[EXEC]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
 ; CHECK:       foo:
 ; CHECK-NEXT:    ret void
 ; CHECK:       bar:
@@ -175,7 +174,7 @@ define amdgpu_kernel void @assume_ballot_dominance(i32 %x, ptr addrspace(1) %out
 ; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
 ; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
 ; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[OUT2]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -196,7 +195,7 @@ define amdgpu_kernel void @assume_ballot_swapped(i32 %x, ptr addrspace(1) %out)
 ; CHECK-LABEL: @assume_ballot_swapped(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 -1, [[BALLOT]]
+; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
 ; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
 ; CHECK:       foo:
@@ -224,7 +223,7 @@ define amdgpu_kernel void @assume_ballot_exec_mask_swapped(i32 %x, ptr addrspace
 ; CHECK-NEXT:    [[EXEC:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[EXEC]], [[BALLOT]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
 ; CHECK:       foo:
 ; CHECK-NEXT:    ret void
 ; CHECK:       bar:
@@ -251,7 +250,7 @@ define amdgpu_kernel void @assume_ballot_multiple_uses(i32 %x, ptr addrspace(1)
 ; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
 ; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[OUT:%.*]], align 4
-; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
+; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
 ; CHECK-NEXT:    store i32 10, ptr addrspace(1) [[OUT2]], align 4
 ; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
 ; CHECK:       foo:
@@ -284,12 +283,10 @@ define amdgpu_kernel void @assume_ballot_exec_mask_multiple_uses(i32 %x, ptr add
 ; CHECK-NEXT:    [[EXEC:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; CHECK-NEXT:    [[ALL:%.*]] = icmp eq i64 [[BALLOT]], [[EXEC]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT:    [[USE1:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    store i32 [[USE1]], ptr addrspace(1) [[OUT:%.*]], align 4
-; CHECK-NEXT:    [[USE2:%.*]] = select i1 [[CMP]], i32 10, i32 20
-; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 1
-; CHECK-NEXT:    store i32 [[USE2]], ptr addrspace(1) [[OUT2]], align 4
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NEXT:    store i32 10, ptr addrspace(1) [[OUT2]], align 4
+; CHECK-NEXT:    br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
 ; CHECK:       foo:
 ; CHECK-NEXT:    ret void
 ; CHECK:       bar:
@@ -313,18 +310,14 @@ bar:
   ret void
 }
 
-; ============================================================================
-; NEGATIVE CASES
-; ============================================================================
-
-; Test 1: assume(ballot != -1) -> cmp should not be transformed (cmp is false in at least one lane)
-define amdgpu_kernel void @assume_ballot_ne_negative(i32 %x, ptr addrspace(1) %out) {
-; CHECK-LABEL: @assume_ballot_ne_negative(
+; Test 12: ballot(cmp) == ballot(false) -> cmp replaced with false
+define amdgpu_kernel void @assume_ballot_false(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_false(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[NOT_ALL:%.*]] = icmp ne i64 [[BALLOT]], -1
-; CHECK-NEXT:    call void @llvm.assume(i1 [[NOT_ALL]])
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i64 [[BALLOT]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[MATCHES]])
+; CHECK-NEXT:    br i1 false, label [[FOO:%.*]], label [[BAR:%.*]]
 ; CHECK:       foo:
 ; CHECK-NEXT:    ret void
 ; CHECK:       bar:
@@ -332,8 +325,9 @@ define amdgpu_kernel void @assume_ballot_ne_negative(i32 %x, ptr addrspace(1) %o
 ;
   %cmp = icmp eq i32 %x, 0
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %not_all = icmp ne i64 %ballot, -1
-  call void @llvm.assume(i1 %not_all)
+  %not_exec = call i64 @llvm.amdgcn.ballot.i64(i1 false)
+  %matches = icmp eq i64 %ballot, %not_exec
+  call void @llvm.assume(i1 %matches)
   br i1 %cmp, label %foo, label %bar
 
 foo:
@@ -342,13 +336,17 @@ bar:
   ret void
 }
 
-; Test 2: assume(ballot != 0) -> cmp should not be transformed (cmp is true in at least one lane)
-define amdgpu_kernel void @assume_ballot_ne_zero_negative(i32 %x, ptr addrspace(1) %out) {
-; CHECK-LABEL: @assume_ballot_ne_zero_negative(
+; ============================================================================
+; NEGATIVE CASES
+; ============================================================================
+
+; Test 1: assume(ballot != -1) -> no transformation (requires icmp eq)
+define amdgpu_kernel void @assume_ballot_ne_negative(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_ne_negative(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[SOME:%.*]] = icmp ne i64 [[BALLOT]], 0
-; CHECK-NEXT:    call void @llvm.assume(i1 [[SOME]])
+; CHECK-NEXT:    [[NOT_ALL:%.*]] = icmp ne i64 [[BALLOT]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[NOT_ALL]])
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
 ; CHECK:       foo:
 ; CHECK-NEXT:    ret void
@@ -357,8 +355,8 @@ define amdgpu_kernel void @assume_ballot_ne_zero_negative(i32 %x, ptr addrspace(
 ;
   %cmp = icmp eq i32 %x, 0
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %some = icmp ne i64 %ballot, 0
-  call void @llvm.assume(i1 %some)
+  %not_all = icmp ne i64 %ballot, -1
+  call void @llvm.assume(i1 %not_all)
   br i1 %cmp, label %foo, label %bar
 
 foo:
@@ -367,14 +365,13 @@ bar:
   ret void
 }
 
-; Test 3: ballot(cmp) == ballot(false) -> cmp should not be transformed (RHS is not EXEC MASK)
-define amdgpu_kernel void @assume_ballot_not_exec_mask(i32 %x, ptr addrspace(1) %out) {
-; CHECK-LABEL: @assume_ballot_not_exec_mask(
+; Test 2: assume(ballot != 0) -> no transformation (requires icmp eq)
+define amdgpu_kernel void @assume_ballot_ne_zero_negative(i32 %x, ptr addrspace(1) %out) {
+; CHECK-LABEL: @assume_ballot_ne_zero_negative(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT:    [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT:    [[NOT_EXEC:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 false)
-; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i64 [[BALLOT]], [[NOT_EXEC]]
-; CHECK-NEXT:    call void @llvm.assume(i1 [[MATCHES]])
+; CHECK-NEXT:    [[SOME:%.*]] = icmp ne i64 [[BALLOT]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[SOME]])
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]]
 ; CHECK:       foo:
 ; CHECK-NEXT:    ret void
@@ -383,9 +380,8 @@ define amdgpu_kernel void @assume_ballot_not_exec_mask(i32 %x, ptr addrspace(1)
 ;
   %cmp = icmp eq i32 %x, 0
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %not_exec = call i64 @llvm.amdgcn.ballot.i64(i1 false)
-  %matches = icmp eq i64 %ballot, %not_exec
-  call void @llvm.assume(i1 %matches)
+  %some = icmp ne i64 %ballot, 0
+  call void @llvm.assume(i1 %some)
   br i1 %cmp, label %foo, label %bar
 
 foo:
@@ -394,7 +390,7 @@ bar:
   ret void
 }
 
-; Test 4: Constant as mask value (other than -1 or 0) -> cmp should not be transformed
+; Test 3: Constant mask (other than -1/0) -> no transformation
 define amdgpu_kernel void @assume_ballot_constant_mask(i32 %x, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @assume_ballot_constant_mask(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
@@ -419,7 +415,7 @@ bar:
   ret void
 }
 
-; Test 5: Arbitrary mask -> cmp should not be transformed
+; Test 4: Runtime mask value -> no transformation
 define amdgpu_kernel void @assume_ballot_arbitrary_mask(i32 %x, i64 %mask, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @assume_ballot_arbitrary_mask(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0


