[llvm] Fold patterns which use v4i32 type for comparisons on v2i64 type (PR #184328)
Fuad Ismail via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 6 10:19:25 PST 2026
https://github.com/fuad1502 updated https://github.com/llvm/llvm-project/pull/184328
>From 5c444710bf7fd07d8405aa8b6802cff845f9994a Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Tue, 3 Mar 2026 07:15:43 +0700
Subject: [PATCH 01/13] Add lit test for folding the v4i32 equals-shuffle-and
 pattern to v2i64 equals
---
.../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll | 34 +++++++++++++++++++
1 file changed, 34 insertions(+)
create mode 100644 llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
diff --git a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
new file mode 100644
index 0000000000000..3c1b98af193f5
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+
+define <4 x i32> @cmpeq_epi64_select(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
+; CHECK-NEXT: ret <4 x i32> [[SELECT]]
+;
+ %cmp = icmp eq <4 x i32> %a, %b
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %select = select <4 x i1> %cmp, <4 x i32> %shuffle, <4 x i32> zeroinitializer
+ ret <4 x i32> %select
+}
+
+define <4 x i32> @cmpeq_epi64_and(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_and(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[SEXT]], [[SHUFFLE]]
+; CHECK-NEXT: ret <4 x i32> [[AND]]
+;
+ %cmp = icmp eq <4 x i32> %a, %b
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %and = and <4 x i32> %sext, %shuffle
+ ret <4 x i32> %and
+}
>From b8f17f37a75293051498bac60b03aa40c0e3c365 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Tue, 3 Mar 2026 12:08:15 +0700
Subject: [PATCH 02/13] Apply folding for v4i32 equals-shuffle-and pattern
---
.../Transforms/Vectorize/VectorCombine.cpp | 73 +++++++++++++++++++
.../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll | 18 +++--
2 files changed, 83 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 1f37e435b8080..ee29ce690a435 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -27,12 +27,15 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -153,6 +156,7 @@ class VectorCombine {
bool foldEquivalentReductionCmp(Instruction &I);
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
+ bool foldEqualShuffleAnd(Instruction &I);
bool shrinkType(Instruction &I);
bool shrinkLoadForShuffles(Instruction &I);
bool shrinkPhiOfShuffles(Instruction &I);
@@ -5435,6 +5439,69 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
return true;
}
+// Prior to SSE4.1, performing equality comparison on v2i64 types require a
+// comparison on v4i32 types using the following pattern:
+//
+// ...
+// %3 = icmp eq <4 x i32> %1, %2
+// %4 = sext <4 x i1> %3 to <4 x i32>
+// %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> <i32 1, i32 0,
+// i32 3, i32 2> %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32>
+// zeroinitializer
+// ...
+//
+// We should detect such patterns and fold them to:
+//
+// %3 = bitcast <4 x i32> %1 to <2 x i64>
+// %4 = bitcast <4 x i32> %2 to <2 x i64>
+// %5 = icmp eq <2 x i64> %3, %4
+// %6 = bitcast <2 x i64> %5 to <4 x i32>
+//
+bool VectorCombine::foldEqualShuffleAnd(Instruction &I) {
+ // Check pattern existance
+ Value *L, *R;
+ CmpPredicate Pred;
+
+ auto Equal = m_ICmp(Pred, m_Value(L), m_Value(R));
+ SmallVector<int> Mask = {1, 0, 3, 2};
+ auto Shuffle =
+ m_CombineOr(m_SExt(m_Shuffle(Equal, m_Poison(), m_SpecificMask(Mask))),
+ m_Shuffle(m_SExt(Equal), m_Poison(), m_SpecificMask(Mask)));
+
+ if (!match(&I, m_CombineOr(m_And(m_SExt(Equal), Shuffle),
+ m_Select(Equal, Shuffle, m_ZeroInt()))) ||
+ !ICmpInst::isEquality(Pred) || !L->getType()->isVectorTy())
+ return false;
+
+ auto *OldVecType = cast<VectorType>(L->getType());
+
+ if (OldVecType->isScalableTy() ||
+ !OldVecType->getElementType()->isIntegerTy())
+ return false;
+
+ int ElementCount = OldVecType->getElementCount().getFixedValue();
+ int ElementBitWidth = OldVecType->getElementType()->getIntegerBitWidth();
+
+ if (ElementCount != 4 || ElementBitWidth != 32)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "VC: Found equal-shuffle-and pattern" << '\n');
+
+ // Perform folding
+ IRBuilder Builder(&I);
+ auto *NewElementType = IntegerType::get(I.getContext(), ElementBitWidth * 2);
+ auto *NewVecType = VectorType::get(NewElementType, ElementCount / 2, false);
+ auto *BitCastL = Builder.CreateBitCast(L, NewVecType);
+ auto *BitCastR = Builder.CreateBitCast(R, NewVecType);
+ auto *Cmp = Builder.CreateICmp(Pred, BitCastL, BitCastR);
+ auto *SExt = Builder.CreateSExt(Cmp, NewVecType);
+ auto *BitCastCmp = Builder.CreateBitCast(SExt, OldVecType);
+
+ replaceValue(I, *BitCastCmp);
+
+ return false;
+}
+
// Attempt to shrink loads that are only used by shufflevector instructions.
bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
auto *OldLoad = dyn_cast<LoadInst>(&I);
@@ -5777,11 +5844,17 @@ bool VectorCombine::run() {
return true;
if (foldBitOpOfCastConstant(I))
return true;
+ if (foldEqualShuffleAnd(I))
+ return true;
break;
case Instruction::PHI:
if (shrinkPhiOfShuffles(I))
return true;
break;
+ case Instruction::Select:
+ if (foldEqualShuffleAnd(I))
+ return true;
+ break;
default:
if (shrinkType(I))
return true;
diff --git a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
index 3c1b98af193f5..42f3222ae3d27 100644
--- a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -4,10 +4,11 @@
define <4 x i32> @cmpeq_epi64_select(<4 x i32> noundef %a, <4 x i32> noundef %b) {
; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select(
; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-; CHECK-NEXT: [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: [[SELECT:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[SELECT]]
;
%cmp = icmp eq <4 x i32> %a, %b
@@ -20,10 +21,11 @@ define <4 x i32> @cmpeq_epi64_select(<4 x i32> noundef %a, <4 x i32> noundef %b)
define <4 x i32> @cmpeq_epi64_and(<4 x i32> noundef %a, <4 x i32> noundef %b) {
; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_and(
; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[SEXT]], [[SHUFFLE]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: [[AND:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[AND]]
;
%cmp = icmp eq <4 x i32> %a, %b
>From 10ad4ba00e85fadba6255e53594144aa8973f068 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Tue, 3 Mar 2026 13:13:33 +0700
Subject: [PATCH 03/13] Handle commuted 'and' instruction
---
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 +-
.../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll | 17 +++++++++++++++++
2 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index ee29ce690a435..13559610e37a2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -5468,7 +5468,7 @@ bool VectorCombine::foldEqualShuffleAnd(Instruction &I) {
m_CombineOr(m_SExt(m_Shuffle(Equal, m_Poison(), m_SpecificMask(Mask))),
m_Shuffle(m_SExt(Equal), m_Poison(), m_SpecificMask(Mask)));
- if (!match(&I, m_CombineOr(m_And(m_SExt(Equal), Shuffle),
+ if (!match(&I, m_CombineOr(m_c_And(m_SExt(Equal), Shuffle),
m_Select(Equal, Shuffle, m_ZeroInt()))) ||
!ICmpInst::isEquality(Pred) || !L->getType()->isVectorTy())
return false;
diff --git a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
index 42f3222ae3d27..983a5a6708609 100644
--- a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -34,3 +34,20 @@ define <4 x i32> @cmpeq_epi64_and(<4 x i32> noundef %a, <4 x i32> noundef %b) {
%and = and <4 x i32> %sext, %shuffle
ret <4 x i32> %and
}
+
+define <4 x i32> @cmpeq_epi64_commutated_and(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_commutated_and(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: [[AND:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[AND]]
+;
+ %cmp = icmp eq <4 x i32> %a, %b
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %and = and <4 x i32> %shuffle, %sext
+ ret <4 x i32> %and
+}
>From 79bbff7d352bfd6f49990f76b2e04621695ab128 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Tue, 3 Mar 2026 16:44:27 +0700
Subject: [PATCH 04/13] Don't fold when intermediate instructions have uses
 outside the pattern
---
.../Transforms/Vectorize/VectorCombine.cpp | 53 ++++++++++++-----
.../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll | 57 +++++++++++++++++++
2 files changed, 95 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 13559610e37a2..805aedfc61c04 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -5442,37 +5442,46 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
// Prior to SSE4.1, performing equality comparison on v2i64 types require a
// comparison on v4i32 types using the following pattern:
//
-// ...
// %3 = icmp eq <4 x i32> %1, %2
+//
// %4 = sext <4 x i1> %3 to <4 x i32>
+//
// %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> <i32 1, i32 0,
-// i32 3, i32 2> %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32>
-// zeroinitializer
-// ...
+// i32 3, i32 2>
+//
+// %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> zeroinitializer
+//
+// OR
+//
+// %6 = and <4 x i32> %sext, %shuffle
//
-// We should detect such patterns and fold them to:
+// We should detect such patterns and fold them into:
//
// %3 = bitcast <4 x i32> %1 to <2 x i64>
+//
// %4 = bitcast <4 x i32> %2 to <2 x i64>
+//
// %5 = icmp eq <2 x i64> %3, %4
+//
// %6 = bitcast <2 x i64> %5 to <4 x i32>
//
bool VectorCombine::foldEqualShuffleAnd(Instruction &I) {
- // Check pattern existance
- Value *L, *R;
+ Value *Equal, *Shuffle, *L, *R;
CmpPredicate Pred;
-
- auto Equal = m_ICmp(Pred, m_Value(L), m_Value(R));
SmallVector<int> Mask = {1, 0, 3, 2};
- auto Shuffle =
- m_CombineOr(m_SExt(m_Shuffle(Equal, m_Poison(), m_SpecificMask(Mask))),
- m_Shuffle(m_SExt(Equal), m_Poison(), m_SpecificMask(Mask)));
- if (!match(&I, m_CombineOr(m_c_And(m_SExt(Equal), Shuffle),
- m_Select(Equal, Shuffle, m_ZeroInt()))) ||
- !ICmpInst::isEquality(Pred) || !L->getType()->isVectorTy())
+ // Check pattern existance
+ if (!match(&I, m_CombineOr(m_c_And(m_SExt(m_Value(Equal)),
+ m_SExtOrSelf(m_Value(Shuffle))),
+ m_Select(m_Value(Equal),
+ m_SExtOrSelf(m_Value(Shuffle)),
+ m_ZeroInt()))) ||
+ !match(Shuffle, m_Shuffle(m_SExtOrSelf(m_Specific(Equal)), m_Poison(),
+ m_SpecificMask(Mask))) ||
+ !match(Equal, m_ICmp(Pred, m_Value(L), m_Value(R))))
return false;
+ // Check argument type
auto *OldVecType = cast<VectorType>(L->getType());
if (OldVecType->isScalableTy() ||
@@ -5485,6 +5494,20 @@ bool VectorCombine::foldEqualShuffleAnd(Instruction &I) {
if (ElementCount != 4 || ElementBitWidth != 32)
return false;
+ // Check uses outside pattern
+ if (!Shuffle->hasOneUse())
+ return false;
+
+ for (auto *U : Equal->users()) {
+ if (U == &I || U == Shuffle)
+ continue;
+ if (!isa<llvm::CastInst>(U))
+ return false;
+ for (auto *U : U->users())
+ if (U != &I && U != Shuffle)
+ return false;
+ }
+
LLVM_DEBUG(dbgs() << "VC: Found equal-shuffle-and pattern" << '\n');
// Perform folding
diff --git a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
index 983a5a6708609..2d7e72d359973 100644
--- a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -51,3 +51,60 @@ define <4 x i32> @cmpeq_epi64_commutated_and(<4 x i32> noundef %a, <4 x i32> nou
%and = and <4 x i32> %shuffle, %sext
ret <4 x i32> %and
}
+
+declare void @use.v4i1(<4 x i1>)
+declare void @use.v4i32(<4 x i32>)
+
+define <4 x i32> @cmpeq_epi64_multi_use_cmp(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_multi_use_cmp(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: call void @use.v4i1(<4 x i1> [[CMP]])
+; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
+; CHECK-NEXT: ret <4 x i32> [[AND]]
+;
+ %cmp = icmp eq <4 x i32> %a, %b
+ call void @use.v4i1(<4 x i1> %cmp)
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %and = and <4 x i32> %shuffle, %sext
+ ret <4 x i32> %and
+}
+
+define <4 x i32> @cmpeq_epi64_multi_use_sext(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_multi_use_sext(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT: call void @use.v4i32(<4 x i32> [[SEXT]])
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
+; CHECK-NEXT: ret <4 x i32> [[AND]]
+;
+ %cmp = icmp eq <4 x i32> %a, %b
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ call void @use.v4i32(<4 x i32> %sext)
+ %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %and = and <4 x i32> %shuffle, %sext
+ ret <4 x i32> %and
+}
+
+define <4 x i32> @cmpeq_epi64_multi_use_shuffle(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_multi_use_shuffle(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: call void @use.v4i32(<4 x i32> [[SHUFFLE]])
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
+; CHECK-NEXT: ret <4 x i32> [[AND]]
+;
+ %cmp = icmp eq <4 x i32> %a, %b
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ call void @use.v4i32(<4 x i32> %shuffle)
+ %and = and <4 x i32> %shuffle, %sext
+ ret <4 x i32> %and
+}
>From dd0432152c35d705e03983a55f5fae2758699564 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Tue, 3 Mar 2026 18:10:25 +0700
Subject: [PATCH 05/13] Add negative test cases and add icmp condition code
check
---
.../Transforms/Vectorize/VectorCombine.cpp | 3 +-
.../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll | 64 +++++++++++++++++++
2 files changed, 66 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 805aedfc61c04..fd2ca3ef7c901 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -5478,7 +5478,8 @@ bool VectorCombine::foldEqualShuffleAnd(Instruction &I) {
m_ZeroInt()))) ||
!match(Shuffle, m_Shuffle(m_SExtOrSelf(m_Specific(Equal)), m_Poison(),
m_SpecificMask(Mask))) ||
- !match(Equal, m_ICmp(Pred, m_Value(L), m_Value(R))))
+ !match(Equal, m_ICmp(Pred, m_Value(L), m_Value(R))) ||
+ !CmpInst::isEquality(Pred))
return false;
// Check argument type
diff --git a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
index 2d7e72d359973..4e13afd360673 100644
--- a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -108,3 +108,67 @@ define <4 x i32> @cmpeq_epi64_multi_use_shuffle(<4 x i32> noundef %a, <4 x i32>
%and = and <4 x i32> %shuffle, %sext
ret <4 x i32> %and
}
+
+define <4 x i32> @cmpeq_epi64_select_neg_0(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select_neg_0(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
+; CHECK-NEXT: ret <4 x i32> [[SELECT]]
+;
+ %cmp = icmp sgt <4 x i32> %a, %b
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %select = select <4 x i1> %cmp, <4 x i32> %shuffle, <4 x i32> zeroinitializer
+ ret <4 x i32> %select
+}
+
+define <4 x i32> @cmpeq_epi64_and_neg_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_and_neg_1(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[SEXT:%.*]] = zext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[SEXT]], [[SHUFFLE]]
+; CHECK-NEXT: ret <4 x i32> [[AND]]
+;
+ %cmp = icmp eq <4 x i32> %a, %b
+ %sext = zext <4 x i1> %cmp to <4 x i32>
+ %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %and = and <4 x i32> %sext, %shuffle
+ ret <4 x i32> %and
+}
+
+define <4 x i32> @cmpeq_epi64_select_neg_2(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select_neg_2(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
+; CHECK-NEXT: ret <4 x i32> [[SELECT]]
+;
+ %cmp = icmp eq <4 x i32> %a, %b
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %select = select <4 x i1> %cmp, <4 x i32> %shuffle, <4 x i32> zeroinitializer
+ ret <4 x i32> %select
+}
+
+define <4 x i32> @cmpeq_epi64_select_neg_3(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select_neg_3(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> [[SEXT]]
+; CHECK-NEXT: ret <4 x i32> [[SELECT]]
+;
+ %cmp = icmp eq <4 x i32> %a, %b
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %select = select <4 x i1> %cmp, <4 x i32> %shuffle, <4 x i32> %sext
+ ret <4 x i32> %select
+}
>From 32b31c2f728bfbbfcc982db0550d1f5f289dc71f Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Tue, 3 Mar 2026 19:03:24 +0700
Subject: [PATCH 06/13] Remove unnecessary additional headers
---
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 3 ---
1 file changed, 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fd2ca3ef7c901..9832b396bde71 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -27,15 +27,12 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
>From 5f46a5f41a65ab7759110db98c5797e61ad5b4d0 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Wed, 4 Mar 2026 18:06:25 +0700
Subject: [PATCH 07/13] Move v4i32 eq-shuffle-and folding to InstCombine
---
.../InstCombine/InstCombineAndOrXor.cpp | 3 +
.../InstCombine/InstCombineInternal.h | 2 +
.../InstCombine/InstCombineSelect.cpp | 3 +
.../InstCombine/InstructionCombining.cpp | 84 +++++++++++++++++
.../Transforms/Vectorize/VectorCombine.cpp | 94 -------------------
.../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll | 28 +++---
6 files changed, 106 insertions(+), 108 deletions(-)
rename llvm/test/Transforms/{VectorCombine => InstCombine}/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll (88%)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 073f094639fa0..30c1e8e8aca73 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2898,6 +2898,9 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
/*SimplifyOnly*/ false, *this))
return BinaryOperator::CreateAnd(Op0, V);
+ if (auto *Folded = foldV4EqualShuffleAndToV2Equal(I))
+ return Folded;
+
return nullptr;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 2e7758e952eaf..fed88cc84f46e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -484,6 +484,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
return Sel;
}
+ Instruction *foldV4EqualShuffleAndToV2Equal(Instruction &I);
+
public:
/// Create and insert the idiom we use to indicate a block is unreachable
/// without having to rewrite the CFG from within InstCombine.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 6707d1abf5ca0..598008a3d8d24 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -4982,5 +4982,8 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
return CallInst::Create(Scmp, {CmpLHS, ConstantInt::get(SI.getType(), 0)});
}
+ if (auto *Folded = foldV4EqualShuffleAndToV2Equal(SI))
+ return Folded;
+
return nullptr;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 0ca5da1bbf251..1c71bb5387ab5 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1296,6 +1296,90 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) {
return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
}
+// Prior to SSE4.1, performing equality comparison on v2i64 types require a
+// comparison on v4i32 types using the following pattern:
+//
+// %3 = icmp eq <4 x i32> %1, %2
+//
+// %4 = sext <4 x i1> %3 to <4 x i32>
+//
+// %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> <i32 1, i32 0,
+// i32 3, i32 2>
+//
+// %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> zeroinitializer
+//
+// OR
+//
+// %6 = and <4 x i32> %sext, %shuffle
+//
+// We should detect such patterns and fold them into:
+//
+// %3 = bitcast <4 x i32> %1 to <2 x i64>
+//
+// %4 = bitcast <4 x i32> %2 to <2 x i64>
+//
+// %5 = icmp eq <2 x i64> %3, %4
+//
+// %6 = bitcast <2 x i64> %5 to <4 x i32>
+//
+Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
+  Value *Equal, *Shuffle, *L, *R;
+  CmpPredicate Pred;
+  SmallVector<int> Mask = {1, 0, 3, 2};
+
+  // Check pattern existence
+  if (!match(&I,
+             m_CombineOr(m_c_And(m_SExt(m_Value(Equal)),
+                                 m_SExtOrSelf(m_Value(Shuffle))),
+                         m_Select(m_Value(Equal),
+                                  m_SExtOrSelf(m_Value(Shuffle)), m_Zero()))) ||
+      !match(Shuffle, m_Shuffle(m_SExtOrSelf(m_Specific(Equal)), m_Poison(),
+                                m_SpecificMask(Mask))) ||
+      !match(Equal, m_ICmp(Pred, m_Value(L), m_Value(R))) ||
+      Pred != CmpInst::ICMP_EQ)
+    return nullptr;
+
+  // Check argument type: only fixed-width integer vectors are handled.
+  auto *OldVecType = cast<VectorType>(L->getType());
+
+  if (OldVecType->isScalableTy() ||
+      !OldVecType->getElementType()->isIntegerTy())
+    return nullptr;
+
+  int ElementCount = OldVecType->getElementCount().getFixedValue();
+  int ElementBitWidth = OldVecType->getElementType()->getIntegerBitWidth();
+
+  if (ElementCount != 4 || ElementBitWidth != 32)
+    return nullptr;
+
+  // Bail out if intermediate instructions have uses outside the pattern.
+  if (!Shuffle->hasOneUse())
+    return nullptr;
+
+  for (auto *U : Equal->users()) {
+    if (U == &I || U == Shuffle)
+      continue;
+    if (!isa<CastInst>(U))
+      return nullptr;
+    for (auto *CastUser : U->users())
+      if (CastUser != &I && CastUser != Shuffle)
+        return nullptr;
+  }
+
+  LLVM_DEBUG(dbgs() << "IC: Folding equal-shuffle-and pattern" << '\n');
+
+  // Perform folding: compare the operands as <2 x i64> and bitcast back.
+  auto *NewElementType = IntegerType::get(I.getContext(), ElementBitWidth * 2);
+  auto *NewVecType = VectorType::get(NewElementType, ElementCount / 2, false);
+  auto *BitCastL = Builder.CreateBitCast(L, NewVecType);
+  auto *BitCastR = Builder.CreateBitCast(R, NewVecType);
+  auto *Cmp = Builder.CreateICmp(Pred, BitCastL, BitCastR);
+  auto *SExt = Builder.CreateSExt(Cmp, NewVecType);
+  auto *BitCastCmp = Builder.CreateBitCast(SExt, OldVecType);
+
+  return replaceInstUsesWith(I, BitCastCmp);
+}
+
static std::optional<std::pair<Value *, Value *>>
matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) {
if (LHS->getParent() != RHS->getParent())
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 9832b396bde71..1f37e435b8080 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -153,7 +153,6 @@ class VectorCombine {
bool foldEquivalentReductionCmp(Instruction &I);
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
- bool foldEqualShuffleAnd(Instruction &I);
bool shrinkType(Instruction &I);
bool shrinkLoadForShuffles(Instruction &I);
bool shrinkPhiOfShuffles(Instruction &I);
@@ -5436,93 +5435,6 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
return true;
}
-// Prior to SSE4.1, performing equality comparison on v2i64 types require a
-// comparison on v4i32 types using the following pattern:
-//
-// %3 = icmp eq <4 x i32> %1, %2
-//
-// %4 = sext <4 x i1> %3 to <4 x i32>
-//
-// %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> <i32 1, i32 0,
-// i32 3, i32 2>
-//
-// %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> zeroinitializer
-//
-// OR
-//
-// %6 = and <4 x i32> %sext, %shuffle
-//
-// We should detect such patterns and fold them into:
-//
-// %3 = bitcast <4 x i32> %1 to <2 x i64>
-//
-// %4 = bitcast <4 x i32> %2 to <2 x i64>
-//
-// %5 = icmp eq <2 x i64> %3, %4
-//
-// %6 = bitcast <2 x i64> %5 to <4 x i32>
-//
-bool VectorCombine::foldEqualShuffleAnd(Instruction &I) {
- Value *Equal, *Shuffle, *L, *R;
- CmpPredicate Pred;
- SmallVector<int> Mask = {1, 0, 3, 2};
-
- // Check pattern existance
- if (!match(&I, m_CombineOr(m_c_And(m_SExt(m_Value(Equal)),
- m_SExtOrSelf(m_Value(Shuffle))),
- m_Select(m_Value(Equal),
- m_SExtOrSelf(m_Value(Shuffle)),
- m_ZeroInt()))) ||
- !match(Shuffle, m_Shuffle(m_SExtOrSelf(m_Specific(Equal)), m_Poison(),
- m_SpecificMask(Mask))) ||
- !match(Equal, m_ICmp(Pred, m_Value(L), m_Value(R))) ||
- !CmpInst::isEquality(Pred))
- return false;
-
- // Check argument type
- auto *OldVecType = cast<VectorType>(L->getType());
-
- if (OldVecType->isScalableTy() ||
- !OldVecType->getElementType()->isIntegerTy())
- return false;
-
- int ElementCount = OldVecType->getElementCount().getFixedValue();
- int ElementBitWidth = OldVecType->getElementType()->getIntegerBitWidth();
-
- if (ElementCount != 4 || ElementBitWidth != 32)
- return false;
-
- // Check uses outside pattern
- if (!Shuffle->hasOneUse())
- return false;
-
- for (auto *U : Equal->users()) {
- if (U == &I || U == Shuffle)
- continue;
- if (!isa<llvm::CastInst>(U))
- return false;
- for (auto *U : U->users())
- if (U != &I && U != Shuffle)
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "VC: Found equal-shuffle-and pattern" << '\n');
-
- // Perform folding
- IRBuilder Builder(&I);
- auto *NewElementType = IntegerType::get(I.getContext(), ElementBitWidth * 2);
- auto *NewVecType = VectorType::get(NewElementType, ElementCount / 2, false);
- auto *BitCastL = Builder.CreateBitCast(L, NewVecType);
- auto *BitCastR = Builder.CreateBitCast(R, NewVecType);
- auto *Cmp = Builder.CreateICmp(Pred, BitCastL, BitCastR);
- auto *SExt = Builder.CreateSExt(Cmp, NewVecType);
- auto *BitCastCmp = Builder.CreateBitCast(SExt, OldVecType);
-
- replaceValue(I, *BitCastCmp);
-
- return false;
-}
-
// Attempt to shrink loads that are only used by shufflevector instructions.
bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
auto *OldLoad = dyn_cast<LoadInst>(&I);
@@ -5865,17 +5777,11 @@ bool VectorCombine::run() {
return true;
if (foldBitOpOfCastConstant(I))
return true;
- if (foldEqualShuffleAnd(I))
- return true;
break;
case Instruction::PHI:
if (shrinkPhiOfShuffles(I))
return true;
break;
- case Instruction::Select:
- if (foldEqualShuffleAnd(I))
- return true;
- break;
default:
if (shrinkType(I))
return true;
diff --git a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
similarity index 88%
rename from llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
rename to llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
index 4e13afd360673..ff23a9d4e277c 100644
--- a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
define <4 x i32> @cmpeq_epi64_select(<4 x i32> noundef %a, <4 x i32> noundef %b) {
; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select(
@@ -62,7 +62,7 @@ define <4 x i32> @cmpeq_epi64_multi_use_cmp(<4 x i32> noundef %a, <4 x i32> noun
; CHECK-NEXT: call void @use.v4i1(<4 x i1> [[CMP]])
; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
+; CHECK-NEXT: [[AND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
; CHECK-NEXT: ret <4 x i32> [[AND]]
;
%cmp = icmp eq <4 x i32> %a, %b
@@ -80,7 +80,7 @@ define <4 x i32> @cmpeq_epi64_multi_use_sext(<4 x i32> noundef %a, <4 x i32> nou
; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
; CHECK-NEXT: call void @use.v4i32(<4 x i32> [[SEXT]])
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
+; CHECK-NEXT: [[AND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
; CHECK-NEXT: ret <4 x i32> [[AND]]
;
%cmp = icmp eq <4 x i32> %a, %b
@@ -98,7 +98,7 @@ define <4 x i32> @cmpeq_epi64_multi_use_shuffle(<4 x i32> noundef %a, <4 x i32>
; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
; CHECK-NEXT: call void @use.v4i32(<4 x i32> [[SHUFFLE]])
-; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
+; CHECK-NEXT: [[AND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
; CHECK-NEXT: ret <4 x i32> [[AND]]
;
%cmp = icmp eq <4 x i32> %a, %b
@@ -113,8 +113,8 @@ define <4 x i32> @cmpeq_epi64_select_neg_0(<4 x i32> noundef %a, <4 x i32> nound
; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select_neg_0(
; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <4 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
; CHECK-NEXT: [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
; CHECK-NEXT: ret <4 x i32> [[SELECT]]
;
@@ -131,7 +131,7 @@ define <4 x i32> @cmpeq_epi64_and_neg_1(<4 x i32> noundef %a, <4 x i32> noundef
; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
; CHECK-NEXT: [[SEXT:%.*]] = zext <4 x i1> [[CMP]] to <4 x i32>
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[SEXT]], [[SHUFFLE]]
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
; CHECK-NEXT: ret <4 x i32> [[AND]]
;
%cmp = icmp eq <4 x i32> %a, %b
@@ -145,8 +145,8 @@ define <4 x i32> @cmpeq_epi64_select_neg_2(<4 x i32> noundef %a, <4 x i32> nound
; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select_neg_2(
; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
; CHECK-NEXT: ret <4 x i32> [[SELECT]]
;
@@ -162,13 +162,13 @@ define <4 x i32> @cmpeq_epi64_select_neg_3(<4 x i32> noundef %a, <4 x i32> nound
; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> [[SEXT]]
-; CHECK-NEXT: ret <4 x i32> [[SELECT]]
+; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE1]], <4 x i32> splat (i32 1)
+; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]]
;
%cmp = icmp eq <4 x i32> %a, %b
%sext = sext <4 x i1> %cmp to <4 x i32>
- %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
- %select = select <4 x i1> %cmp, <4 x i32> %shuffle, <4 x i32> %sext
+ %shuffle = shufflevector <4 x i32> %sext, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %select = select <4 x i1> %cmp, <4 x i32> %shuffle, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %select
}
>From 4b9e070d638ff320b0f085715e85dd03703523c5 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Thu, 5 Mar 2026 15:25:30 +0700
Subject: [PATCH 08/13] Add folding v2i64 cmpgt using v4i32 pattern test
---
.../fold-v2i64-cmpgt-using-v4i32-pattern.ll | 86 +++++++++++++++++++
1 file changed, 86 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
diff --git a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
new file mode 100644
index 0000000000000..3442b004eff13
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define <4 x i32> @alt_cmpgt_epi64(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT: [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT: [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_A]], [[XOR_B]]
+; CHECK-NEXT: [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT: [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT: [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT: [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
+; CHECK-NEXT: [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
+; CHECK-NEXT: [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT: ret <4 x i32> [[OR]]
+;
+ %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+ %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+ %gt = icmp sgt <4 x i32> %xor.a, %xor.b
+ %sext.gt = sext <4 x i1> %gt to <4 x i32>
+ %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %eq = icmp eq <4 x i32> %a, %b
+ %sext.eq = sext <4 x i1> %eq to <4 x i32>
+ %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %and = and <4 x i32> %gt.0, %eq.0
+ %or = or <4 x i32> %and, %gt.1
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @alt_cmpgt_epu64_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epu64_1(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[GT:%.*]] = icmp ugt <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT: [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT: [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT: [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
+; CHECK-NEXT: [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
+; CHECK-NEXT: [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT: ret <4 x i32> [[OR]]
+;
+ %gt = icmp ugt <4 x i32> %a, %b
+ %sext.gt = sext <4 x i1> %gt to <4 x i32>
+ %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %eq = icmp eq <4 x i32> %a, %b
+ %sext.eq = sext <4 x i1> %eq to <4 x i32>
+ %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %and = and <4 x i32> %gt.0, %eq.0
+ %or = or <4 x i32> %and, %gt.1
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @alt_cmpgt_epu64_2(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epu64_2(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[GT:%.*]] = icmp ugt <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT: [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT: [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT: [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
+; CHECK-NEXT: [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
+; CHECK-NEXT: [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT: ret <4 x i32> [[OR]]
+;
+ %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %gt = icmp sgt <4 x i32> %xor.a, %xor.b
+ %sext.gt = sext <4 x i1> %gt to <4 x i32>
+ %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %eq = icmp eq <4 x i32> %a, %b
+ %sext.eq = sext <4 x i1> %eq to <4 x i32>
+ %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %and = and <4 x i32> %gt.0, %eq.0
+ %or = or <4 x i32> %and, %gt.1
+ ret <4 x i32> %or
+}
>From 28599ed5b4ef8a838cb589700c28fc739ab86f55 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Fri, 6 Mar 2026 16:40:03 +0700
Subject: [PATCH 09/13] Apply folding for v2i64 greater comparison using v4i32
pattern
---
.../InstCombine/InstCombineAndOrXor.cpp | 3 +
.../InstCombine/InstCombineInternal.h | 2 +
.../InstCombine/InstructionCombining.cpp | 79 +++++++++++++++++++
.../fold-v2i64-cmpgt-using-v4i32-pattern.ll | 44 ++++-------
4 files changed, 99 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 30c1e8e8aca73..b872ac423f0c9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -4652,6 +4652,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
if (Value *Res = FoldOrOfSelectSmaxToAbs(I, Builder))
return replaceInstUsesWith(I, Res);
+ if (Instruction *Folded = foldV2CmpGtUsingV4CmpGtPattern(I))
+ return Folded;
+
return nullptr;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index fed88cc84f46e..fb79291977eb6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -486,6 +486,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
Instruction *foldV4EqualShuffleAndToV2Equal(Instruction &I);
+ Instruction *foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I);
+
public:
/// Create and insert the idiom we use to indicate a block is unreachable
/// without having to rewrite the CFG from within InstCombine.
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 1c71bb5387ab5..c2ef56ad67ce5 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1380,6 +1380,85 @@ Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
return replaceInstUsesWith(I, BitCastCmp);
}
+// Prior to SSE4.2, to perform greater (or less than) comparisons between two
+// v2i64 values, the comparison is performed on v4i32 values:
+//
+// (A1, A2) -> (A1Lower, A1Upper, A2Lower, A2Upper)
+// (B1, B2) -> (B1Lower, B1Upper, B2Lower, B2Upper)
+// (Result1, Result2) -> (Result1, Result1, Result2, Result2)
+//
+// where,
+//
+// ResultX = (GtLowerX & EqUpperX) | (GtUpperX)
+// GtLowerX = AXLower OP BXLower
+// GtUpperX = AXUpper OP BXUpper
+// EqUpperX = AXUpper EQ BXUpper
+//
+// Upper and lower values are obtained through vector shuffles.
+Instruction *
+InstCombinerImpl::foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I) {
+ if (I.getOpcode() != Instruction::Or)
+ return nullptr;
+
+ auto *OldVecType = dyn_cast<VectorType>(I.getType());
+
+ if (!OldVecType || OldVecType->isScalableTy() ||
+ !OldVecType->getElementType()->isIntegerTy(32) ||
+ OldVecType->getElementCount().getFixedValue() != 4)
+ return nullptr;
+
+ Value *A, *B, *Greater1, *Greater2, *Greater;
+ CmpPredicate PredEq;
+ SmallVector<int> MaskLower = {0, 0, 2, 2};
+ SmallVector<int> MaskUpper = {1, 1, 3, 3};
+
+ auto GreaterLower = m_SExtOrSelf(m_Shuffle(
+ m_SExtOrSelf(m_Value(Greater1)), m_Poison(), m_SpecificMask(MaskLower)));
+ auto GreaterUpper = m_SExtOrSelf(m_Shuffle(
+ m_SExtOrSelf(m_Value(Greater2)), m_Poison(), m_SpecificMask(MaskUpper)));
+ auto EqUpper = m_SExtOrSelf(
+ m_Shuffle(m_SExtOrSelf(m_c_ICmp(PredEq, m_Value(A), m_Value(B))),
+ m_Poison(), m_SpecificMask(MaskUpper)));
+
+ if (!match(&I, m_c_Or(m_c_And(GreaterLower, EqUpper), GreaterUpper)) ||
+ Greater1 != Greater2 || PredEq != ICmpInst::ICMP_EQ)
+ return nullptr;
+
+ Greater = Greater1;
+
+ auto *Zero = ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 0);
+ auto *Flip =
+ ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 0x80000000);
+ auto *FlipLower = ConstantVector::get({Flip, Zero, Flip, Zero});
+ auto *FlipAll = ConstantVector::get({Flip, Flip, Flip, Flip});
+
+ CmpPredicate PredGt;
+ auto UGt = m_c_ICmp(PredGt, m_Specific(A), m_Specific(B));
+ auto UGtAlt = m_c_ICmp(PredGt, m_c_Xor(m_Specific(A), m_Specific(FlipAll)),
+ m_c_Xor(m_Specific(B), m_Specific(FlipAll)));
+ auto SGt = m_c_ICmp(PredGt, m_c_Xor(m_Specific(A), m_Specific(FlipLower)),
+ m_c_Xor(m_Specific(B), m_Specific(FlipLower)));
+
+ if (!(match(Greater, UGt) &&
+ (PredGt == ICmpInst::ICMP_UGT || PredGt == ICmpInst::ICMP_ULT)) &&
+ !((match(Greater, SGt) || match(Greater, UGtAlt)) &&
+ (PredGt == ICmpInst::ICMP_SGT || PredGt == ICmpInst::ICMP_SLT)))
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "Found V2CmpGt using V4CmpGt pattern" << '\n');
+
+ // Perform folding
+ auto *NewElementType = IntegerType::get(I.getContext(), 64);
+ auto *NewVecType = VectorType::get(NewElementType, 2, false);
+ auto *BitCastA = Builder.CreateBitCast(A, NewVecType);
+ auto *BitCastB = Builder.CreateBitCast(B, NewVecType);
+ auto *Cmp = Builder.CreateICmp(PredGt, BitCastA, BitCastB);
+ auto *SExt = Builder.CreateSExt(Cmp, NewVecType);
+ auto *BitCastCmp = Builder.CreateBitCast(SExt, OldVecType);
+
+ return replaceInstUsesWith(I, BitCastCmp);
+}
+
static std::optional<std::pair<Value *, Value *>>
matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) {
if (LHS->getParent() != RHS->getParent())
diff --git a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
index 3442b004eff13..1ca2c71f988f3 100644
--- a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
+++ b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
@@ -4,17 +4,11 @@
define <4 x i32> @alt_cmpgt_epi64(<4 x i32> noundef %a, <4 x i32> noundef %b) {
; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64(
; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT: [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
-; CHECK-NEXT: [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
-; CHECK-NEXT: [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_A]], [[XOR_B]]
-; CHECK-NEXT: [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
-; CHECK-NEXT: [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-; CHECK-NEXT: [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT: [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
-; CHECK-NEXT: [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
-; CHECK-NEXT: [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: [[OR:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[OR]]
;
%xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
@@ -34,15 +28,11 @@ define <4 x i32> @alt_cmpgt_epi64(<4 x i32> noundef %a, <4 x i32> noundef %b) {
define <4 x i32> @alt_cmpgt_epu64_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epu64_1(
; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT: [[GT:%.*]] = icmp ugt <4 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
-; CHECK-NEXT: [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-; CHECK-NEXT: [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT: [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
-; CHECK-NEXT: [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
-; CHECK-NEXT: [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: [[OR:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[OR]]
;
%gt = icmp ugt <4 x i32> %a, %b
@@ -60,15 +50,11 @@ define <4 x i32> @alt_cmpgt_epu64_1(<4 x i32> noundef %a, <4 x i32> noundef %b)
define <4 x i32> @alt_cmpgt_epu64_2(<4 x i32> noundef %a, <4 x i32> noundef %b) {
; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epu64_2(
; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT: [[GT:%.*]] = icmp ugt <4 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
-; CHECK-NEXT: [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-; CHECK-NEXT: [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT: [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
-; CHECK-NEXT: [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
-; CHECK-NEXT: [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: [[OR:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[OR]]
;
%xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
>From 5af139970d24bccc8cf266543cced2d34d01543f Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Fri, 6 Mar 2026 19:28:08 +0700
Subject: [PATCH 10/13] Remove multi-use test and improve documentation
comments
---
.../InstCombine/InstructionCombining.cpp | 81 ++++++-------------
.../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll | 57 -------------
2 files changed, 26 insertions(+), 112 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index c2ef56ad67ce5..f56883e57740c 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1296,46 +1296,32 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) {
return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
}
-// Prior to SSE4.1, performing equality comparison on v2i64 types require a
-// comparison on v4i32 types using the following pattern:
-//
-// %3 = icmp eq <4 x i32> %1, %2
-//
-// %4 = sext <4 x i1> %3 to <4 x i32>
-//
-// %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> <i32 1, i32 0,
-// i32 3, i32 2>
-//
-// %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> zeroinitializer
-//
-// OR
-//
-// %6 = and <4 x i32> %sext, %shuffle
-//
-// We should detect such patterns and fold them into:
-//
-// %3 = bitcast <4 x i32> %1 to <2 x i64>
+// Prior to SSE4.1, to perform equality comparisons between two
+// v2i64 values, the comparison is performed on v4i32 values:
//
-// %4 = bitcast <4 x i32> %2 to <2 x i64>
+// (A1, A2) -> (A1Lower, A1Upper, A2Lower, A2Upper)
+// (B1, B2) -> (B1Lower, B1Upper, B2Lower, B2Upper)
+// (Result1, Result2) -> (Result1, Result1, Result2, Result2)
//
-// %5 = icmp eq <2 x i64> %3, %4
+// where,
//
-// %6 = bitcast <2 x i64> %5 to <4 x i32>
+// ResultX = EqLowerX & EqUpperX
+// EqLowerX = AXLower == BXLower
+// EqUpperX = AXUpper == BXUpper
//
+// Bitwise AND between the upper and lower parts can be achieved by performing
+// the operation between the original and shuffled equality vector.
Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
- Value *Equal, *Shuffle, *L, *R;
+ Value *L, *R;
CmpPredicate Pred;
SmallVector<int> Mask = {1, 0, 3, 2};
// Check pattern existance
- if (!match(&I,
- m_CombineOr(m_c_And(m_SExt(m_Value(Equal)),
- m_SExtOrSelf(m_Value(Shuffle))),
- m_Select(m_Value(Equal),
- m_SExtOrSelf(m_Value(Shuffle)), m_Zero()))) ||
- !match(Shuffle, m_Shuffle(m_SExtOrSelf(m_Specific(Equal)), m_Poison(),
- m_SpecificMask(Mask))) ||
- !match(Equal, m_ICmp(Pred, m_Value(L), m_Value(R))) ||
+ auto Equal = m_ICmp(Pred, m_Value(L), m_Value(R));
+ auto Shuffle = m_SExtOrSelf(
+ m_Shuffle(m_SExtOrSelf(Equal), m_Poison(), m_SpecificMask(Mask)));
+ if (!match(&I, m_CombineOr(m_c_And(m_SExt(Equal), Shuffle),
+ m_Select(Equal, Shuffle, m_Zero()))) ||
Pred != CmpInst::ICMP_EQ)
return nullptr;
@@ -1343,34 +1329,15 @@ Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
auto *OldVecType = cast<VectorType>(L->getType());
if (OldVecType->isScalableTy() ||
- !OldVecType->getElementType()->isIntegerTy())
- return nullptr;
-
- int ElementCount = OldVecType->getElementCount().getFixedValue();
- int ElementBitWidth = OldVecType->getElementType()->getIntegerBitWidth();
-
- if (ElementCount != 4 || ElementBitWidth != 32)
- return nullptr;
-
- // Check uses outside pattern
- if (!Shuffle->hasOneUse())
+ !OldVecType->getElementType()->isIntegerTy(32) ||
+ OldVecType->getElementCount().getFixedValue() != 4)
return nullptr;
- for (auto *U : Equal->users()) {
- if (U == &I || U == Shuffle)
- continue;
- if (!isa<llvm::CastInst>(U))
- return nullptr;
- for (auto *U : U->users())
- if (U != &I && U != Shuffle)
- return nullptr;
- }
-
LLVM_DEBUG(dbgs() << "IC: Folding equal-shuffle-and pattern" << '\n');
// Perform folding
- auto *NewElementType = IntegerType::get(I.getContext(), ElementBitWidth * 2);
- auto *NewVecType = VectorType::get(NewElementType, ElementCount / 2, false);
+ auto *NewElementType = IntegerType::get(I.getContext(), 64);
+ auto *NewVecType = VectorType::get(NewElementType, 2, false);
auto *BitCastL = Builder.CreateBitCast(L, NewVecType);
auto *BitCastR = Builder.CreateBitCast(R, NewVecType);
auto *Cmp = Builder.CreateICmp(Pred, BitCastL, BitCastR);
@@ -1394,7 +1361,11 @@ Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
// GtUpperX = AXUpper OP BXUpper
// EqUpperX = AXUpper EQ BXUpper
//
-// Upper and lower values are obtained through vector shuffles.
+// Upper and lower parts are obtained through vector shuffles.
+//
+// Note that comparisons of the lower parts are always unsigned comparisons
+// regardless of the resulting signedness. Also note that unsigned comparison
+// can be derived from signed comparison by flipping the MSB of both operands.
Instruction *
InstCombinerImpl::foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I) {
if (I.getOpcode() != Instruction::Or)
diff --git a/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
index ff23a9d4e277c..6300ff30f103c 100644
--- a/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -52,63 +52,6 @@ define <4 x i32> @cmpeq_epi64_commutated_and(<4 x i32> noundef %a, <4 x i32> nou
ret <4 x i32> %and
}
-declare void @use.v4i1(<4 x i1>)
-declare void @use.v4i32(<4 x i32>)
-
-define <4 x i32> @cmpeq_epi64_multi_use_cmp(<4 x i32> noundef %a, <4 x i32> noundef %b) {
-; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_multi_use_cmp(
-; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT: call void @use.v4i1(<4 x i1> [[CMP]])
-; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[AND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
-; CHECK-NEXT: ret <4 x i32> [[AND]]
-;
- %cmp = icmp eq <4 x i32> %a, %b
- call void @use.v4i1(<4 x i1> %cmp)
- %sext = sext <4 x i1> %cmp to <4 x i32>
- %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
- %and = and <4 x i32> %shuffle, %sext
- ret <4 x i32> %and
-}
-
-define <4 x i32> @cmpeq_epi64_multi_use_sext(<4 x i32> noundef %a, <4 x i32> noundef %b) {
-; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_multi_use_sext(
-; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-; CHECK-NEXT: call void @use.v4i32(<4 x i32> [[SEXT]])
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[AND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
-; CHECK-NEXT: ret <4 x i32> [[AND]]
-;
- %cmp = icmp eq <4 x i32> %a, %b
- %sext = sext <4 x i1> %cmp to <4 x i32>
- call void @use.v4i32(<4 x i32> %sext)
- %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
- %and = and <4 x i32> %shuffle, %sext
- ret <4 x i32> %and
-}
-
-define <4 x i32> @cmpeq_epi64_multi_use_shuffle(<4 x i32> noundef %a, <4 x i32> noundef %b) {
-; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_multi_use_shuffle(
-; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: call void @use.v4i32(<4 x i32> [[SHUFFLE]])
-; CHECK-NEXT: [[AND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
-; CHECK-NEXT: ret <4 x i32> [[AND]]
-;
- %cmp = icmp eq <4 x i32> %a, %b
- %sext = sext <4 x i1> %cmp to <4 x i32>
- %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
- call void @use.v4i32(<4 x i32> %shuffle)
- %and = and <4 x i32> %shuffle, %sext
- ret <4 x i32> %and
-}
-
define <4 x i32> @cmpeq_epi64_select_neg_0(<4 x i32> noundef %a, <4 x i32> noundef %b) {
; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select_neg_0(
; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
>From 834f12ce4ba7934e5efe8ab5d301e5349ad48010 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Fri, 6 Mar 2026 23:33:21 +0700
Subject: [PATCH 11/13] Add commutated and negative test cases to fold v2i64
cmpgt using v4i32 pattern test
---
.../fold-v2i64-cmpgt-using-v4i32-pattern.ll | 82 +++++++++++++++++++
1 file changed, 82 insertions(+)
diff --git a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
index 1ca2c71f988f3..bd310db1a35a8 100644
--- a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
+++ b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
@@ -70,3 +70,85 @@ define <4 x i32> @alt_cmpgt_epu64_2(<4 x i32> noundef %a, <4 x i32> noundef %b)
%or = or <4 x i32> %and, %gt.1
ret <4 x i32> %or
}
+
+define <4 x i32> @alt_cmpgt_epi64_commutated_gt(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_commutated_gt(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: [[OR:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[OR]]
+;
+ %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+ %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+ %gt = icmp sgt <4 x i32> %xor.b, %xor.a
+ %sext.gt = sext <4 x i1> %gt to <4 x i32>
+ %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %eq = icmp eq <4 x i32> %a, %b
+ %sext.eq = sext <4 x i1> %eq to <4 x i32>
+ %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %and = and <4 x i32> %gt.0, %eq.0
+ %or = or <4 x i32> %and, %gt.1
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @alt_cmpgt_epi64_neg_0(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_neg_0(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT: [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT: [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_B]], [[XOR_A]]
+; CHECK-NEXT: [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT: [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT: [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i1> [[GT]], [[EQ]]
+; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[AND:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT: [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT: ret <4 x i32> [[OR]]
+;
+ %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+ %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+ %gt = icmp sgt <4 x i32> %xor.b, %xor.a
+ %sext.gt = sext <4 x i1> %gt to <4 x i32>
+ %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %eq = icmp eq <4 x i32> %a, %b
+ %sext.eq = sext <4 x i1> %eq to <4 x i32>
+ %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %and = and <4 x i32> %gt.0, %eq.0
+ %or = or <4 x i32> %and, %gt.1
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @alt_cmpgt_epi64_neg_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_neg_1(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT: [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT: [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_B]], [[XOR_A]]
+; CHECK-NEXT: [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT: [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT: [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i1> [[GT]], [[EQ]]
+; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[AND:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT: [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_0]]
+; CHECK-NEXT: ret <4 x i32> [[OR]]
+;
+ %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+ %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+ %gt = icmp sgt <4 x i32> %xor.b, %xor.a
+ %sext.gt = sext <4 x i1> %gt to <4 x i32>
+ %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %eq = icmp eq <4 x i32> %a, %b
+ %sext.eq = sext <4 x i1> %eq to <4 x i32>
+ %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %and = and <4 x i32> %gt.1, %eq.0
+ %or = or <4 x i32> %and, %gt.0
+ ret <4 x i32> %or
+}
>From 0f1a660354ad125079d20eb431091ab383a4d8a5 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Sat, 7 Mar 2026 00:48:50 +0700
Subject: [PATCH 12/13] Handle and transformed into select in folding v2i64
cmpgt using v4i32 pattern
---
.../InstCombine/InstructionCombining.cpp | 8 +++++--
.../fold-v2i64-cmpgt-using-v4i32-pattern.ll | 23 +++++++++++++++++++
2 files changed, 29 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index f56883e57740c..d08c67a518fc4 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1387,11 +1387,15 @@ InstCombinerImpl::foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I) {
m_SExtOrSelf(m_Value(Greater1)), m_Poison(), m_SpecificMask(MaskLower)));
auto GreaterUpper = m_SExtOrSelf(m_Shuffle(
m_SExtOrSelf(m_Value(Greater2)), m_Poison(), m_SpecificMask(MaskUpper)));
- auto EqUpper = m_SExtOrSelf(
+ auto EqUpper = m_Shuffle(m_c_ICmp(PredEq, m_Value(A), m_Value(B)), m_Poison(),
+ m_SpecificMask(MaskUpper));
+ auto EqUpperSExt = m_SExtOrSelf(
m_Shuffle(m_SExtOrSelf(m_c_ICmp(PredEq, m_Value(A), m_Value(B))),
m_Poison(), m_SpecificMask(MaskUpper)));
- if (!match(&I, m_c_Or(m_c_And(GreaterLower, EqUpper), GreaterUpper)) ||
+ if (!match(&I, m_c_Or(m_CombineOr(m_c_And(GreaterLower, EqUpperSExt),
+ m_Select(EqUpper, GreaterLower, m_Zero())),
+ GreaterUpper)) ||
Greater1 != Greater2 || PredEq != ICmpInst::ICMP_EQ)
return nullptr;
diff --git a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
index bd310db1a35a8..8d1b8e40655c2 100644
--- a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
+++ b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
@@ -25,6 +25,29 @@ define <4 x i32> @alt_cmpgt_epi64(<4 x i32> noundef %a, <4 x i32> noundef %b) {
ret <4 x i32> %or
}
+define <4 x i32> @alt_cmpgt_epi64_select(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_select(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: [[OR:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[OR]]
+;
+ %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+ %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+ %gt = icmp sgt <4 x i32> %xor.a, %xor.b
+ %sext.gt = sext <4 x i1> %gt to <4 x i32>
+ %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %eq = icmp eq <4 x i32> %a, %b
+ %eq.0 = shufflevector <4 x i1> %eq, <4 x i1> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %select = select <4 x i1> %eq.0, <4 x i32> %gt.0, <4 x i32> zeroinitializer
+ %or = or <4 x i32> %select, %gt.1
+ ret <4 x i32> %or
+}
+
define <4 x i32> @alt_cmpgt_epu64_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epu64_1(
; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
>From 07cb34afbccf6aad25069cbd374003c9f176cc89 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Sat, 7 Mar 2026 01:10:00 +0700
Subject: [PATCH 13/13] Check type upfront in folding equal-shuffle-and pattern
---
.../InstCombine/InstructionCombining.cpp | 25 +++++++++----------
1 file changed, 12 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index d08c67a518fc4..abe6d96d0567d 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1312,11 +1312,19 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) {
// Bitwise AND between the upper and lower parts can be achieved by performing
// the operation between the original and shuffled equality vector.
Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
+ // Check argument type
+ auto *OldVecType = dyn_cast<VectorType>(I.getType());
+
+ if (!OldVecType || OldVecType->isScalableTy() ||
+ !OldVecType->getElementType()->isIntegerTy(32) ||
+ OldVecType->getElementCount().getFixedValue() != 4)
+ return nullptr;
+
+ // Check pattern existence
Value *L, *R;
CmpPredicate Pred;
SmallVector<int> Mask = {1, 0, 3, 2};
- // Check pattern existance
auto Equal = m_ICmp(Pred, m_Value(L), m_Value(R));
auto Shuffle = m_SExtOrSelf(
m_Shuffle(m_SExtOrSelf(Equal), m_Poison(), m_SpecificMask(Mask)));
@@ -1325,14 +1333,6 @@ Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
Pred != CmpInst::ICMP_EQ)
return nullptr;
- // Check argument type
- auto *OldVecType = cast<VectorType>(L->getType());
-
- if (OldVecType->isScalableTy() ||
- !OldVecType->getElementType()->isIntegerTy(32) ||
- OldVecType->getElementCount().getFixedValue() != 4)
- return nullptr;
-
LLVM_DEBUG(dbgs() << "IC: Folding equal-shuffle-and pattern" << '\n');
// Perform folding
@@ -1368,9 +1368,7 @@ Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
// can be derived from signed comparison by flipping the MSB of both operands.
Instruction *
InstCombinerImpl::foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I) {
- if (I.getOpcode() != Instruction::Or)
- return nullptr;
-
+ // Check argument type
auto *OldVecType = dyn_cast<VectorType>(I.getType());
if (!OldVecType || OldVecType->isScalableTy() ||
@@ -1378,6 +1376,7 @@ InstCombinerImpl::foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I) {
OldVecType->getElementCount().getFixedValue() != 4)
return nullptr;
+ // Check pattern existence
Value *A, *B, *Greater1, *Greater2, *Greater;
CmpPredicate PredEq;
SmallVector<int> MaskLower = {0, 0, 2, 2};
@@ -1420,7 +1419,7 @@ InstCombinerImpl::foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I) {
(PredGt == ICmpInst::ICMP_SGT || PredGt == ICmpInst::ICMP_SLT)))
return nullptr;
- LLVM_DEBUG(dbgs() << "Found V2CmpGt using V4CmpGt pattern" << '\n');
+ LLVM_DEBUG(dbgs() << "IC: Folding V2CmpGt using V4CmpGt pattern" << '\n');
// Perform folding
auto *NewElementType = IntegerType::get(I.getContext(), 64);
More information about the llvm-commits
mailing list