[llvm] [InstCombine] Fold patterns that use <2N x iM> type for comparisons on <N x i2M> vector types (PR #184328)

Fuad Ismail via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 10 21:01:34 PDT 2026


https://github.com/fuad1502 updated https://github.com/llvm/llvm-project/pull/184328

>From 5c444710bf7fd07d8405aa8b6802cff845f9994a Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Tue, 3 Mar 2026 07:15:43 +0700
Subject: [PATCH 01/19] Add lit test for folding the v4i32 equals-shuffle-and
 pattern to a v2i64 equals

---
 .../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll  | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll

diff --git a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
new file mode 100644
index 0000000000000..3c1b98af193f5
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+
+define <4 x i32> @cmpeq_epi64_select(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[SELECT]]
+;
+  %cmp = icmp eq <4 x i32> %a, %b
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %select = select <4 x i1> %cmp, <4 x i32> %shuffle, <4 x i32> zeroinitializer
+  ret <4 x i32> %select
+}
+
+define <4 x i32> @cmpeq_epi64_and(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_and(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[SEXT]], [[SHUFFLE]]
+; CHECK-NEXT:    ret <4 x i32> [[AND]]
+;
+  %cmp = icmp eq <4 x i32> %a, %b
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %and = and <4 x i32> %sext, %shuffle
+  ret <4 x i32> %and
+}

>From b8f17f37a75293051498bac60b03aa40c0e3c365 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Tue, 3 Mar 2026 12:08:15 +0700
Subject: [PATCH 02/19] Apply folding for v4i32 equals-shuffle-and pattern

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 73 +++++++++++++++++++
 .../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll  | 18 +++--
 2 files changed, 83 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 1f37e435b8080..ee29ce690a435 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -27,12 +27,15 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
@@ -153,6 +156,7 @@ class VectorCombine {
   bool foldEquivalentReductionCmp(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
+  bool foldEqualShuffleAnd(Instruction &I);
   bool shrinkType(Instruction &I);
   bool shrinkLoadForShuffles(Instruction &I);
   bool shrinkPhiOfShuffles(Instruction &I);
@@ -5435,6 +5439,69 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
   return true;
 }
 
+// Prior to SSE4.1, performing equality comparison on v2i64 types require a
+// comparison on v4i32 types using the following pattern:
+//
+// ...
+// %3 = icmp eq <4 x i32> %1, %2
+// %4 = sext <4 x i1> %3 to <4 x i32>
+// %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> <i32 1, i32 0,
+// i32 3, i32 2> %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32>
+// zeroinitializer
+// ...
+//
+// We should detect such patterns and fold them to:
+//
+// %3 = bitcast <4 x i32> %1 to <2 x i64>
+// %4 = bitcast <4 x i32> %2 to <2 x i64>
+// %5 = icmp eq <2 x i64> %3, %4
+// %6 = bitcast <2 x i64> %5 to <4 x i32>
+//
+bool VectorCombine::foldEqualShuffleAnd(Instruction &I) {
+  // Check pattern existance
+  Value *L, *R;
+  CmpPredicate Pred;
+
+  auto Equal = m_ICmp(Pred, m_Value(L), m_Value(R));
+  SmallVector<int> Mask = {1, 0, 3, 2};
+  auto Shuffle =
+      m_CombineOr(m_SExt(m_Shuffle(Equal, m_Poison(), m_SpecificMask(Mask))),
+                  m_Shuffle(m_SExt(Equal), m_Poison(), m_SpecificMask(Mask)));
+
+  if (!match(&I, m_CombineOr(m_And(m_SExt(Equal), Shuffle),
+                             m_Select(Equal, Shuffle, m_ZeroInt()))) ||
+      !ICmpInst::isEquality(Pred) || !L->getType()->isVectorTy())
+    return false;
+
+  auto *OldVecType = cast<VectorType>(L->getType());
+
+  if (OldVecType->isScalableTy() ||
+      !OldVecType->getElementType()->isIntegerTy())
+    return false;
+
+  int ElementCount = OldVecType->getElementCount().getFixedValue();
+  int ElementBitWidth = OldVecType->getElementType()->getIntegerBitWidth();
+
+  if (ElementCount != 4 || ElementBitWidth != 32)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "VC: Found equal-shuffle-and pattern" << '\n');
+
+  // Perform folding
+  IRBuilder Builder(&I);
+  auto *NewElementType = IntegerType::get(I.getContext(), ElementBitWidth * 2);
+  auto *NewVecType = VectorType::get(NewElementType, ElementCount / 2, false);
+  auto *BitCastL = Builder.CreateBitCast(L, NewVecType);
+  auto *BitCastR = Builder.CreateBitCast(R, NewVecType);
+  auto *Cmp = Builder.CreateICmp(Pred, BitCastL, BitCastR);
+  auto *SExt = Builder.CreateSExt(Cmp, NewVecType);
+  auto *BitCastCmp = Builder.CreateBitCast(SExt, OldVecType);
+
+  replaceValue(I, *BitCastCmp);
+
+  return false;
+}
+
 // Attempt to shrink loads that are only used by shufflevector instructions.
 bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
   auto *OldLoad = dyn_cast<LoadInst>(&I);
@@ -5777,11 +5844,17 @@ bool VectorCombine::run() {
           return true;
         if (foldBitOpOfCastConstant(I))
           return true;
+        if (foldEqualShuffleAnd(I))
+          return true;
         break;
       case Instruction::PHI:
         if (shrinkPhiOfShuffles(I))
           return true;
         break;
+      case Instruction::Select:
+        if (foldEqualShuffleAnd(I))
+          return true;
+        break;
       default:
         if (shrinkType(I))
           return true;
diff --git a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
index 3c1b98af193f5..42f3222ae3d27 100644
--- a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -4,10 +4,11 @@
 define <4 x i32> @cmpeq_epi64_select(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-; CHECK-NEXT:    [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[SELECT:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[SELECT]]
 ;
   %cmp = icmp eq <4 x i32> %a, %b
@@ -20,10 +21,11 @@ define <4 x i32> @cmpeq_epi64_select(<4 x i32> noundef %a, <4 x i32> noundef %b)
 define <4 x i32> @cmpeq_epi64_and(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_and(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[SEXT]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[AND:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[AND]]
 ;
   %cmp = icmp eq <4 x i32> %a, %b

>From 10ad4ba00e85fadba6255e53594144aa8973f068 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Tue, 3 Mar 2026 13:13:33 +0700
Subject: [PATCH 03/19] Handle commuted 'and' instruction

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp |  2 +-
 .../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll    | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index ee29ce690a435..13559610e37a2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -5468,7 +5468,7 @@ bool VectorCombine::foldEqualShuffleAnd(Instruction &I) {
       m_CombineOr(m_SExt(m_Shuffle(Equal, m_Poison(), m_SpecificMask(Mask))),
                   m_Shuffle(m_SExt(Equal), m_Poison(), m_SpecificMask(Mask)));
 
-  if (!match(&I, m_CombineOr(m_And(m_SExt(Equal), Shuffle),
+  if (!match(&I, m_CombineOr(m_c_And(m_SExt(Equal), Shuffle),
                              m_Select(Equal, Shuffle, m_ZeroInt()))) ||
       !ICmpInst::isEquality(Pred) || !L->getType()->isVectorTy())
     return false;
diff --git a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
index 42f3222ae3d27..983a5a6708609 100644
--- a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -34,3 +34,20 @@ define <4 x i32> @cmpeq_epi64_and(<4 x i32> noundef %a, <4 x i32> noundef %b) {
   %and = and <4 x i32> %sext, %shuffle
   ret <4 x i32> %and
 }
+
+define <4 x i32> @cmpeq_epi64_commutated_and(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_commutated_and(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[AND:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[AND]]
+;
+  %cmp = icmp eq <4 x i32> %a, %b
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %and = and <4 x i32> %shuffle, %sext
+  ret <4 x i32> %and
+}

>From 79bbff7d352bfd6f49990f76b2e04621695ab128 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Tue, 3 Mar 2026 16:44:27 +0700
Subject: [PATCH 04/19] Don't fold when intermediate instructions have uses
 outside pattern

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 53 ++++++++++++-----
 .../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll  | 57 +++++++++++++++++++
 2 files changed, 95 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 13559610e37a2..805aedfc61c04 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -5442,37 +5442,46 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
 // Prior to SSE4.1, performing equality comparison on v2i64 types require a
 // comparison on v4i32 types using the following pattern:
 //
-// ...
 // %3 = icmp eq <4 x i32> %1, %2
+//
 // %4 = sext <4 x i1> %3 to <4 x i32>
+//
 // %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> <i32 1, i32 0,
-// i32 3, i32 2> %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32>
-// zeroinitializer
-// ...
+// i32 3, i32 2>
+//
+// %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> zeroinitializer
+//
+// OR
+//
+// %6 = and <4 x i32> %sext, %shuffle
 //
-// We should detect such patterns and fold them to:
+// We should detect such patterns and fold them into:
 //
 // %3 = bitcast <4 x i32> %1 to <2 x i64>
+//
 // %4 = bitcast <4 x i32> %2 to <2 x i64>
+//
 // %5 = icmp eq <2 x i64> %3, %4
+//
 // %6 = bitcast <2 x i64> %5 to <4 x i32>
 //
 bool VectorCombine::foldEqualShuffleAnd(Instruction &I) {
-  // Check pattern existance
-  Value *L, *R;
+  Value *Equal, *Shuffle, *L, *R;
   CmpPredicate Pred;
-
-  auto Equal = m_ICmp(Pred, m_Value(L), m_Value(R));
   SmallVector<int> Mask = {1, 0, 3, 2};
-  auto Shuffle =
-      m_CombineOr(m_SExt(m_Shuffle(Equal, m_Poison(), m_SpecificMask(Mask))),
-                  m_Shuffle(m_SExt(Equal), m_Poison(), m_SpecificMask(Mask)));
 
-  if (!match(&I, m_CombineOr(m_c_And(m_SExt(Equal), Shuffle),
-                             m_Select(Equal, Shuffle, m_ZeroInt()))) ||
-      !ICmpInst::isEquality(Pred) || !L->getType()->isVectorTy())
+  // Check pattern existance
+  if (!match(&I, m_CombineOr(m_c_And(m_SExt(m_Value(Equal)),
+                                     m_SExtOrSelf(m_Value(Shuffle))),
+                             m_Select(m_Value(Equal),
+                                      m_SExtOrSelf(m_Value(Shuffle)),
+                                      m_ZeroInt()))) ||
+      !match(Shuffle, m_Shuffle(m_SExtOrSelf(m_Specific(Equal)), m_Poison(),
+                                m_SpecificMask(Mask))) ||
+      !match(Equal, m_ICmp(Pred, m_Value(L), m_Value(R))))
     return false;
 
+  // Check argument type
   auto *OldVecType = cast<VectorType>(L->getType());
 
   if (OldVecType->isScalableTy() ||
@@ -5485,6 +5494,20 @@ bool VectorCombine::foldEqualShuffleAnd(Instruction &I) {
   if (ElementCount != 4 || ElementBitWidth != 32)
     return false;
 
+  // Check uses outside pattern
+  if (!Shuffle->hasOneUse())
+    return false;
+
+  for (auto *U : Equal->users()) {
+    if (U == &I || U == Shuffle)
+      continue;
+    if (!isa<llvm::CastInst>(U))
+      return false;
+    for (auto *U : U->users())
+      if (U != &I && U != Shuffle)
+        return false;
+  }
+
   LLVM_DEBUG(dbgs() << "VC: Found equal-shuffle-and pattern" << '\n');
 
   // Perform folding
diff --git a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
index 983a5a6708609..2d7e72d359973 100644
--- a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -51,3 +51,60 @@ define <4 x i32> @cmpeq_epi64_commutated_and(<4 x i32> noundef %a, <4 x i32> nou
   %and = and <4 x i32> %shuffle, %sext
   ret <4 x i32> %and
 }
+
+declare void @use.v4i1(<4 x i1>)
+declare void @use.v4i32(<4 x i32>)
+
+define <4 x i32> @cmpeq_epi64_multi_use_cmp(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_multi_use_cmp(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    call void @use.v4i1(<4 x i1> [[CMP]])
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
+; CHECK-NEXT:    ret <4 x i32> [[AND]]
+;
+  %cmp = icmp eq <4 x i32> %a, %b
+  call void @use.v4i1(<4 x i1> %cmp)
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %and = and <4 x i32> %shuffle, %sext
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @cmpeq_epi64_multi_use_sext(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_multi_use_sext(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT:    call void @use.v4i32(<4 x i32> [[SEXT]])
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
+; CHECK-NEXT:    ret <4 x i32> [[AND]]
+;
+  %cmp = icmp eq <4 x i32> %a, %b
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  call void @use.v4i32(<4 x i32> %sext)
+  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %and = and <4 x i32> %shuffle, %sext
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @cmpeq_epi64_multi_use_shuffle(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_multi_use_shuffle(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    call void @use.v4i32(<4 x i32> [[SHUFFLE]])
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
+; CHECK-NEXT:    ret <4 x i32> [[AND]]
+;
+  %cmp = icmp eq <4 x i32> %a, %b
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  call void @use.v4i32(<4 x i32> %shuffle)
+  %and = and <4 x i32> %shuffle, %sext
+  ret <4 x i32> %and
+}

>From dd0432152c35d705e03983a55f5fae2758699564 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Tue, 3 Mar 2026 18:10:25 +0700
Subject: [PATCH 05/19] Add negative test cases and add an icmp predicate
 check

---
 .../Transforms/Vectorize/VectorCombine.cpp    |  3 +-
 .../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll  | 64 +++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 805aedfc61c04..fd2ca3ef7c901 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -5478,7 +5478,8 @@ bool VectorCombine::foldEqualShuffleAnd(Instruction &I) {
                                       m_ZeroInt()))) ||
       !match(Shuffle, m_Shuffle(m_SExtOrSelf(m_Specific(Equal)), m_Poison(),
                                 m_SpecificMask(Mask))) ||
-      !match(Equal, m_ICmp(Pred, m_Value(L), m_Value(R))))
+      !match(Equal, m_ICmp(Pred, m_Value(L), m_Value(R))) ||
+      !CmpInst::isEquality(Pred))
     return false;
 
   // Check argument type
diff --git a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
index 2d7e72d359973..4e13afd360673 100644
--- a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -108,3 +108,67 @@ define <4 x i32> @cmpeq_epi64_multi_use_shuffle(<4 x i32> noundef %a, <4 x i32>
   %and = and <4 x i32> %shuffle, %sext
   ret <4 x i32> %and
 }
+
+define <4 x i32> @cmpeq_epi64_select_neg_0(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select_neg_0(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[SELECT]]
+;
+  %cmp = icmp sgt <4 x i32> %a, %b
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %select = select <4 x i1> %cmp, <4 x i32> %shuffle, <4 x i32> zeroinitializer
+  ret <4 x i32> %select
+}
+
+define <4 x i32> @cmpeq_epi64_and_neg_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_and_neg_1(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT:%.*]] = zext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[SEXT]], [[SHUFFLE]]
+; CHECK-NEXT:    ret <4 x i32> [[AND]]
+;
+  %cmp = icmp eq <4 x i32> %a, %b
+  %sext = zext <4 x i1> %cmp to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %and = and <4 x i32> %sext, %shuffle
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @cmpeq_epi64_select_neg_2(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select_neg_2(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[SELECT]]
+;
+  %cmp = icmp eq <4 x i32> %a, %b
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %select = select <4 x i1> %cmp, <4 x i32> %shuffle, <4 x i32> zeroinitializer
+  ret <4 x i32> %select
+}
+
+define <4 x i32> @cmpeq_epi64_select_neg_3(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select_neg_3(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> [[SEXT]]
+; CHECK-NEXT:    ret <4 x i32> [[SELECT]]
+;
+  %cmp = icmp eq <4 x i32> %a, %b
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %select = select <4 x i1> %cmp, <4 x i32> %shuffle, <4 x i32> %sext
+  ret <4 x i32> %select
+}

>From 32b31c2f728bfbbfcc982db0550d1f5f289dc71f Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Tue, 3 Mar 2026 19:03:24 +0700
Subject: [PATCH 06/19] Remove unnecessary additional headers

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fd2ca3ef7c901..9832b396bde71 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -27,15 +27,12 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"

>From 5f46a5f41a65ab7759110db98c5797e61ad5b4d0 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Wed, 4 Mar 2026 18:06:25 +0700
Subject: [PATCH 07/19] Move v4i32 eq-shuffle-and folding to InstCombine

---
 .../InstCombine/InstCombineAndOrXor.cpp       |  3 +
 .../InstCombine/InstCombineInternal.h         |  2 +
 .../InstCombine/InstCombineSelect.cpp         |  3 +
 .../InstCombine/InstructionCombining.cpp      | 84 +++++++++++++++++
 .../Transforms/Vectorize/VectorCombine.cpp    | 94 -------------------
 .../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll  | 28 +++---
 6 files changed, 106 insertions(+), 108 deletions(-)
 rename llvm/test/Transforms/{VectorCombine => InstCombine}/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll (88%)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 073f094639fa0..30c1e8e8aca73 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2898,6 +2898,9 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
                                       /*SimplifyOnly*/ false, *this))
     return BinaryOperator::CreateAnd(Op0, V);
 
+  if (auto *Folded = foldV4EqualShuffleAndToV2Equal(I))
+    return Folded;
+
   return nullptr;
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 2e7758e952eaf..fed88cc84f46e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -484,6 +484,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
     return Sel;
   }
 
+  Instruction *foldV4EqualShuffleAndToV2Equal(Instruction &I);
+
 public:
   /// Create and insert the idiom we use to indicate a block is unreachable
   /// without having to rewrite the CFG from within InstCombine.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 6707d1abf5ca0..598008a3d8d24 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -4982,5 +4982,8 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
     return CallInst::Create(Scmp, {CmpLHS, ConstantInt::get(SI.getType(), 0)});
   }
 
+  if (auto *Folded = foldV4EqualShuffleAndToV2Equal(SI))
+    return Folded;
+
   return nullptr;
 }
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 0ca5da1bbf251..1c71bb5387ab5 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1296,6 +1296,90 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) {
   return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
 }
 
+// Prior to SSE4.1, performing equality comparison on v2i64 types require a
+// comparison on v4i32 types using the following pattern:
+//
+// %3 = icmp eq <4 x i32> %1, %2
+//
+// %4 = sext <4 x i1> %3 to <4 x i32>
+//
+// %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> <i32 1, i32 0,
+// i32 3, i32 2>
+//
+// %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> zeroinitializer
+//
+// OR
+//
+// %6 = and <4 x i32> %sext, %shuffle
+//
+// We should detect such patterns and fold them into:
+//
+// %3 = bitcast <4 x i32> %1 to <2 x i64>
+//
+// %4 = bitcast <4 x i32> %2 to <2 x i64>
+//
+// %5 = icmp eq <2 x i64> %3, %4
+//
+// %6 = bitcast <2 x i64> %5 to <4 x i32>
+//
+Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
+  Value *Equal, *Shuffle, *L, *R;
+  CmpPredicate Pred;
+  SmallVector<int> Mask = {1, 0, 3, 2};
+
+  // Check pattern existance
+  if (!match(&I,
+             m_CombineOr(m_c_And(m_SExt(m_Value(Equal)),
+                                 m_SExtOrSelf(m_Value(Shuffle))),
+                         m_Select(m_Value(Equal),
+                                  m_SExtOrSelf(m_Value(Shuffle)), m_Zero()))) ||
+      !match(Shuffle, m_Shuffle(m_SExtOrSelf(m_Specific(Equal)), m_Poison(),
+                                m_SpecificMask(Mask))) ||
+      !match(Equal, m_ICmp(Pred, m_Value(L), m_Value(R))) ||
+      Pred != CmpInst::ICMP_EQ)
+    return nullptr;
+
+  // Check argument type
+  auto *OldVecType = cast<VectorType>(L->getType());
+
+  if (OldVecType->isScalableTy() ||
+      !OldVecType->getElementType()->isIntegerTy())
+    return nullptr;
+
+  int ElementCount = OldVecType->getElementCount().getFixedValue();
+  int ElementBitWidth = OldVecType->getElementType()->getIntegerBitWidth();
+
+  if (ElementCount != 4 || ElementBitWidth != 32)
+    return nullptr;
+
+  // Check uses outside pattern
+  if (!Shuffle->hasOneUse())
+    return nullptr;
+
+  for (auto *U : Equal->users()) {
+    if (U == &I || U == Shuffle)
+      continue;
+    if (!isa<llvm::CastInst>(U))
+      return nullptr;
+    for (auto *U : U->users())
+      if (U != &I && U != Shuffle)
+        return nullptr;
+  }
+
+  LLVM_DEBUG(dbgs() << "IC: Folding equal-shuffle-and pattern" << '\n');
+
+  // Perform folding
+  auto *NewElementType = IntegerType::get(I.getContext(), ElementBitWidth * 2);
+  auto *NewVecType = VectorType::get(NewElementType, ElementCount / 2, false);
+  auto *BitCastL = Builder.CreateBitCast(L, NewVecType);
+  auto *BitCastR = Builder.CreateBitCast(R, NewVecType);
+  auto *Cmp = Builder.CreateICmp(Pred, BitCastL, BitCastR);
+  auto *SExt = Builder.CreateSExt(Cmp, NewVecType);
+  auto *BitCastCmp = Builder.CreateBitCast(SExt, OldVecType);
+
+  return replaceInstUsesWith(I, BitCastCmp);
+}
+
 static std::optional<std::pair<Value *, Value *>>
 matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) {
   if (LHS->getParent() != RHS->getParent())
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 9832b396bde71..1f37e435b8080 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -153,7 +153,6 @@ class VectorCombine {
   bool foldEquivalentReductionCmp(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
-  bool foldEqualShuffleAnd(Instruction &I);
   bool shrinkType(Instruction &I);
   bool shrinkLoadForShuffles(Instruction &I);
   bool shrinkPhiOfShuffles(Instruction &I);
@@ -5436,93 +5435,6 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
   return true;
 }
 
-// Prior to SSE4.1, performing equality comparison on v2i64 types require a
-// comparison on v4i32 types using the following pattern:
-//
-// %3 = icmp eq <4 x i32> %1, %2
-//
-// %4 = sext <4 x i1> %3 to <4 x i32>
-//
-// %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> <i32 1, i32 0,
-// i32 3, i32 2>
-//
-// %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> zeroinitializer
-//
-// OR
-//
-// %6 = and <4 x i32> %sext, %shuffle
-//
-// We should detect such patterns and fold them into:
-//
-// %3 = bitcast <4 x i32> %1 to <2 x i64>
-//
-// %4 = bitcast <4 x i32> %2 to <2 x i64>
-//
-// %5 = icmp eq <2 x i64> %3, %4
-//
-// %6 = bitcast <2 x i64> %5 to <4 x i32>
-//
-bool VectorCombine::foldEqualShuffleAnd(Instruction &I) {
-  Value *Equal, *Shuffle, *L, *R;
-  CmpPredicate Pred;
-  SmallVector<int> Mask = {1, 0, 3, 2};
-
-  // Check pattern existance
-  if (!match(&I, m_CombineOr(m_c_And(m_SExt(m_Value(Equal)),
-                                     m_SExtOrSelf(m_Value(Shuffle))),
-                             m_Select(m_Value(Equal),
-                                      m_SExtOrSelf(m_Value(Shuffle)),
-                                      m_ZeroInt()))) ||
-      !match(Shuffle, m_Shuffle(m_SExtOrSelf(m_Specific(Equal)), m_Poison(),
-                                m_SpecificMask(Mask))) ||
-      !match(Equal, m_ICmp(Pred, m_Value(L), m_Value(R))) ||
-      !CmpInst::isEquality(Pred))
-    return false;
-
-  // Check argument type
-  auto *OldVecType = cast<VectorType>(L->getType());
-
-  if (OldVecType->isScalableTy() ||
-      !OldVecType->getElementType()->isIntegerTy())
-    return false;
-
-  int ElementCount = OldVecType->getElementCount().getFixedValue();
-  int ElementBitWidth = OldVecType->getElementType()->getIntegerBitWidth();
-
-  if (ElementCount != 4 || ElementBitWidth != 32)
-    return false;
-
-  // Check uses outside pattern
-  if (!Shuffle->hasOneUse())
-    return false;
-
-  for (auto *U : Equal->users()) {
-    if (U == &I || U == Shuffle)
-      continue;
-    if (!isa<llvm::CastInst>(U))
-      return false;
-    for (auto *U : U->users())
-      if (U != &I && U != Shuffle)
-        return false;
-  }
-
-  LLVM_DEBUG(dbgs() << "VC: Found equal-shuffle-and pattern" << '\n');
-
-  // Perform folding
-  IRBuilder Builder(&I);
-  auto *NewElementType = IntegerType::get(I.getContext(), ElementBitWidth * 2);
-  auto *NewVecType = VectorType::get(NewElementType, ElementCount / 2, false);
-  auto *BitCastL = Builder.CreateBitCast(L, NewVecType);
-  auto *BitCastR = Builder.CreateBitCast(R, NewVecType);
-  auto *Cmp = Builder.CreateICmp(Pred, BitCastL, BitCastR);
-  auto *SExt = Builder.CreateSExt(Cmp, NewVecType);
-  auto *BitCastCmp = Builder.CreateBitCast(SExt, OldVecType);
-
-  replaceValue(I, *BitCastCmp);
-
-  return false;
-}
-
 // Attempt to shrink loads that are only used by shufflevector instructions.
 bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
   auto *OldLoad = dyn_cast<LoadInst>(&I);
@@ -5865,17 +5777,11 @@ bool VectorCombine::run() {
           return true;
         if (foldBitOpOfCastConstant(I))
           return true;
-        if (foldEqualShuffleAnd(I))
-          return true;
         break;
       case Instruction::PHI:
         if (shrinkPhiOfShuffles(I))
           return true;
         break;
-      case Instruction::Select:
-        if (foldEqualShuffleAnd(I))
-          return true;
-        break;
       default:
         if (shrinkType(I))
           return true;
diff --git a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
similarity index 88%
rename from llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
rename to llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
index 4e13afd360673..ff23a9d4e277c 100644
--- a/llvm/test/Transforms/VectorCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
 
 define <4 x i32> @cmpeq_epi64_select(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select(
@@ -62,7 +62,7 @@ define <4 x i32> @cmpeq_epi64_multi_use_cmp(<4 x i32> noundef %a, <4 x i32> noun
 ; CHECK-NEXT:    call void @use.v4i1(<4 x i1> [[CMP]])
 ; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
+; CHECK-NEXT:    [[AND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <4 x i32> [[AND]]
 ;
   %cmp = icmp eq <4 x i32> %a, %b
@@ -80,7 +80,7 @@ define <4 x i32> @cmpeq_epi64_multi_use_sext(<4 x i32> noundef %a, <4 x i32> nou
 ; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
 ; CHECK-NEXT:    call void @use.v4i32(<4 x i32> [[SEXT]])
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
+; CHECK-NEXT:    [[AND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <4 x i32> [[AND]]
 ;
   %cmp = icmp eq <4 x i32> %a, %b
@@ -98,7 +98,7 @@ define <4 x i32> @cmpeq_epi64_multi_use_shuffle(<4 x i32> noundef %a, <4 x i32>
 ; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 ; CHECK-NEXT:    call void @use.v4i32(<4 x i32> [[SHUFFLE]])
-; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
+; CHECK-NEXT:    [[AND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <4 x i32> [[AND]]
 ;
   %cmp = icmp eq <4 x i32> %a, %b
@@ -113,8 +113,8 @@ define <4 x i32> @cmpeq_epi64_select_neg_0(<4 x i32> noundef %a, <4 x i32> nound
 ; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select_neg_0(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 ; CHECK-NEXT:    [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <4 x i32> [[SELECT]]
 ;
@@ -131,7 +131,7 @@ define <4 x i32> @cmpeq_epi64_and_neg_1(<4 x i32> noundef %a, <4 x i32> noundef
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
 ; CHECK-NEXT:    [[SEXT:%.*]] = zext <4 x i1> [[CMP]] to <4 x i32>
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[SEXT]], [[SHUFFLE]]
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[SHUFFLE]], [[SEXT]]
 ; CHECK-NEXT:    ret <4 x i32> [[AND]]
 ;
   %cmp = icmp eq <4 x i32> %a, %b
@@ -145,8 +145,8 @@ define <4 x i32> @cmpeq_epi64_select_neg_2(<4 x i32> noundef %a, <4 x i32> nound
 ; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select_neg_2(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <4 x i32> [[SELECT]]
 ;
@@ -162,13 +162,13 @@ define <4 x i32> @cmpeq_epi64_select_neg_3(<4 x i32> noundef %a, <4 x i32> nound
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
 ; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[SELECT:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> [[SEXT]]
-; CHECK-NEXT:    ret <4 x i32> [[SELECT]]
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE1]], <4 x i32> splat (i32 1)
+; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE]]
 ;
   %cmp = icmp eq <4 x i32> %a, %b
   %sext = sext <4 x i1> %cmp to <4 x i32>
-  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-  %select = select <4 x i1> %cmp, <4 x i32> %shuffle, <4 x i32> %sext
+  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %select = select <4 x i1> %cmp, <4 x i32> %shuffle, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   ret <4 x i32> %select
 }

>From 4b9e070d638ff320b0f085715e85dd03703523c5 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Thu, 5 Mar 2026 15:25:30 +0700
Subject: [PATCH 08/19] Add folding v2i64 cmpgt using v4i32 pattern test

---
 .../fold-v2i64-cmpgt-using-v4i32-pattern.ll   | 86 +++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll

diff --git a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
new file mode 100644
index 0000000000000..3442b004eff13
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define <4 x i32> @alt_cmpgt_epi64(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_A]], [[XOR_B]]
+; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
+; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %gt = icmp sgt <4 x i32> %xor.a, %xor.b
+  %sext.gt = sext <4 x i1> %gt to <4 x i32>
+  %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %eq = icmp eq <4 x i32> %a, %b
+  %sext.eq = sext <4 x i1> %eq to <4 x i32>
+  %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %and = and <4 x i32> %gt.0, %eq.0
+  %or = or <4 x i32> %and, %gt.1
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @alt_cmpgt_epu64_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epu64_1(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[GT:%.*]] = icmp ugt <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
+; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %gt = icmp ugt <4 x i32> %a, %b
+  %sext.gt = sext <4 x i1> %gt to <4 x i32>
+  %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %eq = icmp eq <4 x i32> %a, %b
+  %sext.eq = sext <4 x i1> %eq to <4 x i32>
+  %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %and = and <4 x i32> %gt.0, %eq.0
+  %or = or <4 x i32> %and, %gt.1
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @alt_cmpgt_epu64_2(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epu64_2(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[GT:%.*]] = icmp ugt <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
+; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+  %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+  %gt = icmp sgt <4 x i32> %xor.a, %xor.b
+  %sext.gt = sext <4 x i1> %gt to <4 x i32>
+  %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %eq = icmp eq <4 x i32> %a, %b
+  %sext.eq = sext <4 x i1> %eq to <4 x i32>
+  %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %and = and <4 x i32> %gt.0, %eq.0
+  %or = or <4 x i32> %and, %gt.1
+  ret <4 x i32> %or
+}

>From 28599ed5b4ef8a838cb589700c28fc739ab86f55 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Fri, 6 Mar 2026 16:40:03 +0700
Subject: [PATCH 09/19] Apply folding for v2i64 greater comparison using v4i32
 pattern

---
 .../InstCombine/InstCombineAndOrXor.cpp       |  3 +
 .../InstCombine/InstCombineInternal.h         |  2 +
 .../InstCombine/InstructionCombining.cpp      | 79 +++++++++++++++++++
 .../fold-v2i64-cmpgt-using-v4i32-pattern.ll   | 44 ++++-------
 4 files changed, 99 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 30c1e8e8aca73..b872ac423f0c9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -4652,6 +4652,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
   if (Value *Res = FoldOrOfSelectSmaxToAbs(I, Builder))
     return replaceInstUsesWith(I, Res);
 
+  if (Instruction *Folded = foldV2CmpGtUsingV4CmpGtPattern(I))
+    return Folded;
+
   return nullptr;
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index fed88cc84f46e..fb79291977eb6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -486,6 +486,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
 
   Instruction *foldV4EqualShuffleAndToV2Equal(Instruction &I);
 
+  Instruction *foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I);
+
 public:
   /// Create and insert the idiom we use to indicate a block is unreachable
   /// without having to rewrite the CFG from within InstCombine.
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 1c71bb5387ab5..c2ef56ad67ce5 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1380,6 +1380,85 @@ Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
   return replaceInstUsesWith(I, BitCastCmp);
 }
 
+// Prior to SSE4.2, to perform greater (or less than) comparisons between two
+// v2i64 values, the comparison is performed on v4i32 values:
+//
+// (A1, A2) -> (A1Lower, A1Upper, A2Lower, A2Upper)
+// (B1, B2) -> (B1Lower, B1Upper, B2Lower, B2Upper)
+// (Result1, Result2) -> (Result1, Result1, Result2, Result2)
+//
+// where,
+//
+// ResultX = (GtLowerX & EqUpperX) | (GtUpperX)
+// GtLowerX = AXLower OP BXLower
+// GtUpperX = AXUpper OP BXUpper
+// EqUpperX = AXUpper EQ BXUpper
+//
+// Upper and lower values are obtained through vector shuffles.
+Instruction *
+InstCombinerImpl::foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I) {
+  if (I.getOpcode() != Instruction::Or)
+    return nullptr;
+
+  auto *OldVecType = dyn_cast<VectorType>(I.getType());
+
+  if (!OldVecType || OldVecType->isScalableTy() ||
+      !OldVecType->getElementType()->isIntegerTy(32) ||
+      OldVecType->getElementCount().getFixedValue() != 4)
+    return nullptr;
+
+  Value *A, *B, *Greater1, *Greater2, *Greater;
+  CmpPredicate PredEq;
+  SmallVector<int> MaskLower = {0, 0, 2, 2};
+  SmallVector<int> MaskUpper = {1, 1, 3, 3};
+
+  auto GreaterLower = m_SExtOrSelf(m_Shuffle(
+      m_SExtOrSelf(m_Value(Greater1)), m_Poison(), m_SpecificMask(MaskLower)));
+  auto GreaterUpper = m_SExtOrSelf(m_Shuffle(
+      m_SExtOrSelf(m_Value(Greater2)), m_Poison(), m_SpecificMask(MaskUpper)));
+  auto EqUpper = m_SExtOrSelf(
+      m_Shuffle(m_SExtOrSelf(m_c_ICmp(PredEq, m_Value(A), m_Value(B))),
+                m_Poison(), m_SpecificMask(MaskUpper)));
+
+  if (!match(&I, m_c_Or(m_c_And(GreaterLower, EqUpper), GreaterUpper)) ||
+      Greater1 != Greater2 || PredEq != ICmpInst::ICMP_EQ)
+    return nullptr;
+
+  Greater = Greater1;
+
+  auto *Zero = ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 0);
+  auto *Flip =
+      ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 0x80000000);
+  auto *FlipLower = ConstantVector::get({Flip, Zero, Flip, Zero});
+  auto *FlipAll = ConstantVector::get({Flip, Flip, Flip, Flip});
+
+  CmpPredicate PredGt;
+  auto UGt = m_c_ICmp(PredGt, m_Specific(A), m_Specific(B));
+  auto UGtAlt = m_c_ICmp(PredGt, m_c_Xor(m_Specific(A), m_Specific(FlipAll)),
+                         m_c_Xor(m_Specific(B), m_Specific(FlipAll)));
+  auto SGt = m_c_ICmp(PredGt, m_c_Xor(m_Specific(A), m_Specific(FlipLower)),
+                      m_c_Xor(m_Specific(B), m_Specific(FlipLower)));
+
+  if (!(match(Greater, UGt) &&
+        (PredGt == ICmpInst::ICMP_UGT || PredGt == ICmpInst::ICMP_ULT)) &&
+      !((match(Greater, SGt) || match(Greater, UGtAlt)) &&
+        (PredGt == ICmpInst::ICMP_SGT || PredGt == ICmpInst::ICMP_SLT)))
+    return nullptr;
+
+  LLVM_DEBUG(dbgs() << "Found V2CmpGt using V4CmpGt pattern" << '\n');
+
+  // Perform folding
+  auto *NewElementType = IntegerType::get(I.getContext(), 64);
+  auto *NewVecType = VectorType::get(NewElementType, 2, false);
+  auto *BitCastA = Builder.CreateBitCast(A, NewVecType);
+  auto *BitCastB = Builder.CreateBitCast(B, NewVecType);
+  auto *Cmp = Builder.CreateICmp(PredGt, BitCastA, BitCastB);
+  auto *SExt = Builder.CreateSExt(Cmp, NewVecType);
+  auto *BitCastCmp = Builder.CreateBitCast(SExt, OldVecType);
+
+  return replaceInstUsesWith(I, BitCastCmp);
+}
+
 static std::optional<std::pair<Value *, Value *>>
 matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) {
   if (LHS->getParent() != RHS->getParent())
diff --git a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
index 3442b004eff13..1ca2c71f988f3 100644
--- a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
+++ b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
@@ -4,17 +4,11 @@
 define <4 x i32> @alt_cmpgt_epi64(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT:    [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
-; CHECK-NEXT:    [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
-; CHECK-NEXT:    [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_A]], [[XOR_B]]
-; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
-; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
-; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
-; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[OR:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[OR]]
 ;
   %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
@@ -34,15 +28,11 @@ define <4 x i32> @alt_cmpgt_epi64(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 define <4 x i32> @alt_cmpgt_epu64_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epu64_1(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT:    [[GT:%.*]] = icmp ugt <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
-; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
-; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
-; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[OR:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[OR]]
 ;
   %gt = icmp ugt <4 x i32> %a, %b
@@ -60,15 +50,11 @@ define <4 x i32> @alt_cmpgt_epu64_1(<4 x i32> noundef %a, <4 x i32> noundef %b)
 define <4 x i32> @alt_cmpgt_epu64_2(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epu64_2(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT:    [[GT:%.*]] = icmp ugt <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
-; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
-; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
-; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[OR:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[OR]]
 ;
   %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>

>From 5af139970d24bccc8cf266543cced2d34d01543f Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Fri, 6 Mar 2026 19:28:08 +0700
Subject: [PATCH 10/19] Remove multi-use test and improve documentation
 comments

---
 .../InstCombine/InstructionCombining.cpp      | 81 ++++++-------------
 .../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll  | 57 -------------
 2 files changed, 26 insertions(+), 112 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index c2ef56ad67ce5..f56883e57740c 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1296,46 +1296,32 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) {
   return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
 }
 
-// Prior to SSE4.1, performing equality comparison on v2i64 types require a
-// comparison on v4i32 types using the following pattern:
-//
-// %3 = icmp eq <4 x i32> %1, %2
-//
-// %4 = sext <4 x i1> %3 to <4 x i32>
-//
-// %5 = shufflevector <4 x i32> %4, <4 x i32> poison, <4 x i32> <i32 1, i32 0,
-// i32 3, i32 2>
-//
-// %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> zeroinitializer
-//
-// OR
-//
-// %6 = and <4 x i32> %sext, %shuffle
-//
-// We should detect such patterns and fold them into:
-//
-// %3 = bitcast <4 x i32> %1 to <2 x i64>
+// Prior to SSE4.1, to perform equality comparisons between two
+// v2i64 values, the comparison is performed on v4i32 values:
 //
-// %4 = bitcast <4 x i32> %2 to <2 x i64>
+// (A1, A2) -> (A1Lower, A1Upper, A2Lower, A2Upper)
+// (B1, B2) -> (B1Lower, B1Upper, B2Lower, B2Upper)
+// (Result1, Result2) -> (Result1, Result1, Result2, Result2)
 //
-// %5 = icmp eq <2 x i64> %3, %4
+// where,
 //
-// %6 = bitcast <2 x i64> %5 to <4 x i32>
+// ResultX = EqLowerX & EqUpperX
+// EqLowerX = AXLower == BXLower
+// EqUpperX = AXUpper == BXUpper
 //
+// Bitwise AND between the upper and lower parts can be achieved by performing
+// the operation between the original and shuffled equality vector.
 Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
-  Value *Equal, *Shuffle, *L, *R;
+  Value *L, *R;
   CmpPredicate Pred;
   SmallVector<int> Mask = {1, 0, 3, 2};
 
   // Check pattern existance
-  if (!match(&I,
-             m_CombineOr(m_c_And(m_SExt(m_Value(Equal)),
-                                 m_SExtOrSelf(m_Value(Shuffle))),
-                         m_Select(m_Value(Equal),
-                                  m_SExtOrSelf(m_Value(Shuffle)), m_Zero()))) ||
-      !match(Shuffle, m_Shuffle(m_SExtOrSelf(m_Specific(Equal)), m_Poison(),
-                                m_SpecificMask(Mask))) ||
-      !match(Equal, m_ICmp(Pred, m_Value(L), m_Value(R))) ||
+  auto Equal = m_ICmp(Pred, m_Value(L), m_Value(R));
+  auto Shuffle = m_SExtOrSelf(
+      m_Shuffle(m_SExtOrSelf(Equal), m_Poison(), m_SpecificMask(Mask)));
+  if (!match(&I, m_CombineOr(m_c_And(m_SExt(Equal), Shuffle),
+                             m_Select(Equal, Shuffle, m_Zero()))) ||
       Pred != CmpInst::ICMP_EQ)
     return nullptr;
 
@@ -1343,34 +1329,15 @@ Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
   auto *OldVecType = cast<VectorType>(L->getType());
 
   if (OldVecType->isScalableTy() ||
-      !OldVecType->getElementType()->isIntegerTy())
-    return nullptr;
-
-  int ElementCount = OldVecType->getElementCount().getFixedValue();
-  int ElementBitWidth = OldVecType->getElementType()->getIntegerBitWidth();
-
-  if (ElementCount != 4 || ElementBitWidth != 32)
-    return nullptr;
-
-  // Check uses outside pattern
-  if (!Shuffle->hasOneUse())
+      !OldVecType->getElementType()->isIntegerTy(32) ||
+      OldVecType->getElementCount().getFixedValue() != 4)
     return nullptr;
 
-  for (auto *U : Equal->users()) {
-    if (U == &I || U == Shuffle)
-      continue;
-    if (!isa<llvm::CastInst>(U))
-      return nullptr;
-    for (auto *U : U->users())
-      if (U != &I && U != Shuffle)
-        return nullptr;
-  }
-
   LLVM_DEBUG(dbgs() << "IC: Folding equal-shuffle-and pattern" << '\n');
 
   // Perform folding
-  auto *NewElementType = IntegerType::get(I.getContext(), ElementBitWidth * 2);
-  auto *NewVecType = VectorType::get(NewElementType, ElementCount / 2, false);
+  auto *NewElementType = IntegerType::get(I.getContext(), 64);
+  auto *NewVecType = VectorType::get(NewElementType, 2, false);
   auto *BitCastL = Builder.CreateBitCast(L, NewVecType);
   auto *BitCastR = Builder.CreateBitCast(R, NewVecType);
   auto *Cmp = Builder.CreateICmp(Pred, BitCastL, BitCastR);
@@ -1394,7 +1361,11 @@ Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
 // GtUpperX = AXUpper OP BXUpper
 // EqUpperX = AXUpper EQ BXUpper
 //
-// Upper and lower values are obtained through vector shuffles.
+// Upper and lower parts are obtained through vector shuffles.
+//
+// Note that comparisons of the lower parts are always unsigned comparisons
+// regardless of the resulting signedness. Also note that an unsigned
+// comparison can be derived from a signed one by flipping the MSB of both operands.
 Instruction *
 InstCombinerImpl::foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I) {
   if (I.getOpcode() != Instruction::Or)
diff --git a/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
index ff23a9d4e277c..6300ff30f103c 100644
--- a/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -52,63 +52,6 @@ define <4 x i32> @cmpeq_epi64_commutated_and(<4 x i32> noundef %a, <4 x i32> nou
   ret <4 x i32> %and
 }
 
-declare void @use.v4i1(<4 x i1>)
-declare void @use.v4i32(<4 x i32>)
-
-define <4 x i32> @cmpeq_epi64_multi_use_cmp(<4 x i32> noundef %a, <4 x i32> noundef %b) {
-; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_multi_use_cmp(
-; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    call void @use.v4i1(<4 x i1> [[CMP]])
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[AND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    ret <4 x i32> [[AND]]
-;
-  %cmp = icmp eq <4 x i32> %a, %b
-  call void @use.v4i1(<4 x i1> %cmp)
-  %sext = sext <4 x i1> %cmp to <4 x i32>
-  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-  %and = and <4 x i32> %shuffle, %sext
-  ret <4 x i32> %and
-}
-
-define <4 x i32> @cmpeq_epi64_multi_use_sext(<4 x i32> noundef %a, <4 x i32> noundef %b) {
-; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_multi_use_sext(
-; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-; CHECK-NEXT:    call void @use.v4i32(<4 x i32> [[SEXT]])
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[AND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    ret <4 x i32> [[AND]]
-;
-  %cmp = icmp eq <4 x i32> %a, %b
-  %sext = sext <4 x i1> %cmp to <4 x i32>
-  call void @use.v4i32(<4 x i32> %sext)
-  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-  %and = and <4 x i32> %shuffle, %sext
-  ret <4 x i32> %and
-}
-
-define <4 x i32> @cmpeq_epi64_multi_use_shuffle(<4 x i32> noundef %a, <4 x i32> noundef %b) {
-; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_multi_use_shuffle(
-; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    call void @use.v4i32(<4 x i32> [[SHUFFLE]])
-; CHECK-NEXT:    [[AND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    ret <4 x i32> [[AND]]
-;
-  %cmp = icmp eq <4 x i32> %a, %b
-  %sext = sext <4 x i1> %cmp to <4 x i32>
-  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-  call void @use.v4i32(<4 x i32> %shuffle)
-  %and = and <4 x i32> %shuffle, %sext
-  ret <4 x i32> %and
-}
-
 define <4 x i32> @cmpeq_epi64_select_neg_0(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select_neg_0(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {

>From 0a49316c935b9cf9c00e53707004e95a36283158 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Fri, 6 Mar 2026 23:33:21 +0700
Subject: [PATCH 11/19] Add commutated and negative test cases to fold v2i64
 cmpgt using v4i32 pattern test

---
 .../fold-v2i64-cmpgt-using-v4i32-pattern.ll   | 82 +++++++++++++++++++
 .../fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll  |  2 +-
 2 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
index 1ca2c71f988f3..bd310db1a35a8 100644
--- a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
+++ b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
@@ -70,3 +70,85 @@ define <4 x i32> @alt_cmpgt_epu64_2(<4 x i32> noundef %a, <4 x i32> noundef %b)
   %or = or <4 x i32> %and, %gt.1
   ret <4 x i32> %or
 }
+
+define <4 x i32> @alt_cmpgt_epi64_commutated_gt(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_commutated_gt(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[OR:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %gt = icmp sgt <4 x i32> %xor.b, %xor.a
+  %sext.gt = sext <4 x i1> %gt to <4 x i32>
+  %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %eq = icmp eq <4 x i32> %a, %b
+  %sext.eq = sext <4 x i1> %eq to <4 x i32>
+  %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %and = and <4 x i32> %gt.0, %eq.0
+  %or = or <4 x i32> %and, %gt.1
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @alt_cmpgt_epi64_neg_0(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_neg_0(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_B]], [[XOR_A]]
+; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i1> [[GT]], [[EQ]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[AND:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %gt = icmp sgt <4 x i32> %xor.b, %xor.a
+  %sext.gt = sext <4 x i1> %gt to <4 x i32>
+  %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %eq = icmp eq <4 x i32> %a, %b
+  %sext.eq = sext <4 x i1> %eq to <4 x i32>
+  %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %and = and <4 x i32> %gt.0, %eq.0
+  %or = or <4 x i32> %and, %gt.1
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @alt_cmpgt_epi64_neg_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_neg_1(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_B]], [[XOR_A]]
+; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i1> [[GT]], [[EQ]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[AND:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_0]]
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %gt = icmp sgt <4 x i32> %xor.b, %xor.a
+  %sext.gt = sext <4 x i1> %gt to <4 x i32>
+  %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %eq = icmp eq <4 x i32> %a, %b
+  %sext.eq = sext <4 x i1> %eq to <4 x i32>
+  %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %and = and <4 x i32> %gt.1, %eq.0
+  %or = or <4 x i32> %and, %gt.0
+  ret <4 x i32> %or
+}
diff --git a/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
index 6300ff30f103c..8030565f26de9 100644
--- a/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
@@ -111,7 +111,7 @@ define <4 x i32> @cmpeq_epi64_select_neg_3(<4 x i32> noundef %a, <4 x i32> nound
 ;
   %cmp = icmp eq <4 x i32> %a, %b
   %sext = sext <4 x i1> %cmp to <4 x i32>
-  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %shuffle = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   %select = select <4 x i1> %cmp, <4 x i32> %shuffle, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   ret <4 x i32> %select
 }

>From b637d8b41edf32b1d914da27b6cd11ffe81984cb Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Sat, 7 Mar 2026 00:48:50 +0700
Subject: [PATCH 12/19] Handle and transformed into select in folding v2i64
 cmpgt using v4i32 pattern

---
 .../InstCombine/InstructionCombining.cpp      |  8 +++++--
 .../fold-v2i64-cmpgt-using-v4i32-pattern.ll   | 23 +++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index f56883e57740c..d08c67a518fc4 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1387,11 +1387,15 @@ InstCombinerImpl::foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I) {
       m_SExtOrSelf(m_Value(Greater1)), m_Poison(), m_SpecificMask(MaskLower)));
   auto GreaterUpper = m_SExtOrSelf(m_Shuffle(
       m_SExtOrSelf(m_Value(Greater2)), m_Poison(), m_SpecificMask(MaskUpper)));
-  auto EqUpper = m_SExtOrSelf(
+  auto EqUpper = m_Shuffle(m_c_ICmp(PredEq, m_Value(A), m_Value(B)), m_Poison(),
+                           m_SpecificMask(MaskUpper));
+  auto EqUpperSExt = m_SExtOrSelf(
       m_Shuffle(m_SExtOrSelf(m_c_ICmp(PredEq, m_Value(A), m_Value(B))),
                 m_Poison(), m_SpecificMask(MaskUpper)));
 
-  if (!match(&I, m_c_Or(m_c_And(GreaterLower, EqUpper), GreaterUpper)) ||
+  if (!match(&I, m_c_Or(m_CombineOr(m_c_And(GreaterLower, EqUpperSExt),
+                                    m_Select(EqUpper, GreaterLower, m_Zero())),
+                        GreaterUpper)) ||
       Greater1 != Greater2 || PredEq != ICmpInst::ICMP_EQ)
     return nullptr;
 
diff --git a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
index bd310db1a35a8..8d1b8e40655c2 100644
--- a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
+++ b/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
@@ -25,6 +25,29 @@ define <4 x i32> @alt_cmpgt_epi64(<4 x i32> noundef %a, <4 x i32> noundef %b) {
   ret <4 x i32> %or
 }
 
+define <4 x i32> @alt_cmpgt_epi64_select(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_select(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[OR:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %gt = icmp sgt <4 x i32> %xor.a, %xor.b
+  %sext.gt = sext <4 x i1> %gt to <4 x i32>
+  %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %eq = icmp eq <4 x i32> %a, %b
+  %eq.0 = shufflevector <4 x i1> %eq, <4 x i1> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %select = select <4 x i1> %eq.0, <4 x i32> %gt.0, <4 x i32> zeroinitializer
+  %or = or <4 x i32> %select, %gt.1
+  ret <4 x i32> %or
+}
+
 define <4 x i32> @alt_cmpgt_epu64_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epu64_1(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {

>From 72915e89bea279eee3b5472081849ecbd28f8a62 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Sat, 7 Mar 2026 01:10:00 +0700
Subject: [PATCH 13/19] Check type upfront in folding equal-shuffle-and pattern

---
 .../InstCombine/InstructionCombining.cpp      | 25 +++++++++----------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index d08c67a518fc4..abe6d96d0567d 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1312,11 +1312,19 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) {
 // Bitwise AND between the upper and lower parts can be achived by performing
 // the operation between the original and shuffled equality vector.
 Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
+  // Check argument type
+  auto *OldVecType = dyn_cast<VectorType>(I.getType());
+
+  if (!OldVecType || OldVecType->isScalableTy() ||
+      !OldVecType->getElementType()->isIntegerTy(32) ||
+      OldVecType->getElementCount().getFixedValue() != 4)
+    return nullptr;
+
+  // Check pattern existence
   Value *L, *R;
   CmpPredicate Pred;
   SmallVector<int> Mask = {1, 0, 3, 2};
 
-  // Check pattern existance
   auto Equal = m_ICmp(Pred, m_Value(L), m_Value(R));
   auto Shuffle = m_SExtOrSelf(
       m_Shuffle(m_SExtOrSelf(Equal), m_Poison(), m_SpecificMask(Mask)));
@@ -1325,14 +1333,6 @@ Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
       Pred != CmpInst::ICMP_EQ)
     return nullptr;
 
-  // Check argument type
-  auto *OldVecType = cast<VectorType>(L->getType());
-
-  if (OldVecType->isScalableTy() ||
-      !OldVecType->getElementType()->isIntegerTy(32) ||
-      OldVecType->getElementCount().getFixedValue() != 4)
-    return nullptr;
-
   LLVM_DEBUG(dbgs() << "IC: Folding equal-shuffle-and pattern" << '\n');
 
   // Perform folding
@@ -1368,9 +1368,7 @@ Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
 // can be derived from signed comparison by flipping the MSB of both operands.
 Instruction *
 InstCombinerImpl::foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I) {
-  if (I.getOpcode() != Instruction::Or)
-    return nullptr;
-
+  // Check argument type
   auto *OldVecType = dyn_cast<VectorType>(I.getType());
 
   if (!OldVecType || OldVecType->isScalableTy() ||
@@ -1378,6 +1376,7 @@ InstCombinerImpl::foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I) {
       OldVecType->getElementCount().getFixedValue() != 4)
     return nullptr;
 
+  // Check pattern existence
   Value *A, *B, *Greater1, *Greater2, *Greater;
   CmpPredicate PredEq;
   SmallVector<int> MaskLower = {0, 0, 2, 2};
@@ -1420,7 +1419,7 @@ InstCombinerImpl::foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I) {
         (PredGt == ICmpInst::ICMP_SGT || PredGt == ICmpInst::ICMP_SLT)))
     return nullptr;
 
-  LLVM_DEBUG(dbgs() << "Found V2CmpGt using V4CmpGt pattern" << '\n');
+  LLVM_DEBUG(dbgs() << "IC: Folding V2CmpGt using V4CmpGt pattern" << '\n');
 
   // Perform folding
   auto *NewElementType = IntegerType::get(I.getContext(), 64);

>From 63b94bcc765761d22065167a11f0988a446bc286 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Sun, 8 Mar 2026 20:27:15 +0700
Subject: [PATCH 14/19] Generalize v2i64 cmpeq using v4i32 cmpeq pattern test

---
 ...> fold-vni2m-cmpeq-using-v2nim-pattern.ll} | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)
 rename llvm/test/Transforms/InstCombine/{fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll => fold-vni2m-cmpeq-using-v2nim-pattern.ll} (78%)

diff --git a/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll b/llvm/test/Transforms/InstCombine/fold-vni2m-cmpeq-using-v2nim-pattern.ll
similarity index 78%
rename from llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
rename to llvm/test/Transforms/InstCombine/fold-vni2m-cmpeq-using-v2nim-pattern.ll
index 8030565f26de9..3c7e01dd3caeb 100644
--- a/llvm/test/Transforms/InstCombine/fold-v4i32-eq-shuffle-and-to-v2i64-eq.ll
+++ b/llvm/test/Transforms/InstCombine/fold-vni2m-cmpeq-using-v2nim-pattern.ll
@@ -52,6 +52,38 @@ define <4 x i32> @cmpeq_epi64_commutated_and(<4 x i32> noundef %a, <4 x i32> nou
   ret <4 x i32> %and
 }
 
+define <4 x i32> @cmpeq_epi64_sext(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_sext(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i1> [[SHUFFLE]], [[CMP]]
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[AND]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[SEXT]]
+;
+  %cmp = icmp eq <4 x i32> %a, %b
+  %shuffle = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %and = and <4 x i1> %shuffle, %cmp
+  %sext = sext <4 x i1> %and to <4 x i32>
+  ret <4 x i32> %sext
+}
+
+define <6 x i32> @cmpeq_epi64_generalized(<6 x i32> noundef %a, <6 x i32> noundef %b) {
+; CHECK-LABEL: define <6 x i32> @cmpeq_epi64_generalized(
+; CHECK-SAME: <6 x i32> noundef [[A:%.*]], <6 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <6 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <6 x i1> [[CMP]] to <6 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <6 x i32> [[SEXT]], <6 x i32> poison, <6 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4>
+; CHECK-NEXT:    [[AND:%.*]] = select <6 x i1> [[CMP]], <6 x i32> [[SHUFFLE]], <6 x i32> zeroinitializer
+; CHECK-NEXT:    ret <6 x i32> [[AND]]
+;
+  %cmp = icmp eq <6 x i32> %a, %b
+  %sext = sext <6 x i1> %cmp to <6 x i32>
+  %shuffle = shufflevector <6 x i32> %sext, <6 x i32> poison, <6 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4>
+  %and = and <6 x i32> %sext, %shuffle
+  ret <6 x i32> %and
+}
+
 define <4 x i32> @cmpeq_epi64_select_neg_0(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_select_neg_0(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {

>From 60695b0f4719bca59c521dfbc2ad0c7518d894af Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Sun, 8 Mar 2026 20:35:57 +0700
Subject: [PATCH 15/19] Implement generalized transform of v2i64 cmpeq using
 v4i32 cmpeq pattern

---
 .../InstCombine/InstCombineAndOrXor.cpp       |  2 +-
 .../InstCombine/InstCombineCasts.cpp          |  4 ++
 .../InstCombine/InstCombineInternal.h         |  2 +-
 .../InstCombine/InstCombineSelect.cpp         |  2 +-
 .../InstCombine/InstructionCombining.cpp      | 50 ++++++++++++-------
 .../fold-vni2m-cmpeq-using-v2nim-pattern.ll   | 18 ++++---
 6 files changed, 48 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b872ac423f0c9..26132bddd8121 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2898,7 +2898,7 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
                                       /*SimplifyOnly*/ false, *this))
     return BinaryOperator::CreateAnd(Op0, V);
 
-  if (auto *Folded = foldV4EqualShuffleAndToV2Equal(I))
+  if (auto *Folded = foldVni2mCmpEqUsingV2nim(I))
     return Folded;
 
   return nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 2f3c9c6a083bd..b00f839879344 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
@@ -1895,6 +1896,9 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
     }
   }
 
+  if (auto *Folded = foldVni2mCmpEqUsingV2nim(Sext))
+    return Folded;
+
   return nullptr;
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index fb79291977eb6..e04b13862b172 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -484,7 +484,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
     return Sel;
   }
 
-  Instruction *foldV4EqualShuffleAndToV2Equal(Instruction &I);
+  Instruction *foldVni2mCmpEqUsingV2nim(Instruction &I);
 
   Instruction *foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I);
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 598008a3d8d24..99ef4c02e37a5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -4982,7 +4982,7 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
     return CallInst::Create(Scmp, {CmpLHS, ConstantInt::get(SI.getType(), 0)});
   }
 
-  if (auto *Folded = foldV4EqualShuffleAndToV2Equal(SI))
+  if (auto *Folded = foldVni2mCmpEqUsingV2nim(SI))
     return Folded;
 
   return nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index abe6d96d0567d..30677cf8f2fac 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1296,12 +1296,12 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) {
   return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
 }
 
-// Prior to SSE4.1, to perform equality comparisons between two
-// v2i64 values, the comparison is performed on v4i32 values:
+// Folds patterns which use comparisons on <2N x iM> type for a <N x i2M>
+// equality comparison.
 //
-// (A1, A2) -> (A1Lower, A1Upper, A2Lower, A2Upper)
-// (B1, B2) -> (B1Lower, B1Upper, B2Lower, B2Upper)
-// (Result1, Result2) -> (Result1, Result1, Result2, Result2)
+// (A1, ..., AN) -> (A1Lower, A1Upper, ..., ANLower, ANUpper)
+// (B1, ..., BN) -> (B1Lower, B1Upper, ..., BNLower, BNUpper)
+// (Result1, ..., ResultN) -> (Result1, Result1, ..., ResultN, ResultN)
 //
 // where,
 //
@@ -1311,33 +1311,45 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) {
 //
 // Bitwise AND between the upper and lower parts can be achived by performing
 // the operation between the original and shuffled equality vector.
-Instruction *InstCombinerImpl::foldV4EqualShuffleAndToV2Equal(Instruction &I) {
-  // Check argument type
-  auto *OldVecType = dyn_cast<VectorType>(I.getType());
+Instruction *InstCombinerImpl::foldVni2mCmpEqUsingV2nim(Instruction &I) {
+  auto *ResultVecType = dyn_cast<VectorType>(I.getType());
 
-  if (!OldVecType || OldVecType->isScalableTy() ||
-      !OldVecType->getElementType()->isIntegerTy(32) ||
-      OldVecType->getElementCount().getFixedValue() != 4)
+  if (!ResultVecType || ResultVecType->isScalableTy() ||
+      !ResultVecType->getElementType()->isIntegerTy() ||
+      ResultVecType->getElementCount().getFixedValue() % 2 != 0)
     return nullptr;
 
   // Check pattern existance
   Value *L, *R;
   CmpPredicate Pred;
-  SmallVector<int> Mask = {1, 0, 3, 2};
+  ArrayRef<int> Mask;
 
-  auto Equal = m_ICmp(Pred, m_Value(L), m_Value(R));
-  auto Shuffle = m_SExtOrSelf(
-      m_Shuffle(m_SExtOrSelf(Equal), m_Poison(), m_SpecificMask(Mask)));
-  if (!match(&I, m_CombineOr(m_c_And(m_SExt(Equal), Shuffle),
-                             m_Select(Equal, Shuffle, m_Zero()))) ||
+  auto Equal = m_SExtOrSelf(m_ICmp(Pred, m_Value(L), m_Value(R)));
+  auto Shuffle = m_SExtOrSelf(m_Shuffle(Equal, m_Poison(), m_Mask(Mask)));
+  if (!match(&I,
+             m_SExtOrSelf(m_CombineOr(m_c_And(Equal, Shuffle),
+                                      m_Select(Equal, Shuffle, m_Zero())))) ||
       Pred != CmpInst::ICMP_EQ)
     return nullptr;
 
+  auto *OldVecType = cast<VectorType>(L->getType());
+
+  if (OldVecType != ResultVecType)
+    return nullptr;
+
+  // Example shuffle mask: {1, 0, 3, 2}
+  for (auto I = 0; I < static_cast<int>(Mask.size()); I += 2)
+    if (Mask[I] != I + 1 || Mask[I + 1] != I)
+      return nullptr;
+
   LLVM_DEBUG(dbgs() << "IC: Folding equal-shuffle-and pattern" << '\n');
 
   // Perform folding
-  auto *NewElementType = IntegerType::get(I.getContext(), 64);
-  auto *NewVecType = VectorType::get(NewElementType, 2, false);
+  auto OldElementCount = OldVecType->getElementCount().getFixedValue();
+  auto OldElementWidth = OldVecType->getElementType()->getIntegerBitWidth();
+  auto *NewElementType = IntegerType::get(I.getContext(), OldElementWidth * 2);
+  auto *NewVecType =
+      VectorType::get(NewElementType, OldElementCount / 2, false);
   auto *BitCastL = Builder.CreateBitCast(L, NewVecType);
   auto *BitCastR = Builder.CreateBitCast(R, NewVecType);
   auto *Cmp = Builder.CreateICmp(Pred, BitCastL, BitCastR);
diff --git a/llvm/test/Transforms/InstCombine/fold-vni2m-cmpeq-using-v2nim-pattern.ll b/llvm/test/Transforms/InstCombine/fold-vni2m-cmpeq-using-v2nim-pattern.ll
index 3c7e01dd3caeb..4f47b9636a001 100644
--- a/llvm/test/Transforms/InstCombine/fold-vni2m-cmpeq-using-v2nim-pattern.ll
+++ b/llvm/test/Transforms/InstCombine/fold-vni2m-cmpeq-using-v2nim-pattern.ll
@@ -55,10 +55,11 @@ define <4 x i32> @cmpeq_epi64_commutated_and(<4 x i32> noundef %a, <4 x i32> nou
 define <4 x i32> @cmpeq_epi64_sext(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @cmpeq_epi64_sext(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[AND:%.*]] = and <4 x i1> [[SHUFFLE]], [[CMP]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[AND]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[SEXT:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[SEXT]]
 ;
   %cmp = icmp eq <4 x i32> %a, %b
@@ -71,10 +72,11 @@ define <4 x i32> @cmpeq_epi64_sext(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 define <6 x i32> @cmpeq_epi64_generalized(<6 x i32> noundef %a, <6 x i32> noundef %b) {
 ; CHECK-LABEL: define <6 x i32> @cmpeq_epi64_generalized(
 ; CHECK-SAME: <6 x i32> noundef [[A:%.*]], <6 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <6 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <6 x i1> [[CMP]] to <6 x i32>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <6 x i32> [[SEXT]], <6 x i32> poison, <6 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4>
-; CHECK-NEXT:    [[AND:%.*]] = select <6 x i1> [[CMP]], <6 x i32> [[SHUFFLE]], <6 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <6 x i32> [[A]] to <3 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <6 x i32> [[B]] to <3 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <3 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <3 x i1> [[TMP3]] to <3 x i64>
+; CHECK-NEXT:    [[AND:%.*]] = bitcast <3 x i64> [[TMP4]] to <6 x i32>
 ; CHECK-NEXT:    ret <6 x i32> [[AND]]
 ;
   %cmp = icmp eq <6 x i32> %a, %b

>From 544e886226bbae60bbdfb8abb73a1e6d09feebe8 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Sun, 8 Mar 2026 20:52:43 +0700
Subject: [PATCH 16/19] Generalize v2i64 cmpgt using v4i32 cmpgt pattern test

---
 ...> fold-vni2m-cmpgt-using-v2nim-pattern.ll} | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)
 rename llvm/test/Transforms/InstCombine/{fold-v2i64-cmpgt-using-v4i32-pattern.ll => fold-vni2m-cmpgt-using-v2nim-pattern.ll} (71%)

diff --git a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll b/llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern.ll
similarity index 71%
rename from llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
rename to llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern.ll
index 8d1b8e40655c2..6a853a20ba4ad 100644
--- a/llvm/test/Transforms/InstCombine/fold-v2i64-cmpgt-using-v4i32-pattern.ll
+++ b/llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern.ll
@@ -48,6 +48,62 @@ define <4 x i32> @alt_cmpgt_epi64_select(<4 x i32> noundef %a, <4 x i32> noundef
   ret <4 x i32> %or
 }
 
+define <4 x i32> @alt_cmpgt_epi64_sext(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_sext(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_A]], [[XOR_B]]
+; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i1> [[GT]], <4 x i1> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i1> [[GT]], <4 x i1> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <4 x i1> [[EQ]], <4 x i1> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[SELECT:%.*]] = select <4 x i1> [[EQ_0]], <4 x i1> [[GT_0]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i1> [[SELECT]], [[GT_1]]
+; CHECK-NEXT:    [[SEXT_OR:%.*]] = sext <4 x i1> [[OR]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[SEXT_OR]]
+;
+  %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %gt = icmp sgt <4 x i32> %xor.a, %xor.b
+  %gt.0 = shufflevector <4 x i1> %gt, <4 x i1> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %gt.1 = shufflevector <4 x i1> %gt, <4 x i1> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %eq = icmp eq <4 x i32> %a, %b
+  %eq.0 = shufflevector <4 x i1> %eq, <4 x i1> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %select = select <4 x i1> %eq.0, <4 x i1> %gt.0, <4 x i1> zeroinitializer
+  %or = or <4 x i1> %select, %gt.1
+  %sext.or = sext <4 x i1> %or to <4 x i32>
+  ret <4 x i32> %sext.or
+}
+
+define <6 x i32> @alt_cmpgt_epi64_generalized(<6 x i32> noundef %a, <6 x i32> noundef %b) {
+; CHECK-LABEL: define <6 x i32> @alt_cmpgt_epi64_generalized(
+; CHECK-SAME: <6 x i32> noundef [[A:%.*]], <6 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[XOR_A:%.*]] = xor <6 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[XOR_B:%.*]] = xor <6 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[GT:%.*]] = icmp sgt <6 x i32> [[XOR_A]], [[XOR_B]]
+; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <6 x i1> [[GT]] to <6 x i32>
+; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <6 x i32> [[SEXT_GT]], <6 x i32> poison, <6 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4>
+; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <6 x i32> [[SEXT_GT]], <6 x i32> poison, <6 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5>
+; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <6 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <6 x i1> [[EQ]], <6 x i1> poison, <6 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5>
+; CHECK-NEXT:    [[SELECT:%.*]] = select <6 x i1> [[EQ_0]], <6 x i32> [[GT_0]], <6 x i32> zeroinitializer
+; CHECK-NEXT:    [[OR:%.*]] = or <6 x i32> [[SELECT]], [[GT_1]]
+; CHECK-NEXT:    ret <6 x i32> [[OR]]
+;
+  %xor.a = xor <6 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %xor.b = xor <6 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %gt = icmp sgt <6 x i32> %xor.a, %xor.b
+  %sext.gt = sext <6 x i1> %gt to <6 x i32>
+  %gt.0 = shufflevector <6 x i32> %sext.gt, <6 x i32> poison, <6 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4>
+  %gt.1 = shufflevector <6 x i32> %sext.gt, <6 x i32> poison, <6 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5>
+  %eq = icmp eq <6 x i32> %a, %b
+  %eq.0 = shufflevector <6 x i1> %eq, <6 x i1> poison, <6 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5>
+  %select = select <6 x i1> %eq.0, <6 x i32> %gt.0, <6 x i32> zeroinitializer
+  %or = or <6 x i32> %select, %gt.1
+  ret <6 x i32> %or
+}
+
 define <4 x i32> @alt_cmpgt_epu64_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epu64_1(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {

>From 847d9e1ef78f558b72b2b80f375de8277b9dab65 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Mon, 9 Mar 2026 09:07:45 +0700
Subject: [PATCH 17/19] Implement generalized transform of v2i64 cmpgt using
 v4i32 cmpgt pattern and add more negative tests

---
 .../InstCombine/InstCombineAndOrXor.cpp       |   2 +-
 .../InstCombine/InstCombineCasts.cpp          |   3 +
 .../InstCombine/InstCombineInternal.h         |   2 +-
 .../InstCombine/InstructionCombining.cpp      | 112 ++++++++++------
 .../fold-vni2m-cmpgt-using-v2nim-pattern.ll   | 124 ++++++++++++++----
 5 files changed, 176 insertions(+), 67 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 26132bddd8121..a70a2ec49fcd1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -4652,7 +4652,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
   if (Value *Res = FoldOrOfSelectSmaxToAbs(I, Builder))
     return replaceInstUsesWith(I, Res);
 
-  if (Instruction *Folded = foldV2CmpGtUsingV4CmpGtPattern(I))
+  if (Instruction *Folded = foldVni2mCmpGtUsingV2nim(I))
     return Folded;
 
   return nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index b00f839879344..caa820b3f7f7f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1899,6 +1899,9 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
   if (auto *Folded = foldVni2mCmpEqUsingV2nim(Sext))
     return Folded;
 
+  if (auto *Folded = foldVni2mCmpGtUsingV2nim(Sext))
+    return Folded;
+
   return nullptr;
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index e04b13862b172..5a468a7ab30ce 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -486,7 +486,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
 
   Instruction *foldVni2mCmpEqUsingV2nim(Instruction &I);
 
-  Instruction *foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I);
+  Instruction *foldVni2mCmpGtUsingV2nim(Instruction &I);
 
 public:
   /// Create and insert the idiom we use to indicate a block is unreachable
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 30677cf8f2fac..4bc687922759a 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1342,7 +1342,8 @@ Instruction *InstCombinerImpl::foldVni2mCmpEqUsingV2nim(Instruction &I) {
     if (Mask[I] != I + 1 || Mask[I + 1] != I)
       return nullptr;
 
-  LLVM_DEBUG(dbgs() << "IC: Folding equal-shuffle-and pattern" << '\n');
+  LLVM_DEBUG(dbgs() << "IC: Folding Vn2im CmpEq using V2nim CmpEq pattern"
+                    << '\n');
 
   // Perform folding
   auto OldElementCount = OldVecType->getElementCount().getFixedValue();
@@ -1359,12 +1360,12 @@ Instruction *InstCombinerImpl::foldVni2mCmpEqUsingV2nim(Instruction &I) {
   return replaceInstUsesWith(I, BitCastCmp);
 }
 
-// Prior to SSE4.2, to perform greater (or less than) comparisons between two
-// v2i64 values, the comparison is performed on v4i32 values:
+// Folds patterns which uses comparisons on <2N x iM> type for a <N x i2M>
+// greater / less than comparison.
 //
-// (A1, A2) -> (A1Lower, A1Upper, A2Lower, A2Upper)
-// (B1, B2) -> (B1Lower, B1Upper, B2Lower, B2Upper)
-// (Result1, Result2) -> (Result1, Result1, Result2, Result2)
+// (A1, ..., AN) -> (A1Lower, A1Upper, ..., ANLower, ANUpper)
+// (B1, ..., BN) -> (B1Lower, B1Upper, ..., BNLower, BNUpper)
+// (Result1, ..., ResultN) -> (Result1, Result1, ..., ResultN, ResultN)
 //
 // where,
 //
@@ -1378,64 +1379,89 @@ Instruction *InstCombinerImpl::foldVni2mCmpEqUsingV2nim(Instruction &I) {
 // Note that comparison of the lower parts are always unsigned comparisons
 // regardless of the resulting signedness. Also note that, unsigned comparison
 // can be derived from signed comparison by flipping the MSB of both operands.
-Instruction *
-InstCombinerImpl::foldV2CmpGtUsingV4CmpGtPattern(BinaryOperator &I) {
-  // Check argument type
-  auto *OldVecType = dyn_cast<VectorType>(I.getType());
+Instruction *InstCombinerImpl::foldVni2mCmpGtUsingV2nim(Instruction &I) {
+  auto *ResultVecType = dyn_cast<VectorType>(I.getType());
 
-  if (!OldVecType || OldVecType->isScalableTy() ||
-      !OldVecType->getElementType()->isIntegerTy(32) ||
-      OldVecType->getElementCount().getFixedValue() != 4)
+  if (!ResultVecType || ResultVecType->isScalableTy() ||
+      !ResultVecType->getElementType()->isIntegerTy() ||
+      ResultVecType->getElementCount().getFixedValue() % 2 != 0)
     return nullptr;
 
   // Check pattern existance
-  Value *A, *B, *Greater1, *Greater2, *Greater;
+  Value *A, *B, *Greater1, *Greater2;
   CmpPredicate PredEq;
-  SmallVector<int> MaskLower = {0, 0, 2, 2};
-  SmallVector<int> MaskUpper = {1, 1, 3, 3};
-
-  auto GreaterLower = m_SExtOrSelf(m_Shuffle(
-      m_SExtOrSelf(m_Value(Greater1)), m_Poison(), m_SpecificMask(MaskLower)));
-  auto GreaterUpper = m_SExtOrSelf(m_Shuffle(
-      m_SExtOrSelf(m_Value(Greater2)), m_Poison(), m_SpecificMask(MaskUpper)));
-  auto EqUpper = m_Shuffle(m_c_ICmp(PredEq, m_Value(A), m_Value(B)), m_Poison(),
-                           m_SpecificMask(MaskUpper));
-  auto EqUpperSExt = m_SExtOrSelf(
+  ArrayRef<int> MaskLower, MaskUpper1, MaskUpper2;
+
+  auto GreaterLower = m_SExtOrSelf(m_Shuffle(m_SExtOrSelf(m_Value(Greater1)),
+                                             m_Poison(), m_Mask(MaskLower)));
+  auto GreaterUpper = m_SExtOrSelf(m_Shuffle(m_SExtOrSelf(m_Value(Greater2)),
+                                             m_Poison(), m_Mask(MaskUpper1)));
+  auto EqUpper = m_SExtOrSelf(
       m_Shuffle(m_SExtOrSelf(m_c_ICmp(PredEq, m_Value(A), m_Value(B))),
-                m_Poison(), m_SpecificMask(MaskUpper)));
+                m_Poison(), m_Mask(MaskUpper2)));
+  auto And =
+      m_SExtOrSelf(m_CombineOr(m_c_And(GreaterLower, EqUpper),
+                               m_Select(EqUpper, GreaterLower, m_Zero())));
+  auto Or = m_SExtOrSelf(m_c_Or(And, GreaterUpper));
+
+  if (!match(&I, Or) || Greater1 != Greater2 || MaskUpper1 != MaskUpper2 ||
+      PredEq != ICmpInst::ICMP_EQ)
+    return nullptr;
 
-  if (!match(&I, m_c_Or(m_CombineOr(m_c_And(GreaterLower, EqUpperSExt),
-                                    m_Select(EqUpper, GreaterLower, m_Zero())),
-                        GreaterUpper)) ||
-      Greater1 != Greater2 || PredEq != ICmpInst::ICMP_EQ)
+  auto *OldVecType = cast<VectorType>(A->getType());
+
+  if (OldVecType != ResultVecType)
     return nullptr;
 
-  Greater = Greater1;
+  // Example lower shuffle mask: {0, 0, 2, 2}
+  // Example upper shuffle mask: {1, 1, 3, 3}
+  for (auto I = 0; I < static_cast<int>(MaskLower.size()); I += 2)
+    if (MaskLower[I] != I || MaskLower[I + 1] != I || MaskUpper1[I] != I + 1 ||
+        MaskUpper1[I + 1] != I + 1)
+      return nullptr;
 
+  // Check greater comparison
   auto *Zero = ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 0);
-  auto *Flip =
+  auto *MsbFlip =
       ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 0x80000000);
-  auto *FlipLower = ConstantVector::get({Flip, Zero, Flip, Zero});
-  auto *FlipAll = ConstantVector::get({Flip, Flip, Flip, Flip});
-
+  Value *MsbFlipLower1 = nullptr, *MsbFlipLower2 = nullptr;
   CmpPredicate PredGt;
+
   auto UGt = m_c_ICmp(PredGt, m_Specific(A), m_Specific(B));
-  auto UGtAlt = m_c_ICmp(PredGt, m_c_Xor(m_Specific(A), m_Specific(FlipAll)),
-                         m_c_Xor(m_Specific(B), m_Specific(FlipAll)));
-  auto SGt = m_c_ICmp(PredGt, m_c_Xor(m_Specific(A), m_Specific(FlipLower)),
-                      m_c_Xor(m_Specific(B), m_Specific(FlipLower)));
+  auto UGtAlt = m_c_ICmp(
+      PredGt, m_c_Xor(m_Specific(A), m_ConstantSplat(m_ConstantInt(MsbFlip))),
+      m_c_Xor(m_Specific(B), m_ConstantSplat(m_ConstantInt(MsbFlip))));
+  auto SGt = m_c_ICmp(PredGt, m_c_Xor(m_Specific(A), m_Value(MsbFlipLower1)),
+                      m_c_Xor(m_Specific(B), m_Value(MsbFlipLower2)));
 
-  if (!(match(Greater, UGt) &&
+  if (!(match(Greater1, UGt) &&
         (PredGt == ICmpInst::ICMP_UGT || PredGt == ICmpInst::ICMP_ULT)) &&
-      !((match(Greater, SGt) || match(Greater, UGtAlt)) &&
+      !((match(Greater1, SGt) || match(Greater1, UGtAlt)) &&
         (PredGt == ICmpInst::ICMP_SGT || PredGt == ICmpInst::ICMP_SLT)))
     return nullptr;
 
-  LLVM_DEBUG(dbgs() << "IC: Folding V2CmpGt using V4CmpGt pattern" << '\n');
+  auto OldElementCount = OldVecType->getElementCount().getFixedValue();
+
+  if (MsbFlipLower1) {
+    auto *MsbFlipLower = dyn_cast<ConstantDataVector>(MsbFlipLower1);
+    if (!MsbFlipLower || MsbFlipLower2 != MsbFlipLower)
+      return nullptr;
+
+    // Example MSB flip lower mask: {0x80000000, 0, 0x80000000, 0}
+    for (auto I = 0; I < static_cast<int>(OldElementCount); I += 2)
+      if (MsbFlipLower->getAggregateElement(I) != MsbFlip ||
+          MsbFlipLower->getAggregateElement(I + 1) != Zero)
+        return nullptr;
+  }
+
+  LLVM_DEBUG(dbgs() << "IC: Folding Vn2im CmpGt using V2nim CmpGt pattern"
+                    << '\n');
 
   // Perform folding
-  auto *NewElementType = IntegerType::get(I.getContext(), 64);
-  auto *NewVecType = VectorType::get(NewElementType, 2, false);
+  auto OldElementWidth = OldVecType->getElementType()->getIntegerBitWidth();
+  auto *NewElementType = IntegerType::get(I.getContext(), OldElementWidth * 2);
+  auto *NewVecType =
+      VectorType::get(NewElementType, OldElementCount / 2, false);
   auto *BitCastA = Builder.CreateBitCast(A, NewVecType);
   auto *BitCastB = Builder.CreateBitCast(B, NewVecType);
   auto *Cmp = Builder.CreateICmp(PredGt, BitCastA, BitCastB);
diff --git a/llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern.ll b/llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern.ll
index 6a853a20ba4ad..d9d0266e39e58 100644
--- a/llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern.ll
+++ b/llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern.ll
@@ -51,16 +51,11 @@ define <4 x i32> @alt_cmpgt_epi64_select(<4 x i32> noundef %a, <4 x i32> noundef
 define <4 x i32> @alt_cmpgt_epi64_sext(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_sext(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT:    [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
-; CHECK-NEXT:    [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
-; CHECK-NEXT:    [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_A]], [[XOR_B]]
-; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i1> [[GT]], <4 x i1> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i1> [[GT]], <4 x i1> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <4 x i1> [[EQ]], <4 x i1> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT:    [[SELECT:%.*]] = select <4 x i1> [[EQ_0]], <4 x i1> [[GT_0]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[OR:%.*]] = or <4 x i1> [[SELECT]], [[GT_1]]
-; CHECK-NEXT:    [[SEXT_OR:%.*]] = sext <4 x i1> [[OR]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[SEXT_OR:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[SEXT_OR]]
 ;
   %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
@@ -79,16 +74,11 @@ define <4 x i32> @alt_cmpgt_epi64_sext(<4 x i32> noundef %a, <4 x i32> noundef %
 define <6 x i32> @alt_cmpgt_epi64_generalized(<6 x i32> noundef %a, <6 x i32> noundef %b) {
 ; CHECK-LABEL: define <6 x i32> @alt_cmpgt_epi64_generalized(
 ; CHECK-SAME: <6 x i32> noundef [[A:%.*]], <6 x i32> noundef [[B:%.*]]) {
-; CHECK-NEXT:    [[XOR_A:%.*]] = xor <6 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0, i32 -2147483648, i32 0>
-; CHECK-NEXT:    [[XOR_B:%.*]] = xor <6 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0, i32 -2147483648, i32 0>
-; CHECK-NEXT:    [[GT:%.*]] = icmp sgt <6 x i32> [[XOR_A]], [[XOR_B]]
-; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <6 x i1> [[GT]] to <6 x i32>
-; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <6 x i32> [[SEXT_GT]], <6 x i32> poison, <6 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4>
-; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <6 x i32> [[SEXT_GT]], <6 x i32> poison, <6 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5>
-; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <6 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <6 x i1> [[EQ]], <6 x i1> poison, <6 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5>
-; CHECK-NEXT:    [[SELECT:%.*]] = select <6 x i1> [[EQ_0]], <6 x i32> [[GT_0]], <6 x i32> zeroinitializer
-; CHECK-NEXT:    [[OR:%.*]] = or <6 x i32> [[SELECT]], [[GT_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <6 x i32> [[A]] to <3 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <6 x i32> [[B]] to <3 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <3 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <3 x i1> [[TMP3]] to <3 x i64>
+; CHECK-NEXT:    [[OR:%.*]] = bitcast <3 x i64> [[TMP4]] to <6 x i32>
 ; CHECK-NEXT:    ret <6 x i32> [[OR]]
 ;
   %xor.a = xor <6 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0, i32 -2147483648, i32 0>
@@ -177,6 +167,66 @@ define <4 x i32> @alt_cmpgt_epi64_commutated_gt(<4 x i32> noundef %a, <4 x i32>
 define <4 x i32> @alt_cmpgt_epi64_neg_0(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_neg_0(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_A]], [[XOR_B]]
+; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
+; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 0>
+  %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %gt = icmp sgt <4 x i32> %xor.a, %xor.b
+  %sext.gt = sext <4 x i1> %gt to <4 x i32>
+  %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %eq = icmp eq <4 x i32> %a, %b
+  %sext.eq = sext <4 x i1> %eq to <4 x i32>
+  %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %and = and <4 x i32> %gt.0, %eq.0
+  %or = or <4 x i32> %and, %gt.1
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @alt_cmpgt_epi64_neg_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_neg_1(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[GT:%.*]] = icmp ugt <4 x i32> [[XOR_A]], [[XOR_B]]
+; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
+; CHECK-NEXT:    [[AND:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[AND1:%.*]] = and <4 x i32> [[GT_0]], [[AND]]
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND1]], [[GT_1]]
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %gt = icmp ugt <4 x i32> %xor.a, %xor.b
+  %sext.gt = sext <4 x i1> %gt to <4 x i32>
+  %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %eq = icmp eq <4 x i32> %a, %b
+  %sext.eq = sext <4 x i1> %eq to <4 x i32>
+  %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %and = and <4 x i32> %gt.0, %eq.0
+  %or = or <4 x i32> %and, %gt.1
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @alt_cmpgt_epi64_neg_2(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_neg_2(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
 ; CHECK-NEXT:    [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
 ; CHECK-NEXT:    [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
 ; CHECK-NEXT:    [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_B]], [[XOR_A]]
@@ -203,8 +253,8 @@ define <4 x i32> @alt_cmpgt_epi64_neg_0(<4 x i32> noundef %a, <4 x i32> noundef
   ret <4 x i32> %or
 }
 
-define <4 x i32> @alt_cmpgt_epi64_neg_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
-; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_neg_1(
+define <4 x i32> @alt_cmpgt_epi64_neg_3(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_neg_3(
 ; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
 ; CHECK-NEXT:    [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
 ; CHECK-NEXT:    [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
@@ -231,3 +281,33 @@ define <4 x i32> @alt_cmpgt_epi64_neg_1(<4 x i32> noundef %a, <4 x i32> noundef
   %or = or <4 x i32> %and, %gt.0
   ret <4 x i32> %or
 }
+
+define <4 x i32> @alt_cmpgt_epu64_neg_0(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epu64_neg_0(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 0, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+; CHECK-NEXT:    [[XOR_B:%.*]] = xor <4 x i32> [[B]], splat (i32 -2147483648)
+; CHECK-NEXT:    [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_A]], [[XOR_B]]
+; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
+; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %xor.a = xor <4 x i32> %a, <i32 0, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+  %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+  %gt = icmp sgt <4 x i32> %xor.a, %xor.b
+  %sext.gt = sext <4 x i1> %gt to <4 x i32>
+  %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %eq = icmp eq <4 x i32> %a, %b
+  %sext.eq = sext <4 x i1> %eq to <4 x i32>
+  %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %and = and <4 x i32> %gt.0, %eq.0
+  %or = or <4 x i32> %and, %gt.1
+  ret <4 x i32> %or
+}

>From 3391977d1059df4da47c14c4d64bb933f89bb342 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Wed, 11 Mar 2026 09:11:02 +0700
Subject: [PATCH 18/19] Refactor folds into single function & replace auto when
 type is not explicit from context

---
 .../InstCombine/InstCombineAndOrXor.cpp       |  4 +-
 .../InstCombine/InstCombineCasts.cpp          |  5 +-
 .../InstCombine/InstCombineInternal.h         |  6 +-
 .../InstCombine/InstCombineSelect.cpp         |  2 +-
 .../InstCombine/InstructionCombining.cpp      | 95 ++++++++++---------
 5 files changed, 58 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index a70a2ec49fcd1..803f9f5aaf053 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2898,7 +2898,7 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
                                       /*SimplifyOnly*/ false, *this))
     return BinaryOperator::CreateAnd(Op0, V);
 
-  if (auto *Folded = foldVni2mCmpEqUsingV2nim(I))
+  if (Instruction *Folded = foldVecCmpOnHalfElementSize(I))
     return Folded;
 
   return nullptr;
@@ -4652,7 +4652,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
   if (Value *Res = FoldOrOfSelectSmaxToAbs(I, Builder))
     return replaceInstUsesWith(I, Res);
 
-  if (Instruction *Folded = foldVni2mCmpGtUsingV2nim(I))
+  if (Instruction *Folded = foldVecCmpOnHalfElementSize(I))
     return Folded;
 
   return nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index caa820b3f7f7f..f6acf95a07e4a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1896,10 +1896,7 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
     }
   }
 
-  if (auto *Folded = foldVni2mCmpEqUsingV2nim(Sext))
-    return Folded;
-
-  if (auto *Folded = foldVni2mCmpGtUsingV2nim(Sext))
+  if (Instruction *Folded = foldVecCmpOnHalfElementSize(Sext))
     return Folded;
 
   return nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 5a468a7ab30ce..b2f588daa32f2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -423,6 +423,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   Value *foldAndOrOfICmpsUsingRanges(ICmpInst *ICmp1, ICmpInst *ICmp2,
                                      bool IsAnd);
 
+  Instruction *foldVecCmpOnHalfElementSize(Instruction &I);
+
   /// Optimize (fcmp)&(fcmp) or (fcmp)|(fcmp).
   /// NOTE: Unlike most of instcombine, this returns a Value which should
   /// already be inserted into the function.
@@ -484,10 +486,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
     return Sel;
   }
 
-  Instruction *foldVni2mCmpEqUsingV2nim(Instruction &I);
-
-  Instruction *foldVni2mCmpGtUsingV2nim(Instruction &I);
-
 public:
   /// Create and insert the idiom we use to indicate a block is unreachable
   /// without having to rewrite the CFG from within InstCombine.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 99ef4c02e37a5..d1c0cd3433b8c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -4982,7 +4982,7 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
     return CallInst::Create(Scmp, {CmpLHS, ConstantInt::get(SI.getType(), 0)});
   }
 
-  if (auto *Folded = foldVni2mCmpEqUsingV2nim(SI))
+  if (Instruction *Folded = foldVecCmpOnHalfElementSize(SI))
     return Folded;
 
   return nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 4bc687922759a..a09ae95debcfe 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1311,14 +1311,9 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) {
 //
 // Bitwise AND between the upper and lower parts can be achived by performing
 // the operation between the original and shuffled equality vector.
-Instruction *InstCombinerImpl::foldVni2mCmpEqUsingV2nim(Instruction &I) {
-  auto *ResultVecType = dyn_cast<VectorType>(I.getType());
-
-  if (!ResultVecType || ResultVecType->isScalableTy() ||
-      !ResultVecType->getElementType()->isIntegerTy() ||
-      ResultVecType->getElementCount().getFixedValue() % 2 != 0)
-    return nullptr;
-
+static Value *foldVecCmpEqOnHalfElementSize(Instruction &I,
+                                            FixedVectorType *ResultVecType,
+                                            InstCombiner::BuilderTy &Builder) {
   // Check pattern existance
   Value *L, *R;
   CmpPredicate Pred;
@@ -1332,13 +1327,13 @@ Instruction *InstCombinerImpl::foldVni2mCmpEqUsingV2nim(Instruction &I) {
       Pred != CmpInst::ICMP_EQ)
     return nullptr;
 
-  auto *OldVecType = cast<VectorType>(L->getType());
+  auto *OldVecType = cast<FixedVectorType>(L->getType());
 
   if (OldVecType != ResultVecType)
     return nullptr;
 
   // Example shuffle mask: {1, 0, 3, 2}
-  for (auto I = 0; I < static_cast<int>(Mask.size()); I += 2)
+  for (int I = 0; I < static_cast<int>(Mask.size()); I += 2)
     if (Mask[I] != I + 1 || Mask[I + 1] != I)
       return nullptr;
 
@@ -1346,18 +1341,18 @@ Instruction *InstCombinerImpl::foldVni2mCmpEqUsingV2nim(Instruction &I) {
                     << '\n');
 
   // Perform folding
-  auto OldElementCount = OldVecType->getElementCount().getFixedValue();
-  auto OldElementWidth = OldVecType->getElementType()->getIntegerBitWidth();
-  auto *NewElementType = IntegerType::get(I.getContext(), OldElementWidth * 2);
-  auto *NewVecType =
+  unsigned int OldElementCount = OldVecType->getElementCount().getFixedValue();
+  unsigned int OldElementWidth = OldVecType->getScalarSizeInBits();
+  Type *NewElementType = IntegerType::get(I.getContext(), OldElementWidth * 2);
+  Type *NewVecType =
       VectorType::get(NewElementType, OldElementCount / 2, false);
-  auto *BitCastL = Builder.CreateBitCast(L, NewVecType);
-  auto *BitCastR = Builder.CreateBitCast(R, NewVecType);
-  auto *Cmp = Builder.CreateICmp(Pred, BitCastL, BitCastR);
-  auto *SExt = Builder.CreateSExt(Cmp, NewVecType);
-  auto *BitCastCmp = Builder.CreateBitCast(SExt, OldVecType);
+  Value *BitCastL = Builder.CreateBitCast(L, NewVecType);
+  Value *BitCastR = Builder.CreateBitCast(R, NewVecType);
+  Value *Cmp = Builder.CreateICmp(Pred, BitCastL, BitCastR);
+  Value *SExt = Builder.CreateSExt(Cmp, NewVecType);
+  Value *BitCastCmp = Builder.CreateBitCast(SExt, OldVecType);
 
-  return replaceInstUsesWith(I, BitCastCmp);
+  return BitCastCmp;
 }
 
 // Folds patterns which use comparisons on <2N x iM> type for a <N x i2M>
@@ -1379,14 +1374,9 @@ Instruction *InstCombinerImpl::foldVni2mCmpEqUsingV2nim(Instruction &I) {
 // Note that comparisons of the lower parts are always unsigned comparisons
 // regardless of the resulting signedness. Also note that, unsigned comparison
 // can be derived from signed comparison by flipping the MSB of both operands.
-Instruction *InstCombinerImpl::foldVni2mCmpGtUsingV2nim(Instruction &I) {
-  auto *ResultVecType = dyn_cast<VectorType>(I.getType());
-
-  if (!ResultVecType || ResultVecType->isScalableTy() ||
-      !ResultVecType->getElementType()->isIntegerTy() ||
-      ResultVecType->getElementCount().getFixedValue() % 2 != 0)
-    return nullptr;
-
+static Value *foldVecCmpGtOnHalfElementSize(Instruction &I,
+                                            FixedVectorType *ResultVecType,
+                                            InstCombiner::BuilderTy &Builder) {
   // Check pattern existence
   Value *A, *B, *Greater1, *Greater2;
   CmpPredicate PredEq;
@@ -1408,21 +1398,22 @@ Instruction *InstCombinerImpl::foldVni2mCmpGtUsingV2nim(Instruction &I) {
       PredEq != ICmpInst::ICMP_EQ)
     return nullptr;
 
-  auto *OldVecType = cast<VectorType>(A->getType());
+  auto *OldVecType = cast<FixedVectorType>(A->getType());
 
   if (OldVecType != ResultVecType)
     return nullptr;
 
   // Example lower shuffle mask: {0, 0, 2, 2}
   // Example upper shuffle mask: {1, 1, 3, 3}
-  for (auto I = 0; I < static_cast<int>(MaskLower.size()); I += 2)
+  for (int I = 0; I < static_cast<int>(MaskLower.size()); I += 2)
     if (MaskLower[I] != I || MaskLower[I + 1] != I || MaskUpper1[I] != I + 1 ||
         MaskUpper1[I + 1] != I + 1)
       return nullptr;
 
   // Check greater comparison
-  auto *Zero = ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 0);
-  auto *MsbFlip =
+  ConstantInt *Zero =
+      ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 0);
+  ConstantInt *MsbFlip =
       ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 0x80000000);
   Value *MsbFlipLower1 = nullptr, *MsbFlipLower2 = nullptr;
   CmpPredicate PredGt;
@@ -1440,7 +1431,7 @@ Instruction *InstCombinerImpl::foldVni2mCmpGtUsingV2nim(Instruction &I) {
         (PredGt == ICmpInst::ICMP_SGT || PredGt == ICmpInst::ICMP_SLT)))
     return nullptr;
 
-  auto OldElementCount = OldVecType->getElementCount().getFixedValue();
+  unsigned int OldElementCount = OldVecType->getElementCount().getFixedValue();
 
   if (MsbFlipLower1) {
     auto *MsbFlipLower = dyn_cast<ConstantDataVector>(MsbFlipLower1);
@@ -1448,7 +1439,7 @@ Instruction *InstCombinerImpl::foldVni2mCmpGtUsingV2nim(Instruction &I) {
       return nullptr;
 
     // Example MSB flip lower mask: {0x80000000, 0, 0x80000000, 0}
-    for (auto I = 0; I < static_cast<int>(OldElementCount); I += 2)
+    for (int I = 0; I < static_cast<int>(OldElementCount); I += 2)
       if (MsbFlipLower->getAggregateElement(I) != MsbFlip ||
           MsbFlipLower->getAggregateElement(I + 1) != Zero)
         return nullptr;
@@ -1458,17 +1449,35 @@ Instruction *InstCombinerImpl::foldVni2mCmpGtUsingV2nim(Instruction &I) {
                     << '\n');
 
   // Perform folding
-  auto OldElementWidth = OldVecType->getElementType()->getIntegerBitWidth();
-  auto *NewElementType = IntegerType::get(I.getContext(), OldElementWidth * 2);
-  auto *NewVecType =
+  unsigned int OldElementWidth = OldVecType->getScalarSizeInBits();
+  Type *NewElementType = IntegerType::get(I.getContext(), OldElementWidth * 2);
+  Type *NewVecType =
       VectorType::get(NewElementType, OldElementCount / 2, false);
-  auto *BitCastA = Builder.CreateBitCast(A, NewVecType);
-  auto *BitCastB = Builder.CreateBitCast(B, NewVecType);
-  auto *Cmp = Builder.CreateICmp(PredGt, BitCastA, BitCastB);
-  auto *SExt = Builder.CreateSExt(Cmp, NewVecType);
-  auto *BitCastCmp = Builder.CreateBitCast(SExt, OldVecType);
+  Value *BitCastA = Builder.CreateBitCast(A, NewVecType);
+  Value *BitCastB = Builder.CreateBitCast(B, NewVecType);
+  Value *Cmp = Builder.CreateICmp(PredGt, BitCastA, BitCastB);
+  Value *SExt = Builder.CreateSExt(Cmp, NewVecType);
+  Value *BitCastCmp = Builder.CreateBitCast(SExt, OldVecType);
+
+  return BitCastCmp;
+}
 
-  return replaceInstUsesWith(I, BitCastCmp);
+// Folds patterns which use comparisons on <2N x iM> type for a <N x i2M>
+// comparison.
+Instruction *InstCombinerImpl::foldVecCmpOnHalfElementSize(Instruction &I) {
+  auto *ResultVecType = dyn_cast<FixedVectorType>(I.getType());
+
+  if (!ResultVecType || !ResultVecType->getElementType()->isIntegerTy() ||
+      ResultVecType->getElementCount().getFixedValue() % 2 != 0)
+    return nullptr;
+
+  if (Value *V = foldVecCmpEqOnHalfElementSize(I, ResultVecType, Builder))
+    return replaceInstUsesWith(I, V);
+
+  if (Value *V = foldVecCmpGtOnHalfElementSize(I, ResultVecType, Builder))
+    return replaceInstUsesWith(I, V);
+
+  return nullptr;
 }
 
 static std::optional<std::pair<Value *, Value *>>

>From db017d38e4ca0a73cca6889ebb6f20f0810ac2e2 Mon Sep 17 00:00:00 2001
From: Fuad Ismail <fuad1502 at gmail.com>
Date: Wed, 11 Mar 2026 11:00:55 +0700
Subject: [PATCH 19/19] Handle endianness when folding vector greater / less
 than compare using half element size

---
 .../InstCombine/InstructionCombining.cpp      | 30 ++++---
 ...2m-cmpgt-using-v2nim-pattern-big-endian.ll | 87 +++++++++++++++++++
 .../fold-vni2m-cmpgt-using-v2nim-pattern.ll   |  1 +
 3 files changed, 108 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern-big-endian.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index a09ae95debcfe..c37d45b28e8ea 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1376,6 +1376,7 @@ static Value *foldVecCmpEqOnHalfElementSize(Instruction &I,
 // can be derived from signed comparison by flipping the MSB of both operands.
 static Value *foldVecCmpGtOnHalfElementSize(Instruction &I,
                                             FixedVectorType *ResultVecType,
+                                            bool IsBigEndian,
                                             InstCombiner::BuilderTy &Builder) {
   // Check pattern existence
   Value *A, *B, *Greater1, *Greater2;
@@ -1403,12 +1404,16 @@ static Value *foldVecCmpGtOnHalfElementSize(Instruction &I,
   if (OldVecType != ResultVecType)
     return nullptr;
 
-  // Example lower shuffle mask: {0, 0, 2, 2}
-  // Example upper shuffle mask: {1, 1, 3, 3}
-  for (int I = 0; I < static_cast<int>(MaskLower.size()); I += 2)
-    if (MaskLower[I] != I || MaskLower[I + 1] != I || MaskUpper1[I] != I + 1 ||
-        MaskUpper1[I + 1] != I + 1)
+  // For little endian,
+  // example lower shuffle mask: {0, 0, 2, 2},
+  // example upper shuffle mask: {1, 1, 3, 3}
+  for (int I = 0; I < static_cast<int>(MaskLower.size()); I += 2) {
+    int LowerIdx = IsBigEndian ? I + 1 : I;
+    int UpperIdx = IsBigEndian ? I : I + 1;
+    if (MaskLower[I] != LowerIdx || MaskLower[I + 1] != LowerIdx ||
+        MaskUpper1[I] != UpperIdx || MaskUpper1[I + 1] != UpperIdx)
       return nullptr;
+  }
 
   // Check greater comparison
   ConstantInt *Zero =
@@ -1438,11 +1443,15 @@ static Value *foldVecCmpGtOnHalfElementSize(Instruction &I,
     if (!MsbFlipLower || MsbFlipLower2 != MsbFlipLower)
       return nullptr;
 
-    // Example MSB flip lower mask: {0x80000000, 0, 0x80000000, 0}
-    for (int I = 0; I < static_cast<int>(OldElementCount); I += 2)
-      if (MsbFlipLower->getAggregateElement(I) != MsbFlip ||
-          MsbFlipLower->getAggregateElement(I + 1) != Zero)
+    // For little endian,
+    // example MSB flip lower mask: {0x80000000, 0, 0x80000000, 0}
+    for (int I = 0; I < static_cast<int>(OldElementCount); I += 2) {
+      int LowerIdx = IsBigEndian ? I + 1 : I;
+      int UpperIdx = IsBigEndian ? I : I + 1;
+      if (MsbFlipLower->getAggregateElement(LowerIdx) != MsbFlip ||
+          MsbFlipLower->getAggregateElement(UpperIdx) != Zero)
         return nullptr;
+    }
   }
 
   LLVM_DEBUG(dbgs() << "IC: Folding Vn2im CmpGt using V2nim CmpGt pattern"
@@ -1474,7 +1483,8 @@ Instruction *InstCombinerImpl::foldVecCmpOnHalfElementSize(Instruction &I) {
   if (Value *V = foldVecCmpEqOnHalfElementSize(I, ResultVecType, Builder))
     return replaceInstUsesWith(I, V);
 
-  if (Value *V = foldVecCmpGtOnHalfElementSize(I, ResultVecType, Builder))
+  if (Value *V = foldVecCmpGtOnHalfElementSize(I, ResultVecType,
+                                               DL.isBigEndian(), Builder))
     return replaceInstUsesWith(I, V);
 
   return nullptr;
diff --git a/llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern-big-endian.ll b/llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern-big-endian.ll
new file mode 100644
index 0000000000000..e6cefc09666e8
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern-big-endian.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+target datalayout = "E"
+
+define <4 x i32> @alt_cmpgt_epi64(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[OR:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %xor.a = xor <4 x i32> %a, <i32 0, i32 -2147483648, i32 0, i32 -2147483648>
+  %xor.b = xor <4 x i32> %b, <i32 0, i32 -2147483648, i32 0, i32 -2147483648>
+  %gt = icmp sgt <4 x i32> %xor.a, %xor.b
+  %sext.gt = sext <4 x i1> %gt to <4 x i32>
+  %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %eq = icmp eq <4 x i32> %a, %b
+  %sext.eq = sext <4 x i1> %eq to <4 x i32>
+  %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %and = and <4 x i32> %gt.0, %eq.0
+  %or = or <4 x i32> %and, %gt.1
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @alt_cmpgt_epi64_neg_0(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_neg_0(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+; CHECK-NEXT:    [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_A]], [[XOR_B]]
+; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
+; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %xor.a = xor <4 x i32> %a, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %xor.b = xor <4 x i32> %b, <i32 -2147483648, i32 0, i32 -2147483648, i32 0>
+  %gt = icmp sgt <4 x i32> %xor.a, %xor.b
+  %sext.gt = sext <4 x i1> %gt to <4 x i32>
+  %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %eq = icmp eq <4 x i32> %a, %b
+  %sext.eq = sext <4 x i1> %eq to <4 x i32>
+  %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %and = and <4 x i32> %gt.0, %eq.0
+  %or = or <4 x i32> %and, %gt.1
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @alt_cmpgt_epi64_neg_1(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64_neg_1(
+; CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) {
+; CHECK-NEXT:    [[XOR_A:%.*]] = xor <4 x i32> [[A]], <i32 0, i32 -2147483648, i32 0, i32 -2147483648>
+; CHECK-NEXT:    [[XOR_B:%.*]] = xor <4 x i32> [[B]], <i32 0, i32 -2147483648, i32 0, i32 -2147483648>
+; CHECK-NEXT:    [[GT:%.*]] = icmp sgt <4 x i32> [[XOR_A]], [[XOR_B]]
+; CHECK-NEXT:    [[SEXT_GT:%.*]] = sext <4 x i1> [[GT]] to <4 x i32>
+; CHECK-NEXT:    [[GT_0:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[GT_1:%.*]] = shufflevector <4 x i32> [[SEXT_GT]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[EQ:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEXT_EQ:%.*]] = sext <4 x i1> [[EQ]] to <4 x i32>
+; CHECK-NEXT:    [[EQ_0:%.*]] = shufflevector <4 x i32> [[SEXT_EQ]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[GT_0]], [[EQ_0]]
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND]], [[GT_1]]
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %xor.a = xor <4 x i32> %a, <i32 0, i32 -2147483648, i32 0, i32 -2147483648>
+  %xor.b = xor <4 x i32> %b, <i32 0, i32 -2147483648, i32 0, i32 -2147483648>
+  %gt = icmp sgt <4 x i32> %xor.a, %xor.b
+  %sext.gt = sext <4 x i1> %gt to <4 x i32>
+  %gt.0 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %gt.1 = shufflevector <4 x i32> %sext.gt, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %eq = icmp eq <4 x i32> %a, %b
+  %sext.eq = sext <4 x i1> %eq to <4 x i32>
+  %eq.0 = shufflevector <4 x i32> %sext.eq, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %and = and <4 x i32> %gt.0, %eq.0
+  %or = or <4 x i32> %and, %gt.1
+  ret <4 x i32> %or
+}
diff --git a/llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern.ll b/llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern.ll
index d9d0266e39e58..b7e249988989f 100644
--- a/llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern.ll
+++ b/llvm/test/Transforms/InstCombine/fold-vni2m-cmpgt-using-v2nim-pattern.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+target datalayout = "e"
 
 define <4 x i32> @alt_cmpgt_epi64(<4 x i32> noundef %a, <4 x i32> noundef %b) {
 ; CHECK-LABEL: define <4 x i32> @alt_cmpgt_epi64(



More information about the llvm-commits mailing list