[llvm] [VPlan] Introduce m_Cmp; match more compares (PR #154771)

Thu Aug 21 08:25:01 PDT 2025

https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/154771

>From c7e4b8a42bdc310457bd970ea32ad1aa4d3ef323 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 21 Aug 2025 15:03:50 +0100
Subject: [PATCH 1/4] [VPlan] Introduce m_Cmp; match more compares

Extend [Specific]Cmp_match to handle floating-point compares, and
introduce m_Cmp that matches both integer and floating-point compares.
Use it in simplifyRecipe to match and simplify the general case of
compares.
---
 .../Transforms/Vectorize/VPlanPatternMatch.h  | 65 ++++++++++++++-----
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 22 +++----
 .../LoopVectorize/AArch64/masked-call.ll      | 20 +++---
 .../Transforms/LoopVectorize/select-cmp.ll    | 57 ++++++----------
 4 files changed, 87 insertions(+), 77 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 18ab7ddb425ab..75d5c4a472e2e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -395,16 +395,22 @@ m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
   return m_c_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
 }
 
-/// ICmp_match is a variant of BinaryRecipe_match that also binds the comparison
-/// predicate.
-template <typename Op0_t, typename Op1_t> struct ICmp_match {
+/// Cmp_match is a variant of BinaryRecipe_match that also binds the comparison
+/// predicate. Opcodes must be within Instruction::ICmp or Instruction::FCmp.
+template <typename Op0_t, typename Op1_t, unsigned... Opcodes>
+struct Cmp_match {
+  static_assert(sizeof...(Opcodes) == 1 || sizeof...(Opcodes) == 2);
+  static_assert((((Opcodes == Instruction::ICmp) ||
+                  (Opcodes == Instruction::FCmp)) ||
+                 ...));
+
   CmpPredicate *Predicate = nullptr;
   Op0_t Op0;
   Op1_t Op1;
 
-  ICmp_match(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1)
+  Cmp_match(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1)
       : Predicate(&Pred), Op0(Op0), Op1(Op1) {}
-  ICmp_match(const Op0_t &Op0, const Op1_t &Op1) : Op0(Op0), Op1(Op1) {}
+  Cmp_match(const Op0_t &Op0, const Op1_t &Op1) : Op0(Op0), Op1(Op1) {}
 
   bool match(const VPValue *V) const {
     auto *DefR = V->getDefiningRecipe();
@@ -412,7 +418,7 @@ template <typename Op0_t, typename Op1_t> struct ICmp_match {
   }
 
   bool match(const VPRecipeBase *V) const {
-    if (m_Binary<Instruction::ICmp>(Op0, Op1).match(V)) {
+    if ((m_Binary<Opcodes>(Op0, Op1).match(V) || ...)) {
       if (Predicate)
         *Predicate = cast<VPRecipeWithIRFlags>(V)->getPredicate();
       return true;
@@ -421,38 +427,63 @@ template <typename Op0_t, typename Op1_t> struct ICmp_match {
   }
 };
 
-/// SpecificICmp_match is a variant of ICmp_match that matches the comparison
+/// SpecificCmp_match is a variant of Cmp_match that matches the comparison
 /// predicate, instead of binding it.
-template <typename Op0_t, typename Op1_t> struct SpecificICmp_match {
+template <typename Op0_t, typename Op1_t, unsigned... Opcodes>
+struct SpecificCmp_match {
   const CmpPredicate Predicate;
   Op0_t Op0;
   Op1_t Op1;
 
-  SpecificICmp_match(CmpPredicate Pred, const Op0_t &LHS, const Op1_t &RHS)
+  SpecificCmp_match(CmpPredicate Pred, const Op0_t &LHS, const Op1_t &RHS)
       : Predicate(Pred), Op0(LHS), Op1(RHS) {}
 
   bool match(const VPValue *V) const {
     CmpPredicate CurrentPred;
-    return ICmp_match<Op0_t, Op1_t>(CurrentPred, Op0, Op1).match(V) &&
+    return Cmp_match<Op0_t, Op1_t, Opcodes...>(CurrentPred, Op0, Op1)
+               .match(V) &&
            CmpPredicate::getMatching(CurrentPred, Predicate);
   }
 };
 
 template <typename Op0_t, typename Op1_t>
-inline ICmp_match<Op0_t, Op1_t> m_ICmp(const Op0_t &Op0, const Op1_t &Op1) {
-  return ICmp_match<Op0_t, Op1_t>(Op0, Op1);
+inline Cmp_match<Op0_t, Op1_t, Instruction::ICmp> m_ICmp(const Op0_t &Op0,
+                                                         const Op1_t &Op1) {
+  return Cmp_match<Op0_t, Op1_t, Instruction::ICmp>(Op0, Op1);
 }
 
 template <typename Op0_t, typename Op1_t>
-inline ICmp_match<Op0_t, Op1_t> m_ICmp(CmpPredicate &Pred, const Op0_t &Op0,
-                                       const Op1_t &Op1) {
-  return ICmp_match<Op0_t, Op1_t>(Pred, Op0, Op1);
+inline Cmp_match<Op0_t, Op1_t, Instruction::ICmp>
+m_ICmp(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1) {
+  return Cmp_match<Op0_t, Op1_t, Instruction::ICmp>(Pred, Op0, Op1);
 }
 
 template <typename Op0_t, typename Op1_t>
-inline SpecificICmp_match<Op0_t, Op1_t>
+inline SpecificCmp_match<Op0_t, Op1_t, Instruction::ICmp>
 m_SpecificICmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) {
-  return SpecificICmp_match<Op0_t, Op1_t>(MatchPred, Op0, Op1);
+  return SpecificCmp_match<Op0_t, Op1_t, Instruction::ICmp>(MatchPred, Op0,
+                                                            Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline Cmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>
+m_Cmp(const Op0_t &Op0, const Op1_t &Op1) {
+  return Cmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>(Op0,
+                                                                       Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline Cmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>
+m_Cmp(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1) {
+  return Cmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>(
+      Pred, Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline SpecificCmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>
+m_SpecificCmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) {
+  return SpecificCmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>(
+      MatchPred, Op0, Op1);
 }
 
 template <typename Op0_t, typename Op1_t>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b25fc0af1fb51..e2cef0eafbec1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1106,18 +1106,16 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
       return Def->replaceAllUsesWith(A);
 
     // Try to fold Not into compares by adjusting the predicate in-place.
-    if (isa<VPWidenRecipe>(A) && A->getNumUsers() == 1) {
-      auto *WideCmp = cast<VPWidenRecipe>(A);
-      if (WideCmp->getOpcode() == Instruction::ICmp ||
-          WideCmp->getOpcode() == Instruction::FCmp) {
-        WideCmp->setPredicate(
-            CmpInst::getInversePredicate(WideCmp->getPredicate()));
-        Def->replaceAllUsesWith(WideCmp);
-        // If WideCmp doesn't have a debug location, use the one from the
-        // negation, to preserve the location.
-        if (!WideCmp->getDebugLoc() && R.getDebugLoc())
-          WideCmp->setDebugLoc(R.getDebugLoc());
-      }
+    CmpPredicate Pred;
+    if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
+        A->getNumUsers() == 1) {
+      auto *Cmp = cast<VPRecipeWithIRFlags>(A);
+      Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
+      Def->replaceAllUsesWith(Cmp);
+      // If WideCmp doesn't have a debug location, use the one from the
+      // negation, to preserve the location.
+      if (!Cmp->getDebugLoc() && R.getDebugLoc())
+        Cmp->setDebugLoc(R.getDebugLoc());
     }
   }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 11bb4d234f3f3..251150e5e20db 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -973,18 +973,16 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ult i64 1, [[TMP0]]
 ; TFA_INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFA_INTERLEAVE:       [[VECTOR_BODY]]:
-; TFA_INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[TMP19:.*]] ]
-; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP19]] ]
-; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP19]] ]
+; TFA_INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[TMP18:.*]] ]
+; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP18]] ]
+; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP18]] ]
 ; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = load double, ptr [[P2]], align 8
 ; TFA_INTERLEAVE-NEXT:    [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP7:%.*]] = fcmp ogt double [[TMP5]], 0.000000e+00
 ; TFA_INTERLEAVE-NEXT:    [[TMP8:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00
-; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = xor i1 [[TMP7]], true
-; TFA_INTERLEAVE-NEXT:    [[TMP10:%.*]] = xor i1 [[TMP8]], true
-; TFA_INTERLEAVE-NEXT:    [[TMP11:%.*]] = select i1 [[ACTIVE_LANE_MASK]], i1 [[TMP9]], i1 false
-; TFA_INTERLEAVE-NEXT:    [[TMP12:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], i1 [[TMP10]], i1 false
+; TFA_INTERLEAVE-NEXT:    [[TMP11:%.*]] = select i1 [[ACTIVE_LANE_MASK]], i1 [[TMP7]], i1 false
+; TFA_INTERLEAVE-NEXT:    [[TMP12:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], i1 [[TMP8]], i1 false
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP11]], double 1.000000e+00, double 0.000000e+00
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI3:%.*]] = select i1 [[TMP12]], double 1.000000e+00, double 0.000000e+00
 ; TFA_INTERLEAVE-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], double [[PREDPHI3]], double [[PREDPHI]]
@@ -993,11 +991,11 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP15:%.*]] = xor i1 [[TMP13]], true
 ; TFA_INTERLEAVE-NEXT:    [[TMP16:%.*]] = xor i1 [[TMP14]], true
 ; TFA_INTERLEAVE-NEXT:    [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP16]]
-; TFA_INTERLEAVE-NEXT:    br i1 [[TMP17]], label %[[BB18:.*]], label %[[TMP19]]
-; TFA_INTERLEAVE:       [[BB18]]:
+; TFA_INTERLEAVE-NEXT:    br i1 [[TMP17]], label %[[BB16:.*]], label %[[TMP18]]
+; TFA_INTERLEAVE:       [[BB16]]:
 ; TFA_INTERLEAVE-NEXT:    store double [[SPEC_SELECT]], ptr [[P]], align 8
-; TFA_INTERLEAVE-NEXT:    br label %[[TMP19]]
-; TFA_INTERLEAVE:       [[TMP19]]:
+; TFA_INTERLEAVE-NEXT:    br label %[[TMP18]]
+; TFA_INTERLEAVE:       [[TMP18]]:
 ; TFA_INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 1
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP3]]
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index 5e48b1f72b111..c9edf01e013fa 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -138,14 +138,10 @@ define i32 @select_const_i32_from_icmp(ptr %v, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP9]], 3
 ; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 3
 ; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP11]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP16:%.*]] = xor i1 [[TMP12]], true
-; CHECK-VF1IC4-NEXT:    [[TMP17:%.*]] = xor i1 [[TMP13]], true
-; CHECK-VF1IC4-NEXT:    [[TMP18:%.*]] = xor i1 [[TMP14]], true
-; CHECK-VF1IC4-NEXT:    [[TMP19:%.*]] = xor i1 [[TMP15]], true
-; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]]
-; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]]
-; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]]
-; CHECK-VF1IC4-NEXT:    [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]]
+; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
+; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
+; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
+; CHECK-VF1IC4-NEXT:    [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]]
 ; CHECK-VF1IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF1IC4-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF1IC4-NEXT:    br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -514,14 +510,10 @@ define i32 @select_i32_from_icmp(ptr %v, i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP9]], 3
 ; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 3
 ; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP11]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP16:%.*]] = xor i1 [[TMP12]], true
-; CHECK-VF1IC4-NEXT:    [[TMP17:%.*]] = xor i1 [[TMP13]], true
-; CHECK-VF1IC4-NEXT:    [[TMP18:%.*]] = xor i1 [[TMP14]], true
-; CHECK-VF1IC4-NEXT:    [[TMP19:%.*]] = xor i1 [[TMP15]], true
-; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]]
-; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]]
-; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]]
-; CHECK-VF1IC4-NEXT:    [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]]
+; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
+; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
+; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
+; CHECK-VF1IC4-NEXT:    [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]]
 ; CHECK-VF1IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF1IC4-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF1IC4-NEXT:    br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -704,14 +696,10 @@ define i32 @select_const_i32_from_fcmp_fast(ptr %v, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = fcmp fast ueq float [[TMP9]], 3.000000e+00
 ; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00
 ; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = fcmp fast ueq float [[TMP11]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP16:%.*]] = xor i1 [[TMP12]], true
-; CHECK-VF1IC4-NEXT:    [[TMP17:%.*]] = xor i1 [[TMP13]], true
-; CHECK-VF1IC4-NEXT:    [[TMP18:%.*]] = xor i1 [[TMP14]], true
-; CHECK-VF1IC4-NEXT:    [[TMP19:%.*]] = xor i1 [[TMP15]], true
-; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]]
-; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]]
-; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]]
-; CHECK-VF1IC4-NEXT:    [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]]
+; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
+; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
+; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
+; CHECK-VF1IC4-NEXT:    [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]]
 ; CHECK-VF1IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF1IC4-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF1IC4-NEXT:    br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -894,14 +882,10 @@ define i32 @select_const_i32_from_fcmp(ptr %v, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = fcmp ueq float [[TMP9]], 3.000000e+00
 ; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = fcmp ueq float [[TMP10]], 3.000000e+00
 ; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = fcmp ueq float [[TMP11]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP16:%.*]] = xor i1 [[TMP12]], true
-; CHECK-VF1IC4-NEXT:    [[TMP17:%.*]] = xor i1 [[TMP13]], true
-; CHECK-VF1IC4-NEXT:    [[TMP18:%.*]] = xor i1 [[TMP14]], true
-; CHECK-VF1IC4-NEXT:    [[TMP19:%.*]] = xor i1 [[TMP15]], true
-; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]]
-; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]]
-; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]]
-; CHECK-VF1IC4-NEXT:    [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]]
+; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
+; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
+; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
+; CHECK-VF1IC4-NEXT:    [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]]
 ; CHECK-VF1IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF1IC4-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF1IC4-NEXT:    br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1050,7 +1034,6 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
 ; CHECK-VF1IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-VF1IC4-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[A]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP4:%.*]] = xor i1 [[TMP0]], true
 ; CHECK-VF1IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF1IC4:       [[VECTOR_BODY]]:
 ; CHECK-VF1IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1058,10 +1041,10 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF1IC4-NEXT:    [[TMP5]] = or i1 [[VEC_PHI]], [[TMP4]]
-; CHECK-VF1IC4-NEXT:    [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP4]]
-; CHECK-VF1IC4-NEXT:    [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP4]]
-; CHECK-VF1IC4-NEXT:    [[TMP8]] = or i1 [[VEC_PHI3]], [[TMP4]]
+; CHECK-VF1IC4-NEXT:    [[TMP5]] = or i1 [[VEC_PHI]], [[TMP0]]
+; CHECK-VF1IC4-NEXT:    [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP0]]
+; CHECK-VF1IC4-NEXT:    [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP0]]
+; CHECK-VF1IC4-NEXT:    [[TMP8]] = or i1 [[VEC_PHI3]], [[TMP0]]
 ; CHECK-VF1IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF1IC4-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]

>From 1faeac53424affa52bcff18bca708bf28c957d6c Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 21 Aug 2025 15:26:02 +0100
Subject: [PATCH 2/4] [VPlan/PatternMatch] Fix || versus && bug

---
 llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 75d5c4a472e2e..33d2fa8d5d94d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -400,9 +400,8 @@ m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
 template <typename Op0_t, typename Op1_t, unsigned... Opcodes>
 struct Cmp_match {
   static_assert(sizeof...(Opcodes) == 1 || sizeof...(Opcodes) == 2);
-  static_assert((((Opcodes == Instruction::ICmp) ||
-                  (Opcodes == Instruction::FCmp)) ||
-                 ...));
+  static_assert(
+      ((Opcodes == Instruction::ICmp || Opcodes == Instruction::FCmp) && ...));
 
   CmpPredicate *Predicate = nullptr;
   Op0_t Op0;

>From 301680e0f9e4d9c705ecee22afedabbb7a412807 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 21 Aug 2025 15:39:27 +0100
Subject: [PATCH 3/4] [VPlan] Fix Replicate-predicate bug

---
 llvm/lib/Transforms/Vectorize/VPlan.h         |  2 ++
 .../LoopVectorize/AArch64/masked-call.ll      |  4 +--
 .../Transforms/LoopVectorize/select-cmp.ll    | 34 +++++++++----------
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 367a6ebf48dc2..26d54fbd009fb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -786,6 +786,8 @@ class VPIRFlags {
       I.setNonNeg(NonNegFlags.NonNeg);
       break;
     case OperationType::Cmp:
+      cast<CmpInst>(I).setPredicate(CmpPredicate);
+      break;
     case OperationType::Other:
       break;
     }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 251150e5e20db..e5697e121e9e9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -979,8 +979,8 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = load double, ptr [[P2]], align 8
 ; TFA_INTERLEAVE-NEXT:    [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
-; TFA_INTERLEAVE-NEXT:    [[TMP7:%.*]] = fcmp ogt double [[TMP5]], 0.000000e+00
-; TFA_INTERLEAVE-NEXT:    [[TMP8:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00
+; TFA_INTERLEAVE-NEXT:    [[TMP7:%.*]] = fcmp ule double [[TMP5]], 0.000000e+00
+; TFA_INTERLEAVE-NEXT:    [[TMP8:%.*]] = fcmp ule double [[TMP6]], 0.000000e+00
 ; TFA_INTERLEAVE-NEXT:    [[TMP11:%.*]] = select i1 [[ACTIVE_LANE_MASK]], i1 [[TMP7]], i1 false
 ; TFA_INTERLEAVE-NEXT:    [[TMP12:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], i1 [[TMP8]], i1 false
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP11]], double 1.000000e+00, double 0.000000e+00
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index c9edf01e013fa..e4922d3e4f627 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -134,10 +134,10 @@ define i32 @select_const_i32_from_icmp(ptr %v, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
-; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP9]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP11]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP8]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP9]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP10]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = icmp ne i32 [[TMP11]], 3
 ; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
 ; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
 ; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
@@ -506,10 +506,10 @@ define i32 @select_i32_from_icmp(ptr %v, i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
-; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP9]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 3
-; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP11]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP8]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP9]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP10]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = icmp ne i32 [[TMP11]], 3
 ; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
 ; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
 ; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
@@ -692,10 +692,10 @@ define i32 @select_const_i32_from_fcmp_fast(ptr %v, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
-; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = fcmp fast ueq float [[TMP8]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = fcmp fast ueq float [[TMP9]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = fcmp fast ueq float [[TMP11]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = fcmp fast one float [[TMP8]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = fcmp fast one float [[TMP9]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = fcmp fast one float [[TMP10]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = fcmp fast one float [[TMP11]], 3.000000e+00
 ; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
 ; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
 ; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
@@ -878,10 +878,10 @@ define i32 @select_const_i32_from_fcmp(ptr %v, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4
 ; CHECK-VF1IC4-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
-; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = fcmp ueq float [[TMP8]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = fcmp ueq float [[TMP9]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = fcmp ueq float [[TMP10]], 3.000000e+00
-; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = fcmp ueq float [[TMP11]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = fcmp one float [[TMP8]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = fcmp one float [[TMP9]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = fcmp one float [[TMP10]], 3.000000e+00
+; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = fcmp one float [[TMP11]], 3.000000e+00
 ; CHECK-VF1IC4-NEXT:    [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]]
 ; CHECK-VF1IC4-NEXT:    [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]]
 ; CHECK-VF1IC4-NEXT:    [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]]
@@ -1033,7 +1033,7 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4:       [[VECTOR_PH]]:
 ; CHECK-VF1IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
 ; CHECK-VF1IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF1IC4-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[A]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP0:%.*]] = icmp ne i32 [[A]], 3
 ; CHECK-VF1IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF1IC4:       [[VECTOR_BODY]]:
 ; CHECK-VF1IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]

>From 57ae88cd94fb8e888cd53ac69533cc51d1187d20 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 21 Aug 2025 16:24:26 +0100
Subject: [PATCH 4/4] [VPlan] Fix comment

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e2cef0eafbec1..dd8b0f1c873aa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1112,8 +1112,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
       auto *Cmp = cast<VPRecipeWithIRFlags>(A);
       Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
       Def->replaceAllUsesWith(Cmp);
-      // If WideCmp doesn't have a debug location, use the one from the
-      // negation, to preserve the location.
+      // If Cmp doesn't have a debug location, use the one from the negation, to
+      // preserve the location.
       if (!Cmp->getDebugLoc() && R.getDebugLoc())
         Cmp->setDebugLoc(R.getDebugLoc());
     }