[llvm] [VectorCombine] Refine cost model and decision logic in foldSelectShuffle (PR #146694)

Fri Jul 25 04:27:07 PDT 2025

https://github.com/igogo-x86 updated https://github.com/llvm/llvm-project/pull/146694

>From 5e8ec3b16c1720785b3a073bac6a2113999a820e Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Wed, 2 Jul 2025 13:00:38 +0000
Subject: [PATCH 1/4] [VectorCombine] Refine cost model and decision logic in
 foldSelectShuffle

After PR #136329, shuffle indices may differ, which can cause the
existing cost-based logic to miss optimization opportunities for
binop/shuffle sequences.

This patch improves the cost model in foldSelectShuffle to more
accurately assess costs, recognizing when certain duplicate shuffles
do not require actual instructions.

Additionally, in break-even cases, this change introduces a check for
whether the pattern ultimately feeds into a vector reduction,
allowing the transform to proceed when it is likely to be profitable
overall.
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 116 +++++++++++++++++-
 .../PhaseOrdering/AArch64/slpordering.ll      |  50 ++++----
 2 files changed, 139 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 82adc34fdbd84..1f856f765f5bc 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3174,6 +3174,47 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) {
   return true;
 }
 
+/// Returns true if this ShuffleVectorInst eventually feeds into a
+/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
+/// chains of shuffles and binary operators (in any combination/order).
+static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
+  SmallPtrSet<Instruction *, 8> Visited;
+  SmallVector<Instruction *, 4> WorkList;
+  bool FoundReduction = false;
+
+  WorkList.push_back(SVI);
+  while (!WorkList.empty()) {
+    Instruction *I = WorkList.pop_back_val();
+    for (User *U : I->users()) {
+      auto *UI = dyn_cast<Instruction>(U);
+      if (!UI || !Visited.insert(UI).second)
+        continue;
+      if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
+        // More than one reduction reached
+        if (FoundReduction)
+          return false;
+        switch (II->getIntrinsicID()) {
+        case Intrinsic::vector_reduce_add:
+        case Intrinsic::vector_reduce_mul:
+        case Intrinsic::vector_reduce_and:
+        case Intrinsic::vector_reduce_or:
+        case Intrinsic::vector_reduce_xor:
+          FoundReduction = true;
+          continue;
+        default:
+          return false;
+        }
+      }
+
+      if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
+        return false;
+
+      WorkList.emplace_back(UI);
+    }
+  }
+  return FoundReduction;
+}
+
 /// This method looks for groups of shuffles acting on binops, of the form:
 ///  %x = shuffle ...
 ///  %y = shuffle ...
@@ -3416,6 +3457,65 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
            TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
   };
 
+  unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
+  unsigned MaxVectorSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
+  unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
+  // When there are multiple shufflevector operations on the same input,
+  // especially when the vector length is larger than the register size,
+  // identical shuffle patterns may occur across different groups of elements.
+  // To avoid overestimating the cost by counting these repeated shuffles more
+  // than once, we only account for unique shuffle patterns. This adjustment
+  // prevents inflated costs in the cost model for wide vectors split into
+  // several register-sized groups.
+  std::set<SmallVector<int, 4>> UniqueShuffles;
+  auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
+    // Compute the cost for performing the shuffle over the full vector.
+    auto ShuffleCost =
+        TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
+    unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
+    if (NumFullVectors < 2)
+      return C + ShuffleCost;
+    SmallVector<int, 4> SubShuffle(MaxElementsInVector);
+    unsigned NumUniqueGroups = 0;
+    unsigned NumGroups = Mask.size() / MaxElementsInVector;
+    // For each group of MaxElementsInVector contiguous elements,
+    // collect their shuffle pattern and insert into the set of unique patterns.
+    for (unsigned k = 0; k < NumFullVectors; ++k) {
+      for (unsigned l = 0; l < MaxElementsInVector; ++l)
+        SubShuffle[l] = Mask[MaxElementsInVector * k + l];
+      if (UniqueShuffles.insert(SubShuffle).second)
+        NumUniqueGroups += 1;
+    }
+    return C + ShuffleCost * NumUniqueGroups / NumGroups;
+  };
+  auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
+    auto *SV = dyn_cast<ShuffleVectorInst>(I);
+    if (!SV)
+      return C;
+    SmallVector<int, 16> Mask;
+    SV->getShuffleMask(Mask);
+    return AddShuffleMaskAdjustedCost(C, Mask);
+  };
+  // Check that input consists of ShuffleVectors applied to the same input
+  auto AllShufflesHaveSameOperands =
+      [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
+        if (InputShuffles.size() < 2)
+          return false;
+        ShuffleVectorInst *FirstSV =
+            dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
+        if (!FirstSV)
+          return false;
+
+        Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
+        return std::all_of(
+            std::next(InputShuffles.begin()), InputShuffles.end(),
+            [&](Instruction *I) {
+              ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
+              return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
+            });
+      };
+
   // Get the costs of the shuffles + binops before and after with the new
   // shuffle masks.
   InstructionCost CostBefore =
@@ -3423,8 +3523,14 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
       TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
   CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
                                 InstructionCost(0), AddShuffleCost);
-  CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
-                                InstructionCost(0), AddShuffleCost);
+  if (AllShufflesHaveSameOperands(InputShuffles)) {
+    UniqueShuffles.clear();
+    CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
+                                  InstructionCost(0), AddShuffleAdjustedCost);
+  } else {
+    CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
+                                  InstructionCost(0), AddShuffleCost);
+  }
 
   // The new binops will be unused for lanes past the used shuffle lengths.
   // These types attempt to get the correct cost for that from the target.
@@ -3435,8 +3541,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
   InstructionCost CostAfter =
       TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
       TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
+  UniqueShuffles.clear();
   CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
-                               InstructionCost(0), AddShuffleMaskCost);
+                               InstructionCost(0), AddShuffleMaskAdjustedCost);
   std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
   CostAfter +=
       std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
@@ -3445,7 +3552,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
   LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
   LLVM_DEBUG(dbgs() << "  CostBefore: " << CostBefore
                     << " vs CostAfter: " << CostAfter << "\n");
-  if (CostBefore <= CostAfter)
+  if (CostBefore < CostAfter || CostBefore == 0 ||
+      (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
     return false;
 
   // The cost model has passed, create the new instructions.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
index 26573a3e613da..a2019836098e8 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
@@ -80,29 +80,33 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32
 ; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]]
-; CHECK-NEXT:    [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 16, i32 0, i32 17, i32 1, i32 18, i32 2, i32 19, i32 3, i32 20, i32 4, i32 21, i32 5, i32 22, i32 6, i32 23, i32 7>
-; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 17, i32 1, i32 16, i32 0, i32 19, i32 3, i32 18, i32 2, i32 21, i32 5, i32 20, i32 4, i32 23, i32 7, i32 22, i32 6>
-; CHECK-NEXT:    [[TMP52:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP51]]
-; CHECK-NEXT:    [[TMP53:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP51]]
-; CHECK-NEXT:    [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP52]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[TMP56:%.*]] = sub nsw <16 x i32> [[TMP54]], [[TMP55]]
-; CHECK-NEXT:    [[TMP57:%.*]] = add nsw <16 x i32> [[TMP54]], [[TMP55]]
-; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP61]]
-; CHECK-NEXT:    [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP58]], [[TMP60]]
-; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP65:%.*]] = lshr <16 x i32> [[TMP64]], splat (i32 15)
-; CHECK-NEXT:    [[TMP66:%.*]] = and <16 x i32> [[TMP65]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP67:%.*]] = mul nuw <16 x i32> [[TMP66]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP68:%.*]] = add <16 x i32> [[TMP67]], [[TMP64]]
-; CHECK-NEXT:    [[TMP69:%.*]] = xor <16 x i32> [[TMP68]], [[TMP67]]
-; CHECK-NEXT:    [[TMP70:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
-; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP70]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP70]], 16
+; CHECK-NEXT:    [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]]
+; CHECK-NEXT:    [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]]
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT:    [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]]
+; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]]
+; CHECK-NEXT:    [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]]
+; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], splat (i32 15)
+; CHECK-NEXT:    [[TMP70:%.*]] = and <16 x i32> [[TMP69]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]]
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <16 x i32> [[TMP72]], [[TMP71]]
+; CHECK-NEXT:    [[TMP74:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP74]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP74]], 16
 ; CHECK-NEXT:    [[RDD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[RDD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]

>From 1bca08619f1e9ba7be81c91ab49191ca98403243 Mon Sep 17 00:00:00 2001
From: "Igor.Kirillov at arm.com"
 <igokir01 at ip-10-248-7-180.eu-central-1.compute.internal>
Date: Wed, 23 Jul 2025 16:57:24 +0000
Subject: [PATCH 2/4] Address comments

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 29 ++++++++++++-------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 1f856f765f5bc..b9b695588bd49 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3177,14 +3177,18 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) {
 /// Returns true if this ShuffleVectorInst eventually feeds into a
 /// vector reduction intrinsic (e.g., vector_reduce_add) by only following
 /// chains of shuffles and binary operators (in any combination/order).
-static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
+/// The search does not go deeper than the given Depth.
+static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI, unsigned Depth) {
   SmallPtrSet<Instruction *, 8> Visited;
-  SmallVector<Instruction *, 4> WorkList;
+  SmallVector<std::pair<Instruction *, unsigned>, 4> WorkList;
   bool FoundReduction = false;
 
-  WorkList.push_back(SVI);
+  WorkList.push_back({SVI, 0});
   while (!WorkList.empty()) {
-    Instruction *I = WorkList.pop_back_val();
+    auto [I, CurDepth] = WorkList.pop_back_val();
+    if (CurDepth > Depth)
+      return false;
+
     for (User *U : I->users()) {
       auto *UI = dyn_cast<Instruction>(U);
       if (!UI || !Visited.insert(UI).second)
@@ -3199,6 +3203,10 @@ static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
         case Intrinsic::vector_reduce_and:
         case Intrinsic::vector_reduce_or:
         case Intrinsic::vector_reduce_xor:
+        case Intrinsic::vector_reduce_smin:
+        case Intrinsic::vector_reduce_smax:
+        case Intrinsic::vector_reduce_umin:
+        case Intrinsic::vector_reduce_umax:
           FoundReduction = true;
           continue;
         default:
@@ -3208,8 +3216,7 @@ static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
 
       if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
         return false;
-
-      WorkList.emplace_back(UI);
+      WorkList.emplace_back(UI, CurDepth + 1);
     }
   }
   return FoundReduction;
@@ -3481,9 +3488,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
     unsigned NumGroups = Mask.size() / MaxElementsInVector;
     // For each group of MaxElementsInVector contiguous elements,
     // collect their shuffle pattern and insert into the set of unique patterns.
-    for (unsigned k = 0; k < NumFullVectors; ++k) {
-      for (unsigned l = 0; l < MaxElementsInVector; ++l)
-        SubShuffle[l] = Mask[MaxElementsInVector * k + l];
+    for (unsigned I = 0; I < NumFullVectors; ++I) {
+      for (unsigned J = 0; J < MaxElementsInVector; ++J)
+        SubShuffle[J] = Mask[MaxElementsInVector * I + J];
       if (UniqueShuffles.insert(SubShuffle).second)
         NumUniqueGroups += 1;
     }
@@ -3552,8 +3559,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
   LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
   LLVM_DEBUG(dbgs() << "  CostBefore: " << CostBefore
                     << " vs CostAfter: " << CostAfter << "\n");
-  if (CostBefore < CostAfter || CostBefore == 0 ||
-      (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
+  if (CostBefore < CostAfter ||
+      (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI, 8)))
     return false;
 
   // The cost model has passed, create the new instructions.

>From 76e7657523115a6564b5d43af5c5e21b4de18ccc Mon Sep 17 00:00:00 2001
From: "Igor.Kirillov at arm.com"
 <igokir01 at ip-10-248-7-180.eu-central-1.compute.internal>
Date: Thu, 24 Jul 2025 09:10:32 +0000
Subject: [PATCH 3/4] Replace depth with size

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b9b695588bd49..e4f2e00a291d2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3178,21 +3178,21 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) {
 /// vector reduction intrinsic (e.g., vector_reduce_add) by only following
 /// chains of shuffles and binary operators (in any combination/order).
 /// The search does not go deeper than the given Depth.
-static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI, unsigned Depth) {
+static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
+  constexpr unsigned MaxVisited = 32;
   SmallPtrSet<Instruction *, 8> Visited;
-  SmallVector<std::pair<Instruction *, unsigned>, 4> WorkList;
+  SmallVector<Instruction *, 4> WorkList;
   bool FoundReduction = false;
 
-  WorkList.push_back({SVI, 0});
+  WorkList.push_back(SVI);
   while (!WorkList.empty()) {
-    auto [I, CurDepth] = WorkList.pop_back_val();
-    if (CurDepth > Depth)
-      return false;
-
+    Instruction *I = WorkList.pop_back_val();
     for (User *U : I->users()) {
-      auto *UI = dyn_cast<Instruction>(U);
+      auto *UI = cast<Instruction>(U);
       if (!UI || !Visited.insert(UI).second)
         continue;
+      if (Visited.size() > MaxVisited)
+        return false;
       if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
         // More than one reduction reached
         if (FoundReduction)
@@ -3216,7 +3216,8 @@ static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI, unsigned Depth) {
 
       if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
         return false;
-      WorkList.emplace_back(UI, CurDepth + 1);
+
+      WorkList.emplace_back(UI);;
     }
   }
   return FoundReduction;
@@ -3560,7 +3561,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
   LLVM_DEBUG(dbgs() << "  CostBefore: " << CostBefore
                     << " vs CostAfter: " << CostAfter << "\n");
   if (CostBefore < CostAfter ||
-      (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI, 8)))
+      (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
     return false;
 
   // The cost model has passed, create the new instructions.

>From 12de60ed6623bdfd2e134d495fcdc538ae1b65c7 Mon Sep 17 00:00:00 2001
From: "Igor.Kirillov at arm.com"
 <igokir01 at ip-10-248-7-180.eu-central-1.compute.internal>
Date: Thu, 24 Jul 2025 10:01:17 +0000
Subject: [PATCH 4/4] Remove trailing ';'

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index e4f2e00a291d2..6252f4f0507cb 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3217,7 +3217,7 @@ static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
       if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
         return false;
 
-      WorkList.emplace_back(UI);;
+      WorkList.emplace_back(UI);
     }
   }
   return FoundReduction;