[llvm] edf7bed - [SLP][NFC] Outline lookahead heuristics into a separate helper class.

Valery N Dmitriev via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 22 18:59:57 PDT 2022


Author: Valery N Dmitriev
Date: 2022-04-22T18:59:08-07:00
New Revision: edf7bed87b77262abca0349f0f31f7e5092a5259

URL: https://github.com/llvm/llvm-project/commit/edf7bed87b77262abca0349f0f31f7e5092a5259
DIFF: https://github.com/llvm/llvm-project/commit/edf7bed87b77262abca0349f0f31f7e5092a5259.diff

LOG: [SLP][NFC] Outline lookahead heuristics into a separate helper class.

Minor refactoring to reduce the size of the functional change D124309:
  the look-ahead scoring routines are pulled out of VLOperands and
  moved into a new LookAheadHeuristics helper class.

Reviewed By: Alexey Bataev (ABataev), Vasileios Porpodas (vporpo)
Differential Revision: https://reviews.llvm.org/D124313
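
For illustration, the delegation pattern this introduces at the
VLOperands::getLookAheadScore call site looks roughly like the standalone C++
sketch below. The types and numbers here (a toy Value with an Id field,
LookAheadScorer, the depth of 2) are simplified placeholders, not the actual
LLVM classes; the real constructor and getScoreAtLevelRec signatures appear in
the diff further down.

// Minimal standalone sketch of the pattern introduced by this patch
// (placeholder types, not the actual LLVM classes): the scoring state lives in
// a small helper object that the caller constructs on demand and delegates to,
// mirroring how VLOperands::getLookAheadScore now builds a LookAheadHeuristics
// from (DL, SE, R, getNumLanes(), LookAheadMaxDepth) and calls
// getScoreAtLevelRec on it.
#include <algorithm>
#include <vector>

struct Value {                      // placeholder for llvm::Value
  int Id = 0;
  std::vector<const Value *> Operands;
};

class LookAheadScorer {             // placeholder analogue of LookAheadHeuristics
  int NumLanes;                     // total number of lanes (unused in this toy)
  int MaxLevel;                     // maximum recursion depth for the score

public:
  LookAheadScorer(int NumLanes, int MaxLevel)
      : NumLanes(NumLanes), MaxLevel(MaxLevel) {}

  // Toy stand-in for getShallowScore: reward identical or same-kind values.
  int shallowScore(const Value *L, const Value *R) const {
    if (L == R)
      return 2;
    return L->Id == R->Id ? 1 : 0;
  }

  // Toy stand-in for getScoreAtLevelRec: recurse over operands up to MaxLevel,
  // adding the best pairwise operand score found at each level.
  int scoreAtLevelRec(const Value *L, const Value *R, int CurrLevel) const {
    int Score = shallowScore(L, R);
    if (Score == 0 || CurrLevel == MaxLevel)
      return Score;
    for (const Value *LOp : L->Operands) {
      int Best = 0;
      for (const Value *ROp : R->Operands)
        Best = std::max(Best, scoreAtLevelRec(LOp, ROp, CurrLevel + 1));
      Score += Best;
    }
    return Score;
  }
};

// Caller side, mirroring the updated VLOperands::getLookAheadScore: build the
// scorer locally with the lane count and depth limit, then delegate.
int getLookAheadScore(const Value *LHS, const Value *RHS, int NumLanes) {
  const int LookAheadMaxDepth = 2;  // placeholder for the real depth limit
  LookAheadScorer Scorer(NumLanes, LookAheadMaxDepth);
  return Scorer.scoreAtLevelRec(LHS, RHS, /*CurrLevel=*/1);
}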

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2150ed99e79de..ea914a08d75a7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1034,96 +1034,18 @@ class BoUpSLP {
 #endif
   };
 
-  /// A helper data structure to hold the operands of a vector of instructions.
-  /// This supports a fixed vector length for all operand vectors.
-  class VLOperands {
-    /// For each operand we need (i) the value, and (ii) the opcode that it
-    /// would be attached to if the expression was in a left-linearized form.
-    /// This is required to avoid illegal operand reordering.
-    /// For example:
-    /// \verbatim
-    ///                         0 Op1
-    ///                         |/
-    /// Op1 Op2   Linearized    + Op2
-    ///   \ /     ---------->   |/
-    ///    -                    -
-    ///
-    /// Op1 - Op2            (0 + Op1) - Op2
-    /// \endverbatim
-    ///
-    /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
-    ///
-    /// Another way to think of this is to track all the operations across the
-    /// path from the operand all the way to the root of the tree and to
-    /// calculate the operation that corresponds to this path. For example, the
-    /// path from Op2 to the root crosses the RHS of the '-', therefore the
-    /// corresponding operation is a '-' (which matches the one in the
-    /// linearized tree, as shown above).
-    ///
-    /// For lack of a better term, we refer to this operation as Accumulated
-    /// Path Operation (APO).
-    struct OperandData {
-      OperandData() = default;
-      OperandData(Value *V, bool APO, bool IsUsed)
-          : V(V), APO(APO), IsUsed(IsUsed) {}
-      /// The operand value.
-      Value *V = nullptr;
-      /// TreeEntries only allow a single opcode, or an alternate sequence of
-      /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
-      /// APO. It is set to 'true' if 'V' is attached to an inverse operation
-      /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
-      /// (e.g., Add/Mul)
-      bool APO = false;
-      /// Helper data for the reordering function.
-      bool IsUsed = false;
-    };
-
-    /// During operand reordering, we are trying to select the operand at lane
-    /// that matches best with the operand at the neighboring lane. Our
-    /// selection is based on the type of value we are looking for. For example,
-    /// if the neighboring lane has a load, we need to look for a load that is
-    /// accessing a consecutive address. These strategies are summarized in the
-    /// 'ReorderingMode' enumerator.
-    enum class ReorderingMode {
-      Load,     ///< Matching loads to consecutive memory addresses
-      Opcode,   ///< Matching instructions based on opcode (same or alternate)
-      Constant, ///< Matching constants
-      Splat,    ///< Matching the same instruction multiple times (broadcast)
-      Failed,   ///< We failed to create a vectorizable group
-    };
-
-    using OperandDataVec = SmallVector<OperandData, 2>;
-
-    /// A vector of operand vectors.
-    SmallVector<OperandDataVec, 4> OpsVec;
-
+  /// A helper class used for scoring candidates for two consecutive lanes.
+  class LookAheadHeuristics {
     const DataLayout &DL;
     ScalarEvolution &SE;
     const BoUpSLP &R;
+    int NumLanes; // Total number of lanes (aka vectorization factor).
+    int MaxLevel; // The maximum recursion depth for accumulating score.
 
-    /// \returns the operand data at \p OpIdx and \p Lane.
-    OperandData &getData(unsigned OpIdx, unsigned Lane) {
-      return OpsVec[OpIdx][Lane];
-    }
-
-    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
-    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
-      return OpsVec[OpIdx][Lane];
-    }
-
-    /// Clears the used flag for all entries.
-    void clearUsed() {
-      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
-           OpIdx != NumOperands; ++OpIdx)
-        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
-             ++Lane)
-          OpsVec[OpIdx][Lane].IsUsed = false;
-    }
-
-    /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
-    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
-      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
-    }
+  public:
+    LookAheadHeuristics(const DataLayout &DL, ScalarEvolution &SE,
+                        const BoUpSLP &R, int NumLanes, int MaxLevel)
+        : DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {}
 
     // The hard-coded scores listed here are not very important, though it shall
     // be higher for better matches to improve the resulting cost. When
@@ -1169,8 +1091,7 @@ class BoUpSLP {
     /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
     /// MainAltOps.
     int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
-                        const DataLayout &DL, ScalarEvolution &SE, int NumLanes,
-                        ArrayRef<Value *> MainAltOps) {
+                        ArrayRef<Value *> MainAltOps) const {
       if (V1 == V2) {
         if (isa<LoadInst>(V1)) {
           // Retruns true if the users of V1 and V2 won't need to be extracted.
@@ -1192,37 +1113,37 @@ class BoUpSLP {
                                           ElementCount::getFixed(NumLanes)) &&
               ((int)V1->getNumUses() == NumLanes ||
                AllUsersAreInternal(V1, V2)))
-            return VLOperands::ScoreSplatLoads;
+            return LookAheadHeuristics::ScoreSplatLoads;
         }
-        return VLOperands::ScoreSplat;
+        return LookAheadHeuristics::ScoreSplat;
       }
 
       auto *LI1 = dyn_cast<LoadInst>(V1);
       auto *LI2 = dyn_cast<LoadInst>(V2);
       if (LI1 && LI2) {
         if (LI1->getParent() != LI2->getParent())
-          return VLOperands::ScoreFail;
+          return LookAheadHeuristics::ScoreFail;
 
         Optional<int> Dist = getPointersDiff(
             LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
             LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
         if (!Dist || *Dist == 0)
-          return VLOperands::ScoreFail;
+          return LookAheadHeuristics::ScoreFail;
         // The distance is too large - still may be profitable to use masked
         // loads/gathers.
         if (std::abs(*Dist) > NumLanes / 2)
-          return VLOperands::ScoreAltOpcodes;
+          return LookAheadHeuristics::ScoreAltOpcodes;
         // This still will detect consecutive loads, but we might have "holes"
         // in some cases. It is ok for non-power-2 vectorization and may produce
         // better results. It should not affect current vectorization.
-        return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads
-                           : VLOperands::ScoreReversedLoads;
+        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
+                           : LookAheadHeuristics::ScoreReversedLoads;
       }
 
       auto *C1 = dyn_cast<Constant>(V1);
       auto *C2 = dyn_cast<Constant>(V2);
       if (C1 && C2)
-        return VLOperands::ScoreConstants;
+        return LookAheadHeuristics::ScoreConstants;
 
       // Extracts from consecutive indexes of the same vector better score as
       // the extracts could be optimized away.
@@ -1231,7 +1152,7 @@ class BoUpSLP {
       if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
         // Undefs are always profitable for extractelements.
         if (isa<UndefValue>(V2))
-          return VLOperands::ScoreConsecutiveExtracts;
+          return LookAheadHeuristics::ScoreConsecutiveExtracts;
         Value *EV2 = nullptr;
         ConstantInt *Ex2Idx = nullptr;
         if (match(V2,
@@ -1239,9 +1160,9 @@ class BoUpSLP {
                                                          m_Undef())))) {
           // Undefs are always profitable for extractelements.
           if (!Ex2Idx)
-            return VLOperands::ScoreConsecutiveExtracts;
+            return LookAheadHeuristics::ScoreConsecutiveExtracts;
           if (isUndefVector(EV2) && EV2->getType() == EV1->getType())
-            return VLOperands::ScoreConsecutiveExtracts;
+            return LookAheadHeuristics::ScoreConsecutiveExtracts;
           if (EV2 == EV1) {
             int Idx1 = Ex1Idx->getZExtValue();
             int Idx2 = Ex2Idx->getZExtValue();
@@ -1249,22 +1170,22 @@ class BoUpSLP {
             // The distance is too large - still may be profitable to use
             // shuffles.
             if (std::abs(Dist) == 0)
-              return VLOperands::ScoreSplat;
+              return LookAheadHeuristics::ScoreSplat;
             if (std::abs(Dist) > NumLanes / 2)
-              return VLOperands::ScoreSameOpcode;
-            return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts
-                              : VLOperands::ScoreReversedExtracts;
+              return LookAheadHeuristics::ScoreSameOpcode;
+            return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
+                              : LookAheadHeuristics::ScoreReversedExtracts;
           }
-          return VLOperands::ScoreAltOpcodes;
+          return LookAheadHeuristics::ScoreAltOpcodes;
         }
-        return VLOperands::ScoreFail;
+        return LookAheadHeuristics::ScoreFail;
       }
 
       auto *I1 = dyn_cast<Instruction>(V1);
       auto *I2 = dyn_cast<Instruction>(V2);
       if (I1 && I2) {
         if (I1->getParent() != I2->getParent())
-          return VLOperands::ScoreFail;
+          return LookAheadHeuristics::ScoreFail;
         SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
         Ops.push_back(I1);
         Ops.push_back(I2);
@@ -1278,77 +1199,17 @@ class BoUpSLP {
               return cast<Instruction>(V)->getNumOperands() ==
                      S.MainOp->getNumOperands();
             }))
-          return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
-                                  : VLOperands::ScoreSameOpcode;
+          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
+                                  : LookAheadHeuristics::ScoreSameOpcode;
       }
 
       if (isa<UndefValue>(V2))
-        return VLOperands::ScoreUndef;
+        return LookAheadHeuristics::ScoreUndef;
 
-      return VLOperands::ScoreFail;
+      return LookAheadHeuristics::ScoreFail;
     }
 
-    /// \param Lane lane of the operands under analysis.
-    /// \param OpIdx operand index in \p Lane lane we're looking the best
-    /// candidate for.
-    /// \param Idx operand index of the current candidate value.
-    /// \returns The additional score due to possible broadcasting of the
-    /// elements in the lane. It is more profitable to have power-of-2 unique
-    /// elements in the lane, it will be vectorized with higher probability
-    /// after removing duplicates. Currently the SLP vectorizer supports only
-    /// vectorization of the power-of-2 number of unique scalars.
-    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
-      Value *IdxLaneV = getData(Idx, Lane).V;
-      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
-        return 0;
-      SmallPtrSet<Value *, 4> Uniques;
-      for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
-        if (Ln == Lane)
-          continue;
-        Value *OpIdxLnV = getData(OpIdx, Ln).V;
-        if (!isa<Instruction>(OpIdxLnV))
-          return 0;
-        Uniques.insert(OpIdxLnV);
-      }
-      int UniquesCount = Uniques.size();
-      int UniquesCntWithIdxLaneV =
-          Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
-      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
-      int UniquesCntWithOpIdxLaneV =
-          Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
-      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
-        return 0;
-      return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
-              UniquesCntWithOpIdxLaneV) -
-             (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
-    }
-
-    /// \param Lane lane of the operands under analysis.
-    /// \param OpIdx operand index in \p Lane lane we're looking the best
-    /// candidate for.
-    /// \param Idx operand index of the current candidate value.
-    /// \returns The additional score for the scalar which users are all
-    /// vectorized.
-    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
-      Value *IdxLaneV = getData(Idx, Lane).V;
-      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
-      // Do not care about number of uses for vector-like instructions
-      // (extractelement/extractvalue with constant indices), they are extracts
-      // themselves and already externally used. Vectorization of such
-      // instructions does not add extra extractelement instruction, just may
-      // remove it.
-      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
-          isVectorLikeInstWithConstOps(OpIdxLaneV))
-        return VLOperands::ScoreAllUserVectorized;
-      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
-      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
-        return 0;
-      return R.areAllUsersVectorized(IdxLaneI, None)
-                 ? VLOperands::ScoreAllUserVectorized
-                 : 0;
-    }
-
-    /// Go through the operands of \p LHS and \p RHS recursively until \p
+    /// Go through the operands of \p LHS and \p RHS recursively until
     /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are
     /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
     /// of \p U1 and \p U2), except at the beginning of the recursion where
@@ -1365,8 +1226,8 @@ class BoUpSLP {
     /// each level recursively, accumulating the score. It starts from matching
     /// the additions at level 0, then moves on to the loads (level 1). The
     /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
-    /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
-    /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
+    /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
+    /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
     /// Please note that the order of the operands does not matter, as we
     /// evaluate the score of all profitable combinations of operands. In
     /// other words the score of G1 and G4 is the same as G1 and G2. This
@@ -1375,12 +1236,12 @@ class BoUpSLP {
     ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
     ///   Luís F. W. Góes
     int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
-                           Instruction *U2, int CurrLevel, int MaxLevel,
-                           ArrayRef<Value *> MainAltOps) {
+                           Instruction *U2, int CurrLevel,
+                           ArrayRef<Value *> MainAltOps) const {
 
       // Get the shallow score of V1 and V2.
       int ShallowScoreAtThisLevel =
-          getShallowScore(LHS, RHS, U1, U2, DL, SE, getNumLanes(), MainAltOps);
+          getShallowScore(LHS, RHS, U1, U2, MainAltOps);
 
       // If reached MaxLevel,
       //  or if V1 and V2 are not instructions,
@@ -1391,7 +1252,7 @@ class BoUpSLP {
       auto *I1 = dyn_cast<Instruction>(LHS);
       auto *I2 = dyn_cast<Instruction>(RHS);
       if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
-          ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
+          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
           (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
             (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
             (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
@@ -1423,9 +1284,10 @@ class BoUpSLP {
           // Recursively calculate the cost at each level
           int TmpScore =
               getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
-                                 I1, I2, CurrLevel + 1, MaxLevel, None);
+                                 I1, I2, CurrLevel + 1, None);
           // Look for the best score.
-          if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
+          if (TmpScore > LookAheadHeuristics::ScoreFail &&
+              TmpScore > MaxTmpScore) {
             MaxTmpScore = TmpScore;
             MaxOpIdx2 = OpIdx2;
             FoundBest = true;
@@ -1439,6 +1301,157 @@ class BoUpSLP {
       }
       return ShallowScoreAtThisLevel;
     }
+  };
+  /// A helper data structure to hold the operands of a vector of instructions.
+  /// This supports a fixed vector length for all operand vectors.
+  class VLOperands {
+    /// For each operand we need (i) the value, and (ii) the opcode that it
+    /// would be attached to if the expression was in a left-linearized form.
+    /// This is required to avoid illegal operand reordering.
+    /// For example:
+    /// \verbatim
+    ///                         0 Op1
+    ///                         |/
+    /// Op1 Op2   Linearized    + Op2
+    ///   \ /     ---------->   |/
+    ///    -                    -
+    ///
+    /// Op1 - Op2            (0 + Op1) - Op2
+    /// \endverbatim
+    ///
+    /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
+    ///
+    /// Another way to think of this is to track all the operations across the
+    /// path from the operand all the way to the root of the tree and to
+    /// calculate the operation that corresponds to this path. For example, the
+    /// path from Op2 to the root crosses the RHS of the '-', therefore the
+    /// corresponding operation is a '-' (which matches the one in the
+    /// linearized tree, as shown above).
+    ///
+    /// For lack of a better term, we refer to this operation as Accumulated
+    /// Path Operation (APO).
+    struct OperandData {
+      OperandData() = default;
+      OperandData(Value *V, bool APO, bool IsUsed)
+          : V(V), APO(APO), IsUsed(IsUsed) {}
+      /// The operand value.
+      Value *V = nullptr;
+      /// TreeEntries only allow a single opcode, or an alternate sequence of
+      /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
+      /// APO. It is set to 'true' if 'V' is attached to an inverse operation
+      /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
+      /// (e.g., Add/Mul)
+      bool APO = false;
+      /// Helper data for the reordering function.
+      bool IsUsed = false;
+    };
+
+    /// During operand reordering, we are trying to select the operand at lane
+    /// that matches best with the operand at the neighboring lane. Our
+    /// selection is based on the type of value we are looking for. For example,
+    /// if the neighboring lane has a load, we need to look for a load that is
+    /// accessing a consecutive address. These strategies are summarized in the
+    /// 'ReorderingMode' enumerator.
+    enum class ReorderingMode {
+      Load,     ///< Matching loads to consecutive memory addresses
+      Opcode,   ///< Matching instructions based on opcode (same or alternate)
+      Constant, ///< Matching constants
+      Splat,    ///< Matching the same instruction multiple times (broadcast)
+      Failed,   ///< We failed to create a vectorizable group
+    };
+
+    using OperandDataVec = SmallVector<OperandData, 2>;
+
+    /// A vector of operand vectors.
+    SmallVector<OperandDataVec, 4> OpsVec;
+
+    const DataLayout &DL;
+    ScalarEvolution &SE;
+    const BoUpSLP &R;
+
+    /// \returns the operand data at \p OpIdx and \p Lane.
+    OperandData &getData(unsigned OpIdx, unsigned Lane) {
+      return OpsVec[OpIdx][Lane];
+    }
+
+    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
+    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
+      return OpsVec[OpIdx][Lane];
+    }
+
+    /// Clears the used flag for all entries.
+    void clearUsed() {
+      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
+           OpIdx != NumOperands; ++OpIdx)
+        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
+             ++Lane)
+          OpsVec[OpIdx][Lane].IsUsed = false;
+    }
+
+    /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
+    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
+      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
+    }
+
+    /// \param Lane lane of the operands under analysis.
+    /// \param OpIdx operand index in \p Lane lane we're looking the best
+    /// candidate for.
+    /// \param Idx operand index of the current candidate value.
+    /// \returns The additional score due to possible broadcasting of the
+    /// elements in the lane. It is more profitable to have power-of-2 unique
+    /// elements in the lane, it will be vectorized with higher probability
+    /// after removing duplicates. Currently the SLP vectorizer supports only
+    /// vectorization of the power-of-2 number of unique scalars.
+    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
+      Value *IdxLaneV = getData(Idx, Lane).V;
+      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
+        return 0;
+      SmallPtrSet<Value *, 4> Uniques;
+      for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
+        if (Ln == Lane)
+          continue;
+        Value *OpIdxLnV = getData(OpIdx, Ln).V;
+        if (!isa<Instruction>(OpIdxLnV))
+          return 0;
+        Uniques.insert(OpIdxLnV);
+      }
+      int UniquesCount = Uniques.size();
+      int UniquesCntWithIdxLaneV =
+          Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
+      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
+      int UniquesCntWithOpIdxLaneV =
+          Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
+      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
+        return 0;
+      return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
+              UniquesCntWithOpIdxLaneV) -
+             (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
+    }
+
+    /// \param Lane lane of the operands under analysis.
+    /// \param OpIdx operand index in \p Lane lane we're looking the best
+    /// candidate for.
+    /// \param Idx operand index of the current candidate value.
+    /// \returns The additional score for the scalar which users are all
+    /// vectorized.
+    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
+      Value *IdxLaneV = getData(Idx, Lane).V;
+      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
+      // Do not care about number of uses for vector-like instructions
+      // (extractelement/extractvalue with constant indices), they are extracts
+      // themselves and already externally used. Vectorization of such
+      // instructions does not add extra extractelement instruction, just may
+      // remove it.
+      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
+          isVectorLikeInstWithConstOps(OpIdxLaneV))
+        return LookAheadHeuristics::ScoreAllUserVectorized;
+      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
+      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
+        return 0;
+      return R.areAllUsersVectorized(IdxLaneI, None)
+                 ? LookAheadHeuristics::ScoreAllUserVectorized
+                 : 0;
+    }
 
     /// Score scaling factor for fully compatible instructions but with
     /// different number of external uses. Allows better selection of the
@@ -1453,10 +1466,13 @@ class BoUpSLP {
     int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                           int Lane, unsigned OpIdx, unsigned Idx,
                           bool &IsUsed) {
+      LookAheadHeuristics LookAhead(DL, SE, R, getNumLanes(),
+                                    LookAheadMaxDepth);
       // Keep track of the instruction stack as we recurse into the operands
       // during the look-ahead score exploration.
-      int Score = getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
-                                     1, LookAheadMaxDepth, MainAltOps);
+      int Score =
+          LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
+                                       /*CurrLevel=*/1, MainAltOps);
       if (Score) {
         int SplatScore = getSplatScore(Lane, OpIdx, Idx);
         if (Score <= -SplatScore) {

More information about the llvm-commits mailing list