[llvm] [SLP]Improved reduction cost/codegen (PR #118293)

Mon Dec 2 05:37:53 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-llvm-analysis

Author: Alexey Bataev (alexey-bataev)

<details>
<summary>Changes</summary>

AVX512, -O3+LTO
Metric: size..text

Program                                                                                           size..text
                                                                                                  results     results0    diff
                      test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test     4553.00     4615.00  1.4%
                                 test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test   412708.00   416820.00  1.0%
        test-suite :: SingleSource/UnitTests/Vector/AVX512BWVL/Vector-AVX512BWVL-mask_set_bw.test    12901.00    12981.00  0.6%
                        test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test    22717.00    22813.00  0.4%
                             test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test    39722.00    39850.00  0.3%
                      test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test    39725.00    39853.00  0.3%
test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test    15918.00    15967.00  0.3%
                                       test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test   155491.00   155587.00  0.1%
                                     test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test   227894.00   227942.00  0.0%
                                    test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test  1062188.00  1062364.00  0.0%
                                test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test   793672.00   793720.00  0.0%
                              test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test   657371.00   657403.00  0.0%
                             test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test   657371.00   657403.00  0.0%
                   test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test  2074917.00  2074933.00  0.0%
                    test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test  2074917.00  2074933.00  0.0%
                                     test-suite :: MultiSource/Applications/JM/lencod/lencod.test   855219.00   855203.00 -0.0%

Benchmarks/Shootout-C++ - same transformed reduction
Adobe-C++/loop_unroll - same transformed reductions, new vector code
AVX512BWVL/Vector-AVX512BWVL-mask_set_bw - same transformed reductions
FreeBench/fourinarow - same transformed reductions
MiBench/telecomm-gsm - same transformed reductions
execute/GCC-C-execute-builtin-bitops-1 - same transformed reductions
CFP2006/433.milc - better vector code, several x i64 reductions + trunc
to i32 gets trunced to x i32 reductions
ImageProcessing/Blur - same transformed reductions
Benchmarks/7zip - same transformed reductions, extra 4 x vectorization
CINT2006/464.h264ref - same transformed reductions
CINT2017rate/525.x264_r
CINT2017speed/625.x264_s - same transformed reductions
CINT2017speed/600.perlbench_s
CINT2017rate/500.perlbench_r - transformed same reduction
JM/lencod - extra 4 x vectorization

RISC-V, SiFive-p670, -O3+LTO

Metric: size..text

Program                                                                                           size..text
                                                                                                  results    results0   diff
test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-builtin-bitops-1.test    8990.00    9514.00   5.8%
                                test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test  588504.00  588488.00  -0.0%
                    test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test  147464.00  147440.00  -0.0%
              test-suite :: MultiSource/Benchmarks/MiBench/automotive-susan/automotive-susan.test   21496.00   21492.00  -0.0%
                                     test-suite :: MicroBenchmarks/ImageProcessing/Blur/blur.test  165420.00  165372.00  -0.0%
                                    test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test  843928.00  843648.00  -0.0%
                                    test-suite :: External/SPEC/CINT2006/458.sjeng/458.sjeng.test  100712.00  100672.00  -0.0%
                      test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test   24384.00   24336.00  -0.2%
                             test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test   24380.00   24332.00  -0.2%
             test-suite :: SingleSource/UnitTests/Vectorizer/VPlanNativePath/outer-loop-vect.test   10348.00   10316.00  -0.3%
                                 test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test  221304.00  220480.00  -0.4%
                      test-suite :: SingleSource/Benchmarks/Shootout-C++/Shootout-C++-matrix.test    3750.00    3736.00  -0.4%
                            test-suite :: SingleSource/Regression/C/Regression-C-DuffsDevice.test     678.00     370.00 -45.4%

execute/GCC-C-execute-builtin-bitops-1 - extra 4 x reductions, same
transformed reductions
CINT2006/464.h264ref - extra 4 x reductions, same transformed reductions
MiBench/consumer-lame - 2 4 x i1 merged to 8 x i1 reductions (bitcast
+ ctpop)
MiBench/automotive-susan - same transformed reductions
ImageProcessing/Blur - same transformed reductions
Benchmarks/7zip - same transformed reductions
CINT2006/458.sjeng - 2 4 x i1 merged to 8 x i1 reductions (bitcast
+ ctpop)
MiBench/telecomm-gsm - same transformed reductions
Benchmarks/mediabench - same transformed reductions
Vectorizer/VPlanNativePath - same transformed reductions
Adobe-C++/loop_unroll - extra 4 x reductions, same transformed reductions
Benchmarks/Shootout-C++ - extra 4 x reductions, same transformed reductions
Regression/C/Regression-C-DuffsDevice - same transformed reductions


---

Patch is 22.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118293.diff


5 Files Affected:

- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+8) 
- (modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+1) 
- (modified) llvm/include/llvm/CodeGen/BasicTTIImpl.h (+16) 
- (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+4) 
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+290-32) 


``````````diff

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 985ca1532e0149..f2f0e56a3f2014 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1584,6 +1584,10 @@ class TargetTransformInfo {
   /// split during legalization. Zero is returned when the answer is unknown.
   unsigned getNumberOfParts(Type *Tp) const;
 
+  /// \return true if \p Tp represent a type, fully occupying whole register,
+  /// false otherwise.
+  bool isFullSingleRegisterType(Type *Tp) const;
+
   /// \returns The cost of the address computation. For most targets this can be
   /// merged into the instruction indexing mode. Some targets might want to
   /// distinguish between address computation for memory operations on vector
@@ -2196,6 +2200,7 @@ class TargetTransformInfo::Concept {
                                            ArrayRef<Type *> Tys,
                                            TTI::TargetCostKind CostKind) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
+  virtual bool isFullSingleRegisterType(Type *Tp) const = 0;
   virtual InstructionCost
   getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
   virtual InstructionCost
@@ -2930,6 +2935,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getNumberOfParts(Type *Tp) override {
     return Impl.getNumberOfParts(Tp);
   }
+  bool isFullSingleRegisterType(Type *Tp) const override {
+    return Impl.isFullSingleRegisterType(Tp);
+  }
   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                             const SCEV *Ptr) override {
     return Impl.getAddressComputationCost(Ty, SE, Ptr);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 38aba183f6a173..ce6a96ea317ba7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -833,6 +833,7 @@ class TargetTransformInfoImplBase {
 
   // Assume that we have a register of the right size for the type.
   unsigned getNumberOfParts(Type *Tp) const { return 1; }
+  bool isFullSingleRegisterType(Type *Tp) const { return false; }
 
   InstructionCost getAddressComputationCost(Type *Tp, ScalarEvolution *,
                                             const SCEV *) const {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 98cbb4886642bf..9e7ce48f901dc5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2612,6 +2612,22 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return *LT.first.getValue();
   }
 
+  bool isFullSingleRegisterType(Type *Tp) const {
+    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+    if (!LT.first.isValid() || LT.first > 1)
+      return false;
+
+    if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
+        Tp && LT.second.isFixedLengthVector()) {
+      // Check if the n x i1 fits fully into largest integer.
+      if (unsigned VF = LT.second.getVectorNumElements();
+          LT.second.getVectorElementType() == MVT::i1)
+        return DL.isLegalInteger(VF) && !DL.isLegalInteger(VF * 2);
+      return FTp == EVT(LT.second).getTypeForEVT(Tp->getContext());
+    }
+    return false;
+  }
+
   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
                                             const SCEV *) {
     return 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1fb2b9836de0cc..f7ad9ed905e3a1 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1171,6 +1171,10 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
   return TTIImpl->getNumberOfParts(Tp);
 }
 
+bool TargetTransformInfo::isFullSingleRegisterType(Type *Tp) const {
+  return TTIImpl->isFullSingleRegisterType(Tp);
+}
+
 InstructionCost
 TargetTransformInfo::getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
                                                const SCEV *Ptr) const {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7723442bc0fb6e..5df21b77643746 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12080,7 +12080,11 @@ bool BoUpSLP::isTreeNotExtendable() const {
     TreeEntry &E = *VectorizableTree[Idx];
     if (!E.isGather())
       continue;
-    if (E.getOpcode() && E.getOpcode() != Instruction::Load)
+    if ((E.getOpcode() && E.getOpcode() != Instruction::Load) ||
+        (!E.getOpcode() &&
+         all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
+        (isa<ExtractElementInst>(E.Scalars.front()) &&
+         getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).getOpcode()))
       return false;
     if (isSplat(E.Scalars) || allConstant(E.Scalars))
       continue;
@@ -19174,6 +19178,9 @@ class HorizontalReduction {
   /// Checks if the optimization of original scalar identity operations on
   /// matched horizontal reductions is enabled and allowed.
   bool IsSupportedHorRdxIdentityOp = false;
+  /// Contains vector values for reduction including their scale factor and
+  /// signedness.
+  SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
 
   static bool isCmpSelMinMax(Instruction *I) {
     return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
@@ -19225,17 +19232,22 @@ class HorizontalReduction {
   static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                          Value *RHS, const Twine &Name, bool UseSelect) {
     unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
+    Type *OpTy = LHS->getType();
+    assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
     switch (Kind) {
     case RecurKind::Or:
-      if (UseSelect &&
-          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
-        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
+      if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
+        return Builder.CreateSelect(
+            LHS,
+            ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
+            RHS, Name);
       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                  Name);
     case RecurKind::And:
-      if (UseSelect &&
-          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
-        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
+      if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
+        return Builder.CreateSelect(
+            LHS, RHS,
+            ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                  Name);
     case RecurKind::Add:
@@ -20108,12 +20120,11 @@ class HorizontalReduction {
                                          SameValuesCounter, TrackedToOrig);
         }
 
-        Value *ReducedSubTree;
         Type *ScalarTy = VL.front()->getType();
         if (isa<FixedVectorType>(ScalarTy)) {
           assert(SLPReVec && "FixedVectorType is not expected.");
           unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-          ReducedSubTree = PoisonValue::get(FixedVectorType::get(
+          Value *ReducedSubTree = PoisonValue::get(getWidenedType(
               VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
           for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
             // Do reduction for each lane.
@@ -20131,30 +20142,32 @@ class HorizontalReduction {
             SmallVector<int, 16> Mask =
                 createStrideMask(I, ScalarTyNumElements, VL.size());
             Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
-            ReducedSubTree = Builder.CreateInsertElement(
-                ReducedSubTree,
-                emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
+            Value *Val =
+                createSingleOp(Builder, *TTI, Lane,
+                               OptReusedScalars && SameScaleFactor
+                                   ? SameValuesCounter.front().second
+                                   : 1,
+                               Lane->getType()->getScalarType() !=
+                                       VL.front()->getType()->getScalarType()
+                                   ? V.isSignedMinBitwidthRootNode()
+                                   : true, RdxRootInst->getType());
+            ReducedSubTree =
+                Builder.CreateInsertElement(ReducedSubTree, Val, I);
           }
+          VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
         } else {
-          ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
-                                         RdxRootInst->getType());
+          Type *VecTy = VectorizedRoot->getType();
+          Type *RedScalarTy = VecTy->getScalarType();
+          VectorValuesAndScales.emplace_back(
+              VectorizedRoot,
+              OptReusedScalars && SameScaleFactor
+                  ? SameValuesCounter.front().second
+                  : 1,
+              RedScalarTy != ScalarTy->getScalarType()
+                  ? V.isSignedMinBitwidthRootNode()
+                  : true);
         }
-        if (ReducedSubTree->getType() != VL.front()->getType()) {
-          assert(ReducedSubTree->getType() != VL.front()->getType() &&
-                 "Expected different reduction type.");
-          ReducedSubTree =
-              Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
-                                    V.isSignedMinBitwidthRootNode());
-        }
-
-        // Improved analysis for add/fadd/xor reductions with same scale factor
-        // for all operands of reductions. We can emit scalar ops for them
-        // instead.
-        if (OptReusedScalars && SameScaleFactor)
-          ReducedSubTree = emitScaleForReusedOps(
-              ReducedSubTree, Builder, SameValuesCounter.front().second);
 
-        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
         // Count vectorized reduced values to exclude them from final reduction.
         for (Value *RdxVal : VL) {
           Value *OrigV = TrackedToOrig.at(RdxVal);
@@ -20183,6 +20196,10 @@ class HorizontalReduction {
         continue;
       }
     }
+    if (!VectorValuesAndScales.empty())
+      VectorizedTree = GetNewVectorizedTree(
+          VectorizedTree,
+          emitReduction(Builder, *TTI, ReductionRoot->getType()));
     if (VectorizedTree) {
       // Reorder operands of bool logical op in the natural order to avoid
       // possible problem with poison propagation. If not possible to reorder
@@ -20317,6 +20334,28 @@ class HorizontalReduction {
   }
 
 private:
+  /// Checks if the given type \p Ty is a vector type, which does not occupy the
+  /// whole vector register or is expensive for extraction.
+  static bool isNotFullVectorType(const TargetTransformInfo &TTI, Type *Ty) {
+    return TTI.getNumberOfParts(Ty) == 1 && !TTI.isFullSingleRegisterType(Ty);
+  }
+
+  /// Creates the reduction from the given \p Vec vector value with the given
+  /// scale \p Scale and signedness \p IsSigned.
+  Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
+                        Value *Vec, unsigned Scale, bool IsSigned,
+                        Type *DestTy) {
+    Value *Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
+    if (Rdx->getType() != DestTy->getScalarType())
+      Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
+    // Improved analysis for add/fadd/xor reductions with same scale
+    // factor for all operands of reductions. We can emit scalar ops for
+    // them instead.
+    if (Scale > 1)
+      Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
+    return Rdx;
+  }
+
   /// Calculate the cost of a reduction.
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                    ArrayRef<Value *> ReducedVals,
@@ -20359,6 +20398,22 @@ class HorizontalReduction {
       }
       return Cost;
     };
+    // Require reduction cost if:
+    // 1. This type is not a full register type and no other vectors with the
+    // same type in the storage (first vector with small type).
+    // 2. The storage does not have any vector with full vector use (first
+    // vector with full register use).
+    bool DoesRequireReductionOp =
+        !AllConsts &&
+        (VectorValuesAndScales.empty() ||
+         (isNotFullVectorType(*TTI, VectorTy) &&
+          none_of(VectorValuesAndScales,
+                  [&](const auto &P) {
+                    return std::get<0>(P)->getType() == VectorTy;
+                  })) ||
+         all_of(VectorValuesAndScales, [&](const auto &P) {
+           return isNotFullVectorType(*TTI, std::get<0>(P)->getType());
+         }));
     switch (RdxKind) {
     case RecurKind::Add:
     case RecurKind::Mul:
@@ -20382,7 +20437,7 @@ class HorizontalReduction {
           VectorCost += TTI->getScalarizationOverhead(
               VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
               /*Extract*/ false, TTI::TCK_RecipThroughput);
-        } else {
+        } else if (DoesRequireReductionOp) {
           Type *RedTy = VectorTy->getElementType();
           auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
               std::make_pair(RedTy, true));
@@ -20394,6 +20449,14 @@ class HorizontalReduction {
                 RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
                 FMF, CostKind);
           }
+        } else {
+          unsigned NumParts = TTI->getNumberOfParts(VectorTy);
+          unsigned RegVF = getPartNumElems(getNumElements(VectorTy), NumParts);
+          VectorCost +=
+              NumParts * TTI->getArithmeticInstrCost(
+                             RdxOpcode,
+                             getWidenedType(VectorTy->getScalarType(), RegVF),
+                             CostKind);
         }
       }
       ScalarCost = EvaluateScalarCost([&]() {
@@ -20410,8 +20473,19 @@ class HorizontalReduction {
     case RecurKind::UMax:
     case RecurKind::UMin: {
       Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
-      if (!AllConsts)
-        VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+      if (!AllConsts) {
+        if (DoesRequireReductionOp) {
+          VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+        } else {
+          // Check if the previous reduction already exists and account it as
+          // series of operations + single reduction.
+          unsigned NumParts = TTI->getNumberOfParts(VectorTy);
+          unsigned RegVF = getPartNumElems(getNumElements(VectorTy), NumParts);
+          auto *RegVecTy = getWidenedType(VectorTy->getScalarType(), RegVF);
+          IntrinsicCostAttributes ICA(Id, RegVecTy, {RegVecTy, RegVecTy}, FMF);
+          VectorCost += NumParts * TTI->getIntrinsicInstrCost(ICA, CostKind);
+        }
+      }
       ScalarCost = EvaluateScalarCost([&]() {
         IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
         return TTI->getIntrinsicInstrCost(ICA, CostKind);
@@ -20428,6 +20502,190 @@ class HorizontalReduction {
     return VectorCost - ScalarCost;
   }
 
+  /// Splits the values, stored in VectorValuesAndScales, into registers/free
+  /// sub-registers, combines them with the given reduction operation as a
+  /// vector operation and then performs single (small enough) reduction.
+  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
+                       Type *DestTy) {
+    Value *ReducedSubTree = nullptr;
+    // Creates reduction and combines with the previous reduction.
+    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
+      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
+      if (ReducedSubTree)
+        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
+                                  "op.rdx", ReductionOps);
+      else
+        ReducedSubTree = Rdx;
+    };
+    if (VectorValuesAndScales.size() == 1) {
+      const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
+      CreateSingleOp(Vec, Scale, IsSigned);
+      return ReducedSubTree;
+    }
+    // Splits multivector value into per-register values.
+    auto SplitVector = [&](Value *Vec) {
+      auto *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
+      unsigned Sz = getNumElements(Vec->getType());
+      unsigned NumParts = TTI.getNumberOfParts(Vec->getType());
+      if (NumParts <= 1 || NumParts >= Sz ||
+          isNotFullVectorType(TTI, Vec->getType()))
+        return SmallVector<Value *>(1, Vec);
+      unsigned RegSize = getPartNumElems(Sz, NumParts);
+      auto *DstTy = getWidenedType(ScalarTy, RegSize);
+      SmallVector<Value *> Regs(NumParts);
+      for (unsigned Part : seq<unsigned>(NumParts))
+        Regs[Part] = Builder.CreateExtractVector(
+            DstTy, Vec, Builder.getInt64(Part * RegSize));
+      return Regs;
+    };
+    SmallMapVector<Type *, Value *, 4> VecOps;
+    // Scales Vec using given Cnt scale factor and then performs vector combine
+    // with previous value of VecOp.
+    auto CreateVecOp = [&](Value *Vec, unsigned Cnt) {
+      Type *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
+      // Scale Vec using given Cnt scale factor.
+      if (Cnt > 1) {
+        ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
+        switch (RdxKind) {
+        case RecurKind::Add: {
+          if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
+            unsigned VF = getNumElements(Vec->getType());
+            LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
+                              << ". (HorRdx)\n");
+            SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
+            for (unsigned I : seq<unsigned>(Cnt))
+              std::iota(std::next(Mask.begin(), VF * I),
+                        std::next(Mask.begin(), VF * (I + 1)), 0);
+            ++NumVectorInstructions;
+            Vec = Builder.CreateShuffleVector(Vec, Mask);
+            break;
+          }
+          // res = mul vv, n
+          Value *Scale =
+              ConstantVector::getSplat(EC, ConstantInt::get(ScalarTy, Cnt));
+          LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
+                            << ". (HorRdx)\n");
+          ++NumVectorInstructions;
+          Vec = Builder.CreateMul(Vec, Scale);
+          break;
+        }
+        case RecurKind::Xor: {
+          // res = n % 2 ? 0 : vv
+          LLVM_DEBUG(dbgs()
+                     << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
+          if (Cnt % 2 == 0)
+            Vec = Constant::getNullValue(Vec->getType());
+          break;
+        }
+        case RecurKind::FAdd: {
+          // res = fmul v, n
+          Value *Scale =
+              ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
+          LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
+                            << ". (HorRdx)\n");
+          ++NumVectorInstructions;
+          Vec = Builder.CreateFMul(Vec, Scale);
+          break;
+        }
+        case RecurKind::And:
+        case RecurKind::Or:
+        case RecurKind::SMax:
+        case RecurKind::SMin:
+        case RecurKind::UMax:
+        case RecurKind::UMin:
+        case RecurKind::FMax:
+        case RecurKind::FMin:
+        case RecurKind::FMaximum:
+        case RecurKind::FMinimum:
+          // res = vv
+          break;
+        case RecurKind::Mul:
+        case RecurKind::FMul:
+        case RecurKind::FMulAdd:
+        case RecurKind::IAnyOf:
+        case RecurKind::FAnyOf:
+        case RecurKind::None:
+          llvm_unreachable("Unexpected reduction kind for repeated scalar.");
+        }
+      }
+      // Combine Vec w...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/118293