[llvm] [LoopVectorizer] Add support for partial reductions (PR #92418)

Sam Tebbs via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 6 05:33:32 PST 2024


https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/92418

>From ae4a2ca9fc880c018e294e6fe0a206cf997818ff Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Fri, 17 May 2024 11:17:26 +0100
Subject: [PATCH 01/56] [LoopVectorizer] Add support for partial reductions

---
 .../llvm/Analysis/TargetTransformInfo.h       |   33 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   |    9 +
 llvm/lib/Analysis/TargetTransformInfo.cpp     |    8 +
 .../AArch64/AArch64TargetTransformInfo.h      |   37 +
 .../Transforms/Vectorize/LoopVectorize.cpp    |  256 +++-
 .../Transforms/Vectorize/VPRecipeBuilder.h    |    4 +
 llvm/lib/Transforms/Vectorize/VPlan.h         |   52 +-
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |    7 +-
 llvm/lib/Transforms/Vectorize/VPlanAnalysis.h |    2 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   56 +-
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |    1 +
 .../CodeGen/AArch64/partial-reduce-sdot.ll    |   99 ++
 .../AArch64/partial-reduce-dot-product.ll     | 1322 +++++++++++++++++
 .../LoopVectorize/AArch64/vplan-printing.ll   |   77 +
 14 files changed, 1947 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0dc513d8e65b76..3c43110b4ebd85 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -24,6 +24,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/AtomicOrdering.h"
@@ -210,6 +211,17 @@ typedef TargetTransformInfo TTI;
 /// for IR-level transformations.
 class TargetTransformInfo {
 public:
+  enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend };
+
+  static PartialReductionExtendKind
+  getPartialReductionExtendKind(Instruction *I) {
+    if (isa<SExtInst>(I))
+      return PR_SignExtend;
+    if (isa<ZExtInst>(I))
+      return PR_ZeroExtend;
+    return PR_None;
+  }
+
   /// Construct a TTI object using a type implementing the \c Concept
   /// API below.
   ///
@@ -1260,6 +1272,12 @@ class TargetTransformInfo {
   /// \return if target want to issue a prefetch in address space \p AS.
   bool shouldPrefetchAddressSpace(unsigned AS) const;
 
+  InstructionCost
+  getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
+                          ElementCount VF, PartialReductionExtendKind OpAExtend,
+                          PartialReductionExtendKind OpBExtend,
+                          std::optional<unsigned> BinOp = std::nullopt) const;
+
   /// \return The maximum interleave factor that any transform should try to
   /// perform for this target. This number depends on the level of parallelism
   /// and the number of execution units in the CPU.
@@ -2067,6 +2085,12 @@ class TargetTransformInfo::Concept {
   /// \return if target want to issue a prefetch in address space \p AS.
   virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;
 
+  virtual InstructionCost
+  getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
+                          ElementCount VF, PartialReductionExtendKind OpAExtend,
+                          PartialReductionExtendKind OpBExtend,
+                          std::optional<unsigned> BinOp) const = 0;
+
   virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0;
   virtual InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
@@ -2722,6 +2746,15 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.shouldPrefetchAddressSpace(AS);
   }
 
+  InstructionCost getPartialReductionCost(
+      unsigned Opcode, Type *InputType, Type *AccumType, ElementCount VF,
+      PartialReductionExtendKind OpAExtend,
+      PartialReductionExtendKind OpBExtend,
+      std::optional<unsigned> BinOp = std::nullopt) const override {
+    return Impl.getPartialReductionCost(Opcode, InputType, AccumType, VF,
+                                        OpAExtend, OpBExtend, BinOp);
+  }
+
   unsigned getMaxInterleaveFactor(ElementCount VF) override {
     return Impl.getMaxInterleaveFactor(VF);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 317c13917c0cfc..8435f2af201480 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -571,6 +571,15 @@ class TargetTransformInfoImplBase {
   bool enableWritePrefetching() const { return false; }
   bool shouldPrefetchAddressSpace(unsigned AS) const { return !AS; }
 
+  InstructionCost
+  getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
+                          ElementCount VF,
+                          TTI::PartialReductionExtendKind OpAExtend,
+                          TTI::PartialReductionExtendKind OpBExtend,
+                          std::optional<unsigned> BinOp = std::nullopt) const {
+    return InstructionCost::getInvalid();
+  }
+
   unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
 
   InstructionCost getArithmeticInstrCost(
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c5c7b7c7c0a57f..02df8017a5380f 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -849,6 +849,14 @@ bool TargetTransformInfo::shouldPrefetchAddressSpace(unsigned AS) const {
   return TTIImpl->shouldPrefetchAddressSpace(AS);
 }
 
+InstructionCost TargetTransformInfo::getPartialReductionCost(
+    unsigned Opcode, Type *InputType, Type *AccumType, ElementCount VF,
+    PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend,
+    std::optional<unsigned> BinOp) const {
+  return TTIImpl->getPartialReductionCost(Opcode, InputType, AccumType, VF,
+                                          OpAExtend, OpBExtend, BinOp);
+}
+
 unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
   return TTIImpl->getMaxInterleaveFactor(VF);
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 1d09d67f6ec9e3..a2b6fbbd6bb824 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -342,6 +342,43 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return BaseT::isLegalNTLoad(DataType, Alignment);
   }
 
+  InstructionCost
+  getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
+                          ElementCount VF,
+                          TTI::PartialReductionExtendKind OpAExtend,
+                          TTI::PartialReductionExtendKind OpBExtend,
+                          std::optional<unsigned> BinOp) const {
+    InstructionCost Invalid = InstructionCost::getInvalid();
+
+    if (Opcode != Instruction::Add)
+      return Invalid;
+
+    EVT InputEVT = EVT::getEVT(InputType);
+    EVT AccumEVT = EVT::getEVT(AccumType);
+
+    if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
+      return Invalid;
+    if (VF.isFixed() && !ST->isNeonAvailable() && !ST->hasDotProd())
+      return Invalid;
+
+    if (InputEVT == MVT::i8) {
+      if (AccumEVT != MVT::i32)
+        return Invalid;
+    } else if (InputEVT == MVT::i16) {
+      if (AccumEVT != MVT::i64)
+        return Invalid;
+    } else
+      return Invalid;
+
+    if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None)
+      return Invalid;
+
+    if (!BinOp || (*BinOp) != Instruction::Mul)
+      return Invalid;
+
+    return InstructionCost::getMin();
+  }
+
   bool enableOrderedReductions() const { return true; }
 
   InstructionCost getInterleavedMemoryOpCost(
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c07af8519049c4..0faea87decae80 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1537,7 +1537,139 @@ class LoopVectorizationCostModel {
   /// trivially hoistable.
   bool shouldConsiderInvariant(Value *Op);
 
+  /// A chain of instructions that form a partial reduction.
+  /// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))),
+  /// accumulator)
+  struct PartialReductionChain {
+    /// The top-level binary operation that forms the reduction to a scalar
+    /// after the loop body
+    Instruction *Reduction;
+    /// The inner binary operation that forms the reduction to a vector value
+    /// within the loop body
+    Instruction *BinOp;
+    /// The extension of each of the inner binary operation's operands
+    Instruction *ExtendA;
+    Instruction *ExtendB;
+
+    /// The accumulator that is reduced to a scalar after the loop body
+    Value *Accumulator;
+
+    /// The scaling factor between the size of the reduction type and the
+    /// (possibly extended) inputs
+    unsigned ScaleFactor;
+  };
+
+  using PartialReductionList = DenseMap<Instruction *, PartialReductionChain>;
+
+  PartialReductionList getPartialReductionChains() {
+    return PartialReductionChains;
+  }
+
+  std::optional<PartialReductionChain>
+  getInstructionsPartialReduction(Instruction *I) const {
+    auto PairIt = PartialReductionChains.find(I);
+    if (PairIt == PartialReductionChains.end())
+      return std::nullopt;
+    return PairIt->second;
+  }
+
+  void removePartialReduction(Instruction *Instr) {
+    PartialReductionChains.erase(Instr);
+  }
+
+  void addPartialReductionIfSupported(Instruction *Instr, ElementCount VF) {
+
+    // Try to commutatively match:
+    // bin_op (one_use bin_op (z_or_sext, z_or_sext), phi)
+
+    auto *Root = dyn_cast<BinaryOperator>(Instr);
+    if (!Root)
+      return;
+
+    auto *BinOp = dyn_cast<BinaryOperator>(Root->getOperand(0));
+    auto *Phi = dyn_cast<PHINode>(Root->getOperand(1));
+    if (!BinOp) {
+      BinOp = dyn_cast<BinaryOperator>(Root->getOperand(1));
+      Phi = dyn_cast<PHINode>(Root->getOperand(0));
+    }
+    if (!BinOp || !BinOp->hasOneUse()) {
+      LLVM_DEBUG(
+          dbgs() << "Root was not a one-use binary operator, cannot create a "
+                    "partial reduction.\n");
+      return;
+    }
+    if (!Phi) {
+      LLVM_DEBUG(dbgs() << "Expected Phi node was not a phi, cannot create a "
+                           "partial reduction.\n");
+      return;
+    }
+
+    auto IsSextOrZext = [](Instruction *I) {
+      return I && (I->getOpcode() == Instruction::ZExt ||
+                   I->getOpcode() == Instruction::SExt);
+    };
+
+    auto *ExtA = dyn_cast<Instruction>(BinOp->getOperand(0));
+    auto *ExtB = dyn_cast<Instruction>(BinOp->getOperand(1));
+    if (!IsSextOrZext(ExtA) || !IsSextOrZext(ExtB)) {
+      LLVM_DEBUG(dbgs() << "Expected extends were not extends, cannot create a "
+                           "partial reduction.\n");
+      return;
+    }
+
+    Value *A = ExtA->getOperand(0);
+    Value *B = ExtB->getOperand(0);
+    // Check that the extends extend from the same type
+    if (A->getType() != B->getType()) {
+      LLVM_DEBUG(dbgs() << "Extends don't extend from the same type, cannot "
+                           "create a partial reduction.\n");
+      return;
+    }
+
+    // Check that the extends extend to the same type
+    if (ExtA->getType() != ExtB->getType()) {
+      LLVM_DEBUG(
+          dbgs() << "Extends don't extend to the same type, cannot create "
+                    "a partial reduction.\n");
+      return;
+    }
+
+    // Check that the second phi value is the instruction we're looking at
+    Instruction *MaybeAdd = dyn_cast<Instruction>(
+        Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
+    if (!MaybeAdd || MaybeAdd != Instr) {
+      LLVM_DEBUG(dbgs() << "Second PHI value is not the root binop, cannot "
+                           "create a partial reduction.\n");
+      return;
+    }
+
+    TTI::PartialReductionExtendKind OpAExtend =
+        TargetTransformInfo::getPartialReductionExtendKind(ExtA);
+    TTI::PartialReductionExtendKind OpBExtend =
+        TargetTransformInfo::getPartialReductionExtendKind(ExtB);
+    InstructionCost Cost = TTI.getPartialReductionCost(
+        Instr->getOpcode(), A->getType(), Phi->getType(), VF, OpAExtend,
+        OpBExtend, std::make_optional(BinOp->getOpcode()));
+    if (Cost == InstructionCost::getInvalid())
+      return;
+
+    PartialReductionChain Chain;
+    Chain.Reduction = Instr;
+    Chain.BinOp = BinOp;
+    Chain.ExtendA = ExtA;
+    Chain.ExtendB = ExtB;
+    Chain.Accumulator = Phi;
+
+    unsigned InputSizeBits = A->getType()->getScalarSizeInBits();
+    unsigned ResultSizeBits = Chain.Reduction->getType()->getScalarSizeInBits();
+    Chain.ScaleFactor = ResultSizeBits / InputSizeBits;
+
+    PartialReductionChains[Instr] = Chain;
+  }
+
 private:
+  PartialReductionList PartialReductionChains;
+
   unsigned NumPredStores = 0;
 
   /// \return An upper bound for the vectorization factors for both
@@ -4662,6 +4794,11 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
         return false;
   }
 
+  // Prevent epilogue vectorization if a partial reduction is involved
+  // TODO Is there a cleaner way to check this?
+  if (CM.getPartialReductionChains().size() > 0)
+    return false;
+
   // Epilogue vectorization code has not been auditted to ensure it handles
   // non-latch exits properly.  It may be fine, but it needs auditted and
   // tested.
@@ -6986,6 +7123,18 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   }
+
+  // Ignore any values that we know will be flattened
+  for (auto It : getPartialReductionChains()) {
+    PartialReductionChain Chain = It.second;
+    SmallVector<Value *> PartialReductionValues{Chain.Reduction, Chain.BinOp,
+                                                Chain.ExtendA, Chain.ExtendB,
+                                                Chain.Accumulator};
+    ValuesToIgnore.insert(PartialReductionValues.begin(),
+                          PartialReductionValues.end());
+    VecValuesToIgnore.insert(PartialReductionValues.begin(),
+                             PartialReductionValues.end());
+  }
 }
 
 void LoopVectorizationCostModel::collectInLoopReductions() {
@@ -7102,6 +7251,47 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
 
 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
+
+  for (auto ReductionVar : Legal->getReductionVars()) {
+    auto *ReductionExitInstr = ReductionVar.second.getLoopExitInstr();
+    CM.addPartialReductionIfSupported(ReductionExitInstr, UserVF);
+  }
+
+  // Wider-than-legal vector types (coming from extends in partial reductions)
+  // should only be used by partial reductions so that they are lowered properly
+
+  // Build up a set of partial reduction bin ops for efficient use checking
+  SmallSet<Instruction *, 4> PartialReductionBinOps;
+  for (auto It : CM.getPartialReductionChains()) {
+    if (It.second.BinOp)
+      PartialReductionBinOps.insert(It.second.BinOp);
+  }
+
+  auto ExtendIsOnlyUsedByPartialReductions =
+      [PartialReductionBinOps](Instruction *Extend) {
+        for (auto *Use : Extend->users()) {
+          Instruction *UseInstr = dyn_cast<Instruction>(Use);
+          if (!PartialReductionBinOps.contains(UseInstr))
+            return false;
+        }
+        return true;
+      };
+
+  // Check if each use of a chain's two extends is a partial reduction
+  // and remove those that have non-partial reduction users
+  SmallSet<Instruction *, 4> PartialReductionsToRemove;
+  for (auto It : CM.getPartialReductionChains()) {
+    LoopVectorizationCostModel::PartialReductionChain Chain = It.second;
+    if (!ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) ||
+        !ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) {
+      PartialReductionsToRemove.insert(Chain.Reduction);
+      LLVM_DEBUG(dbgs() << "Removing the partial reduction for an instruction "
+                           "with an extend used by something other than a "
+                           "partial reduction "
+                        << *Chain.Reduction << "\n");
+    }
+  }
+
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
 
@@ -7126,6 +7316,23 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   if (CM.foldTailByMasking())
     Legal->prepareToFoldTailByMasking();
 
+  for (auto Pair : CM.getPartialReductionChains()) {
+    // TODO: Allow creating partial reductions when predicating. The select at
+    // the end of the loop chooses between the phi value and most recent partial
+    // reduction result, both of which have different VFs to the active lane
+    // mask.
+    Instruction *Instr = Pair.first;
+    if (CM.blockNeedsPredicationForAnyReason(Instr->getParent())) {
+      LLVM_DEBUG(dbgs() << "LV: Removing the partial reduction for an "
+                           "instruction in a predicated block: "
+                        << *Instr << "\n");
+      PartialReductionsToRemove.insert(Instr);
+    }
+  }
+
+  for (auto *Insn : PartialReductionsToRemove)
+    CM.removePartialReduction(Insn);
+
   ElementCount MaxUserVF =
       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
   if (UserVF) {
@@ -8663,6 +8870,18 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
   return Recipe;
 }
 
+unsigned getScaleFactorForReductionPhi(PHINode *Phi,
+                                       LoopVectorizationCostModel &CM) {
+  for (auto *User : Phi->users()) {
+    if (auto *I = dyn_cast<Instruction>(User)) {
+      if (auto Chain = CM.getInstructionsPartialReduction(I)) {
+        return Chain->ScaleFactor;
+      }
+    }
+  }
+  return 1;
+}
+
 VPRecipeBase *
 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                         ArrayRef<VPValue *> Operands,
@@ -8687,9 +8906,12 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
           Legal->getReductionVars().find(Phi)->second;
       assert(RdxDesc.getRecurrenceStartValue() ==
              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
-      PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
-                                           CM.isInLoopReduction(Phi),
-                                           CM.useOrderedReductions(RdxDesc));
+
+      // If the PHI is used by a partial reduction, set the scale factor
+      unsigned ScaleFactor = getScaleFactorForReductionPhi(Phi, CM);
+      PhiRecipe = new VPReductionPHIRecipe(
+          Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
+          CM.useOrderedReductions(RdxDesc), ScaleFactor);
     } else {
       // TODO: Currently fixed-order recurrences are modeled as chains of
       // first-order recurrences. If there are no users of the intermediate
@@ -8741,6 +8963,24 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   return tryToWiden(Instr, Operands, VPBB);
 }
 
+VPRecipeBase *
+VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
+                                             unsigned ScaleFactor,
+                                             ArrayRef<VPValue *> Operands) {
+  assert(Operands.size() == 2 &&
+         "Unexpected number of operands for partial reduction");
+
+  VPValue *BinOp = Operands[0];
+  VPValue *Phi = Operands[1];
+  VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
+  if (isa<VPReductionPHIRecipe>(BinOpRecipe))
+    std::swap(BinOp, Phi);
+
+  SmallVector<VPValue *, 2> OrderedOperands = {BinOp, Phi};
+  return new VPPartialReductionRecipe(
+      *Reduction, make_range(OrderedOperands.begin(), OrderedOperands.end()));
+}
+
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                         ElementCount MaxVF) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -9131,8 +9371,14 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         continue;
       }
 
-      VPRecipeBase *Recipe =
-          RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
+      VPRecipeBase *Recipe = nullptr;
+
+      if (auto Chain = CM.getInstructionsPartialReduction(Instr))
+        Recipe = RecipeBuilder.tryToCreatePartialReduction(
+            Chain->Reduction, Chain->ScaleFactor, Operands);
+      else if (!Recipe)
+        Recipe =
+            RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
       if (!Recipe)
         Recipe = RecipeBuilder.handleReplication(Instr, Range);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 5d4a3b555981ce..136766eb455fde 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -125,6 +125,10 @@ class VPRecipeBuilder {
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPBasicBlock *VPBB);
 
+  VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
+                                            unsigned ScaleFactor,
+                                            ArrayRef<VPValue *> Operands);
+
   /// Set the recipe created for given ingredient.
   void setRecipe(Instruction *I, VPRecipeBase *R) {
     assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 18f5f13073aa63..eb5e187a118272 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -879,6 +879,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPWidenPointerInductionSC:
     case VPRecipeBase::VPReductionPHISC:
     case VPRecipeBase::VPScalarCastSC:
+    case VPRecipeBase::VPPartialReductionSC:
       return true;
     case VPRecipeBase::VPBranchOnMaskSC:
     case VPRecipeBase::VPInterleaveSC:
@@ -2327,23 +2328,31 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
   /// The phi is part of an ordered reduction. Requires IsInLoop to be true.
   bool IsOrdered;
 
+  /// The scaling difference between the size of the output of the entire
+  /// reduction and the size of the input.
+
+  /// When expanding the reduction PHI, the plan's VF element count is divided
+  /// by this factor to form the reduction phi's VF.
+  unsigned VFScaleFactor = 1;
+
 public:
   /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
   /// RdxDesc.
   VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
                        VPValue &Start, bool IsInLoop = false,
-                       bool IsOrdered = false)
+                       bool IsOrdered = false, unsigned VFScaleFactor = 1)
       : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
-        RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
+        RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
+        VFScaleFactor(VFScaleFactor) {
     assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
   }
 
   ~VPReductionPHIRecipe() override = default;
 
   VPReductionPHIRecipe *clone() override {
-    auto *R =
-        new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
-                                 *getOperand(0), IsInLoop, IsOrdered);
+    auto *R = new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()),
+                                       RdxDesc, *getOperand(0), IsInLoop,
+                                       IsOrdered, VFScaleFactor);
     R->addOperand(getBackedgeValue());
     return R;
   }
@@ -2374,6 +2383,39 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
   bool isInLoop() const { return IsInLoop; }
 };
 
+/// A recipe for forming partial reductions. In the loop, an accumulator and
+/// vector operand are added together and passed to the next iteration as the
+/// next accumulator. After the loop body, the accumulator is reduced to a
+/// scalar value.
+class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
+  unsigned Opcode;
+  Instruction &Reduction;
+
+public:
+  template <typename IterT>
+  VPPartialReductionRecipe(Instruction &I, iterator_range<IterT> Operands)
+      : VPRecipeWithIRFlags(VPDef::VPPartialReductionSC, Operands, I),
+        Opcode(I.getOpcode()), Reduction(I) {
+    assert(isa<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe()) &&
+           "Unexpected operand order for partial reduction recipe");
+  }
+  ~VPPartialReductionRecipe() override = default;
+  VPPartialReductionRecipe *clone() override {
+    auto Ops = operands();
+    return new VPPartialReductionRecipe(Reduction,
+                                        make_range(Ops.begin(), Ops.end()));
+  }
+  VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
+  /// Generate the reduction in the loop
+  void execute(VPTransformState &State) override;
+  unsigned getOpcode() { return Opcode; }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
 /// instructions.
 class VPBlendRecipe : public VPSingleDefRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 8b8ab6be99b0d5..1d8d949a8b9306 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -235,6 +235,11 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
   llvm_unreachable("Unhandled opcode");
 }
 
+Type *
+VPTypeAnalysis::inferScalarTypeForRecipe(const VPPartialReductionRecipe *R) {
+  return R->getUnderlyingInstr()->getType();
+}
+
 Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
   if (Type *CachedTy = CachedTypes.lookup(V))
     return CachedTy;
@@ -269,7 +274,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
               })
           .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPWidenEVLRecipe,
                 VPReplicateRecipe, VPWidenCallRecipe, VPWidenMemoryRecipe,
-                VPWidenSelectRecipe>(
+                VPWidenSelectRecipe, VPPartialReductionRecipe>(
               [this](const auto *R) { return inferScalarTypeForRecipe(R); })
           .Case<VPWidenIntrinsicRecipe>([](const VPWidenIntrinsicRecipe *R) {
             return R->getResultType();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index cc21870bee2e3b..a34d9629eff9dd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -27,6 +27,7 @@ struct VPWidenSelectRecipe;
 class VPReplicateRecipe;
 class VPRecipeBase;
 class VPlan;
+class VPPartialReductionRecipe;
 class Type;
 
 /// An analysis for type-inference for VPValues.
@@ -53,6 +54,7 @@ class VPTypeAnalysis {
   Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R);
   Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R);
   Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R);
+  Type *inferScalarTypeForRecipe(const VPPartialReductionRecipe *R);
 
 public:
   VPTypeAnalysis(Type *CanonicalIVTy)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6254ea15191819..b8efa10f2f9af7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -291,6 +291,50 @@ InstructionCost VPRecipeBase::computeCost(ElementCount VF,
   llvm_unreachable("subclasses should implement computeCost");
 }
 
+InstructionCost VPSingleDefRecipe::computeCost(ElementCount VF,
+                                               VPCostContext &Ctx) const {
+  Instruction *UI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+  if (isa<VPReplicateRecipe>(this)) {
+    assert(UI && "VPReplicateRecipe must have an underlying instruction");
+    // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
+    // transform, avoid computing their cost multiple times for now.
+    Ctx.SkipCostComputation.insert(UI);
+  }
+  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
+}
+
+void VPPartialReductionRecipe::execute(VPTransformState &State) {
+  State.setDebugLocFrom(getDebugLoc());
+  auto &Builder = State.Builder;
+
+  assert(Opcode == Instruction::Add && "Unhandled partial reduction opcode");
+
+  Value *BinOpVal = State.get(getOperand(0), 0);
+  Value *PhiVal = State.get(getOperand(1), 0);
+  assert(PhiVal && BinOpVal && "Phi and Mul must be set");
+
+  Type *RetTy = PhiVal->getType();
+
+  CallInst *V = Builder.CreateIntrinsic(
+      RetTy, Intrinsic::experimental_vector_partial_reduce_add,
+      {PhiVal, BinOpVal}, nullptr, Twine("partial.reduce"));
+
+  // Use this vector value for all users of the original instruction.
+  State.set(this, V, 0);
+  State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
+                                     VPSlotTracker &SlotTracker) const {
+  O << Indent << "PARTIAL-REDUCE ";
+  printAsOperand(O, SlotTracker);
+  O << " = " << Instruction::getOpcodeName(Opcode);
+  printFlags(O);
+  printOperands(O, SlotTracker);
+}
+#endif
+
 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
   assert(OpType == OperationType::FPMathOp &&
          "recipe doesn't have fast math flags");
@@ -3341,6 +3385,8 @@ void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
 void VPReductionPHIRecipe::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
 
+  auto VF = State.VF.divideCoefficientBy(VFScaleFactor);
+
   // Reductions do not have to start at zero. They can start with
   // any loop invariant values.
   VPValue *StartVPV = getStartValue();
@@ -3350,9 +3396,9 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
   // this value when we vectorize all of the instructions that use the PHI.
-  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
-  Type *VecTy = ScalarPHI ? StartV->getType()
-                          : VectorType::get(StartV->getType(), State.VF);
+  bool ScalarPHI = VF.isScalar() || IsInLoop;
+  Type *VecTy =
+      ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF);
 
   BasicBlock *HeaderBB = State.CFG.PrevBB;
   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
@@ -3386,13 +3432,13 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
         // Create start and identity vector values for the reduction in the
         // preheader.
         // TODO: Introduce recipes in VPlan preheader to create initial values.
-        Iden = Builder.CreateVectorSplat(State.VF, Iden);
+        Iden = Builder.CreateVectorSplat(VF, Iden);
         IRBuilderBase::InsertPointGuard IPBuilder(Builder);
         Builder.SetInsertPoint(VectorPH->getTerminator());
         Constant *Zero = Builder.getInt32(0);
         StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
       } else {
-        Iden = Builder.CreateVectorSplat(State.VF, Iden);
+        Iden = Builder.CreateVectorSplat(VF, Iden);
       }
     }
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 691b0d40823cfb..465a985792f9ab 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -329,6 +329,7 @@ class VPDef {
     VPInterleaveSC,
     VPReductionEVLSC,
     VPReductionSC,
+    VPPartialReductionSC,
     VPReplicateSC,
     VPScalarCastSC,
     VPScalarIVStepsSC,
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
new file mode 100644
index 00000000000000..fc6e3239a1b43c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define void @dotp(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define void @dotp(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP19:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = mul <vscale x 16 x i32> [[TMP27]], [[TMP19]]
+; CHECK-NEXT:    [[TMP14]] = add <vscale x 16 x i32> [[TMP29]], [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TMP20:%.*]] = lshr i32 [[ADD_LCSSA]], 0
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ACC_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP18]] to i32
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP22]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[CONV3]], [[CONV]]
+; CHECK-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACC_010]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %0 = lshr i32 %add, 0
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %acc.010 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr i8, ptr %a, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %1 to i32
+  %arrayidx2 = getelementptr i8, ptr %b, i64 %indvars.iv
+  %2 = load i8, ptr %arrayidx2, align 1
+  %conv3 = zext i8 %2 to i32
+  %mul = mul i32 %conv3, %conv
+  %add = add i32 %mul, %acc.010
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 0
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+
+; uselistorder directives
+  uselistorder i32 %add, { 1, 0 }
+}
+
+attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
new file mode 100644
index 00000000000000..bcac4d674123d8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -0,0 +1,1322 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVE1
+; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVED
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @dotp(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVE1-NEXT:  entry:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1:       vector.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP13]])
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1:       middle.block:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVED-NEXT:  entry:
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP15]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = mul <vscale x 16 x i32> [[TMP19]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = mul <vscale x 16 x i32> [[TMP20]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP22]])
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED:       middle.block:
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %result = lshr i32 %add, 0
+  ret i32 %result
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_different_types(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT:  entry:
+; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1:       vector.ph:
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3
+; CHECK-INTERLEAVE1-NEXT:    [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5
+; CHECK-INTERLEAVE1-NEXT:    [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6
+; CHECK-INTERLEAVE1-NEXT:    [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7
+; CHECK-INTERLEAVE1-NEXT:    [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9
+; CHECK-INTERLEAVE1-NEXT:    [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10
+; CHECK-INTERLEAVE1-NEXT:    [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11
+; CHECK-INTERLEAVE1-NEXT:    [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12
+; CHECK-INTERLEAVE1-NEXT:    [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13
+; CHECK-INTERLEAVE1-NEXT:    [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14
+; CHECK-INTERLEAVE1-NEXT:    [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15
+; CHECK-INTERLEAVE1-NEXT:    [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVE1:       middle.block:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
+; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-INTERLEAVED-LABEL: define void @not_dotp_different_types(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT:  entry:
+; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 17
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 18
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 19
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 20
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 21
+; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = add i64 [[INDEX]], 22
+; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 23
+; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = add i64 [[INDEX]], 24
+; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = add i64 [[INDEX]], 25
+; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = add i64 [[INDEX]], 26
+; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = add i64 [[INDEX]], 27
+; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = add i64 [[INDEX]], 28
+; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = add i64 [[INDEX]], 29
+; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = add i64 [[INDEX]], 30
+; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 31
+; CHECK-INTERLEAVED-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP69:%.*]] = load i16, ptr [[TMP37]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP70:%.*]] = load i16, ptr [[TMP38]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP71:%.*]] = load i16, ptr [[TMP39]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP72:%.*]] = load i16, ptr [[TMP40]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP73:%.*]] = load i16, ptr [[TMP41]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP74:%.*]] = load i16, ptr [[TMP42]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP75:%.*]] = load i16, ptr [[TMP43]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP76:%.*]] = load i16, ptr [[TMP44]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP77:%.*]] = load i16, ptr [[TMP45]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP78:%.*]] = load i16, ptr [[TMP46]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP79:%.*]] = load i16, ptr [[TMP47]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP80:%.*]] = load i16, ptr [[TMP48]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP81:%.*]] = load i16, ptr [[TMP49]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3
+; CHECK-INTERLEAVED-NEXT:    [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5
+; CHECK-INTERLEAVED-NEXT:    [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6
+; CHECK-INTERLEAVED-NEXT:    [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7
+; CHECK-INTERLEAVED-NEXT:    [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9
+; CHECK-INTERLEAVED-NEXT:    [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10
+; CHECK-INTERLEAVED-NEXT:    [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11
+; CHECK-INTERLEAVED-NEXT:    [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12
+; CHECK-INTERLEAVED-NEXT:    [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
+; CHECK-INTERLEAVED-NEXT:    [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
+; CHECK-INTERLEAVED-NEXT:    [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
+; CHECK-INTERLEAVED-NEXT:    [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP104:%.*]] = load i16, ptr [[TMP56]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP105:%.*]] = load i16, ptr [[TMP57]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP106:%.*]] = load i16, ptr [[TMP58]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP107:%.*]] = load i16, ptr [[TMP59]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP108:%.*]] = load i16, ptr [[TMP60]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP109:%.*]] = load i16, ptr [[TMP61]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP110:%.*]] = load i16, ptr [[TMP62]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP111:%.*]] = load i16, ptr [[TMP63]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP112:%.*]] = load i16, ptr [[TMP64]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP113:%.*]] = load i16, ptr [[TMP65]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3
+; CHECK-INTERLEAVED-NEXT:    [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5
+; CHECK-INTERLEAVED-NEXT:    [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6
+; CHECK-INTERLEAVED-NEXT:    [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7
+; CHECK-INTERLEAVED-NEXT:    [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9
+; CHECK-INTERLEAVED-NEXT:    [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10
+; CHECK-INTERLEAVED-NEXT:    [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11
+; CHECK-INTERLEAVED-NEXT:    [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12
+; CHECK-INTERLEAVED-NEXT:    [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13
+; CHECK-INTERLEAVED-NEXT:    [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14
+; CHECK-INTERLEAVED-NEXT:    [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15
+; CHECK-INTERLEAVED-NEXT:    [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT:    [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVED:       middle.block:
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP140:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %0 = lshr i32 %add, 0
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i16, ptr %gep.b, align 2
+  %ext.b = zext i16 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT:  entry:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1:       vector.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1)
+; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]]
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT:  entry:
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1)
+; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %0 = lshr i32 %add, 0
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @not_dotp_not_phi(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_not_phi(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT:  entry:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1:       vector.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP17]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_phi(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT:  entry:
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP23]] = add <vscale x 8 x i32> [[TMP22]], [[TMP21]]
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %0 = lshr i32 %add, 0
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %ext.b
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %w, ptr %scales, ptr %u, ptr %v) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @dotp_unrolled(
+; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[W:%.*]], ptr [[SCALES:%.*]], ptr [[U:%.*]], ptr [[V:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT:  entry:
+; CHECK-INTERLEAVE1-NEXT:    [[CMP154:%.*]] = icmp sgt i32 [[NUM_OUT]], 3
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP154]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END98:%.*]]
+; CHECK-INTERLEAVE1:       for.body.lr.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[DIV:%.*]] = sdiv i32 [[NUM_OUT]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = shl nsw i32 [[DIV]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[CMP11145:%.*]] = icmp sgt i32 [[NUM_IN]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[IDXPROM44:%.*]] = sext i32 [[NUM_IN]] to i64
+; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP11145]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK-INTERLEAVE1:       for.body.preheader:
+; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_END98]]
+; CHECK-INTERLEAVE1:       for.body.us.preheader:
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
+; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY_US:%.*]]
+; CHECK-INTERLEAVE1:       for.body.us:
+; CHECK-INTERLEAVE1-NEXT:    [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT165:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[INDVARS_IV164]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[INDVARS_IV164]], 1
+; CHECK-INTERLEAVE1-NEXT:    [[ARRAYIDX3_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3_US]], align 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[INDVARS_IV164]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX6_US]], align 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[INDVARS_IV164]], 3
+; CHECK-INTERLEAVE1-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX9_US]], align 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1:       vector.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE181:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI172:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE179:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI173:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE177:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI174:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD175:%.*]] = load <vscale x 16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD175]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP17]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI174]], <vscale x 16 x i32> [[TMP21]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD176:%.*]] = load <vscale x 16 x i8>, ptr [[TMP23]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD176]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP20]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE177]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI173]], <vscale x 16 x i32> [[TMP25]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD178:%.*]] = load <vscale x 16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP28:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD178]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP28]], [[TMP20]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE179]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI172]], <vscale x 16 x i32> [[TMP29]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD180:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD180]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP32]], [[TMP20]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE181]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP33]])
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK-INTERLEAVE1:       middle.block:
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE177_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE177]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE179_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE179]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE181_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE181]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE181_LCSSA]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE179_LCSSA]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE177_LCSSA]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_LCSSA]])
+; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1:       scalar.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX182:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX183:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX184:%.*]] = phi i32 [ [[TMP38]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP39:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[BC_RESUME_VAL]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP40]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP39]], [[TMP41]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label [[SCALAR_PH6:%.*]], label [[VECTOR_PH8:%.*]]
+; CHECK-INTERLEAVE1:       vector.ph8:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF9:%.*]] = urem i64 [[TMP39]], [[TMP43]]
+; CHECK-INTERLEAVE1-NEXT:    [[N_VEC10:%.*]] = sub i64 [[TMP39]], [[N_MOD_VF9]]
+; CHECK-INTERLEAVE1-NEXT:    [[IND_END:%.*]] = add i64 [[BC_RESUME_VAL]], [[N_VEC10]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP46:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP47:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX182]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP48:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX183]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP49:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX184]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY12:%.*]]
+; CHECK-INTERLEAVE1:       vector.body12:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX13:%.*]] = phi i64 [ 0, [[VECTOR_PH8]] ], [ [[INDEX_NEXT27:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI14:%.*]] = phi <vscale x 4 x i32> [ [[TMP46]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE26:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI15:%.*]] = phi <vscale x 4 x i32> [ [[TMP47]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE24:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI16:%.*]] = phi <vscale x 4 x i32> [ [[TMP48]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE22:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI17:%.*]] = phi <vscale x 4 x i32> [ [[TMP49]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE20:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[BC_RESUME_VAL]], [[INDEX13]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP50]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP51]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD18:%.*]] = load <vscale x 16 x i8>, ptr [[TMP52]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP53:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD18]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP50]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD19:%.*]] = load <vscale x 16 x i8>, ptr [[TMP55]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP56:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD19]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP57:%.*]] = mul nsw <vscale x 16 x i32> [[TMP56]], [[TMP53]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE20]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI17]], <vscale x 16 x i32> [[TMP57]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP50]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD21:%.*]] = load <vscale x 16 x i8>, ptr [[TMP59]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP60:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD21]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP61:%.*]] = mul nsw <vscale x 16 x i32> [[TMP60]], [[TMP56]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE22]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI16]], <vscale x 16 x i32> [[TMP61]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP50]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[TMP62]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD23:%.*]] = load <vscale x 16 x i8>, ptr [[TMP63]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP64:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD23]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP65:%.*]] = mul nsw <vscale x 16 x i32> [[TMP64]], [[TMP56]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE24]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI15]], <vscale x 16 x i32> [[TMP65]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP50]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds i8, ptr [[TMP66]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD25:%.*]] = load <vscale x 16 x i8>, ptr [[TMP67]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP68:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD25]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP69:%.*]] = mul nsw <vscale x 16 x i32> [[TMP68]], [[TMP56]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE26]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI14]], <vscale x 16 x i32> [[TMP69]])
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT27]] = add nuw i64 [[INDEX13]], [[TMP45]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC10]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP70]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY12]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-INTERLEAVE1:       middle.block5:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE26]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE24]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE22]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP74:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE20]])
+; CHECK-INTERLEAVE1-NEXT:    [[CMP_N28:%.*]] = icmp eq i64 [[TMP39]], [[N_VEC10]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N28]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US_LOOPEXIT:%.*]], label [[SCALAR_PH6]]
+;
+; CHECK-INTERLEAVED-LABEL: define void @dotp_unrolled(
+; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[W:%.*]], ptr [[SCALES:%.*]], ptr [[U:%.*]], ptr [[V:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT:  entry:
+; CHECK-INTERLEAVED-NEXT:    [[CMP154:%.*]] = icmp sgt i32 [[NUM_OUT]], 3
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP154]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END98:%.*]]
+; CHECK-INTERLEAVED:       for.body.lr.ph:
+; CHECK-INTERLEAVED-NEXT:    [[DIV:%.*]] = sdiv i32 [[NUM_OUT]], 4
+; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = shl nsw i32 [[DIV]], 2
+; CHECK-INTERLEAVED-NEXT:    [[CMP11145:%.*]] = icmp sgt i32 [[NUM_IN]], 0
+; CHECK-INTERLEAVED-NEXT:    [[IDXPROM44:%.*]] = sext i32 [[NUM_IN]] to i64
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP11145]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK-INTERLEAVED:       for.body.preheader:
+; CHECK-INTERLEAVED-NEXT:    br label [[FOR_END98]]
+; CHECK-INTERLEAVED:       for.body.us.preheader:
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
+; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY_US:%.*]]
+; CHECK-INTERLEAVED:       for.body.us:
+; CHECK-INTERLEAVED-NEXT:    [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT165:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
+; CHECK-INTERLEAVED-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[INDVARS_IV164]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[INDVARS_IV164]], 1
+; CHECK-INTERLEAVED-NEXT:    [[ARRAYIDX3_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3_US]], align 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[INDVARS_IV164]], 2
+; CHECK-INTERLEAVED-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX6_US]], align 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[INDVARS_IV164]], 3
+; CHECK-INTERLEAVED-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX9_US]], align 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP9]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
+; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE181:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI172:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE179:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI173:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE177:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI174:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD175:%.*]] = load <vscale x 16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD175]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP17]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI174]], <vscale x 16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD176:%.*]] = load <vscale x 16 x i8>, ptr [[TMP23]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD176]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE177]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI173]], <vscale x 16 x i32> [[TMP25]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD178:%.*]] = load <vscale x 16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD178]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP28]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE179]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI172]], <vscale x 16 x i32> [[TMP29]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD180:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD180]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP32]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE181]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP33]])
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK-INTERLEAVED:       middle.block:
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE177_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE177]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE179_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE179]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE181_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE181]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE181_LCSSA]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE179_LCSSA]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE177_LCSSA]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_LCSSA]])
+; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED:       scalar.ph:
+; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX182:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX183:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX184:%.*]] = phi i32 [ [[TMP38]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP39:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[BC_RESUME_VAL]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP40]], 32
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP39]], [[TMP41]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label [[SCALAR_PH6:%.*]], label [[VECTOR_PH8:%.*]]
+; CHECK-INTERLEAVED:       vector.ph8:
+; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 32
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF9:%.*]] = urem i64 [[TMP39]], [[TMP43]]
+; CHECK-INTERLEAVED-NEXT:    [[N_VEC10:%.*]] = sub i64 [[TMP39]], [[N_MOD_VF9]]
+; CHECK-INTERLEAVED-NEXT:    [[IND_END:%.*]] = add i64 [[BC_RESUME_VAL]], [[N_VEC10]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 32
+; CHECK-INTERLEAVED-NEXT:    [[TMP46:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP47:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX182]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP48:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX183]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP49:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX184]], i32 0
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY12:%.*]]
+; CHECK-INTERLEAVED:       vector.body12:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX13:%.*]] = phi i64 [ 0, [[VECTOR_PH8]] ], [ [[INDEX_NEXT40:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI14:%.*]] = phi <vscale x 4 x i32> [ [[TMP46]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE38:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI15:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE39:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI16:%.*]] = phi <vscale x 4 x i32> [ [[TMP47]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE34:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI17:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE35:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI18:%.*]] = phi <vscale x 4 x i32> [ [[TMP48]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE30:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI19:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE31:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI20:%.*]] = phi <vscale x 4 x i32> [ [[TMP49]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE26:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI21:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE27:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[BC_RESUME_VAL]], [[INDEX13]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP50]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP51]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP53:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP54:%.*]] = mul i64 [[TMP53]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP51]], i64 [[TMP54]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD22:%.*]] = load <vscale x 16 x i8>, ptr [[TMP52]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD23:%.*]] = load <vscale x 16 x i8>, ptr [[TMP55]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP56:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD22]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP57:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD23]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP50]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP60:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP61:%.*]] = mul i64 [[TMP60]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i64 [[TMP61]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD24:%.*]] = load <vscale x 16 x i8>, ptr [[TMP59]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD25:%.*]] = load <vscale x 16 x i8>, ptr [[TMP62]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP63:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD24]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP64:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD25]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP65:%.*]] = mul nsw <vscale x 16 x i32> [[TMP63]], [[TMP56]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP66:%.*]] = mul nsw <vscale x 16 x i32> [[TMP64]], [[TMP57]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE26]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI20]], <vscale x 16 x i32> [[TMP65]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE27]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI21]], <vscale x 16 x i32> [[TMP66]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP67:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP50]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[TMP67]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP70:%.*]] = mul i64 [[TMP69]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i8, ptr [[TMP67]], i64 [[TMP70]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD28:%.*]] = load <vscale x 16 x i8>, ptr [[TMP68]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD29:%.*]] = load <vscale x 16 x i8>, ptr [[TMP71]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP72:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD28]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP73:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD29]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP74:%.*]] = mul nsw <vscale x 16 x i32> [[TMP72]], [[TMP63]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP75:%.*]] = mul nsw <vscale x 16 x i32> [[TMP73]], [[TMP64]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE30]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI18]], <vscale x 16 x i32> [[TMP74]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE31]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI19]], <vscale x 16 x i32> [[TMP75]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP50]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP77:%.*]] = getelementptr inbounds i8, ptr [[TMP76]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP78:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP79:%.*]] = mul i64 [[TMP78]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP80:%.*]] = getelementptr inbounds i8, ptr [[TMP76]], i64 [[TMP79]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD32:%.*]] = load <vscale x 16 x i8>, ptr [[TMP77]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD33:%.*]] = load <vscale x 16 x i8>, ptr [[TMP80]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP81:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD32]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP82:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD33]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP83:%.*]] = mul nsw <vscale x 16 x i32> [[TMP81]], [[TMP63]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP84:%.*]] = mul nsw <vscale x 16 x i32> [[TMP82]], [[TMP64]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE34]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI16]], <vscale x 16 x i32> [[TMP83]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE35]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI17]], <vscale x 16 x i32> [[TMP84]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP85:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP50]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP86:%.*]] = getelementptr inbounds i8, ptr [[TMP85]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP87:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP88:%.*]] = mul i64 [[TMP87]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP89:%.*]] = getelementptr inbounds i8, ptr [[TMP85]], i64 [[TMP88]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD36:%.*]] = load <vscale x 16 x i8>, ptr [[TMP86]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD37:%.*]] = load <vscale x 16 x i8>, ptr [[TMP89]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP90:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD36]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP91:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD37]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP92:%.*]] = mul nsw <vscale x 16 x i32> [[TMP90]], [[TMP63]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP93:%.*]] = mul nsw <vscale x 16 x i32> [[TMP91]], [[TMP64]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE38]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI14]], <vscale x 16 x i32> [[TMP92]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE39]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI15]], <vscale x 16 x i32> [[TMP93]])
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT40]] = add nuw i64 [[INDEX13]], [[TMP45]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP94:%.*]] = icmp eq i64 [[INDEX_NEXT40]], [[N_VEC10]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP94]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY12]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-INTERLEAVED:       middle.block5:
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE39]], [[PARTIAL_REDUCE38]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP95:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX41:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE35]], [[PARTIAL_REDUCE34]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP96:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX41]])
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX42:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE31]], [[PARTIAL_REDUCE30]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP97:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX42]])
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX43:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE27]], [[PARTIAL_REDUCE26]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP98:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX43]])
+; CHECK-INTERLEAVED-NEXT:    [[CMP_N44:%.*]] = icmp eq i64 [[TMP39]], [[N_VEC10]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N44]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US_LOOPEXIT:%.*]], label [[SCALAR_PH6]]
+;
+entry:
+  %cmp154 = icmp sgt i32 %num_out, 3
+  br i1 %cmp154, label %for.body.lr.ph, label %for.end98
+
+for.body.lr.ph:                                   ; preds = %entry
+  %div = sdiv i32 %num_out, 4
+  %mul = shl nsw i32 %div, 2
+  %cmp11145 = icmp sgt i32 %num_in, 0
+  %idxprom44 = sext i32 %num_in to i64
+  %0 = zext nneg i32 %mul to i64
+  br i1 %cmp11145, label %for.body.us.preheader, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %for.body.lr.ph
+  br label %for.end98
+
+for.body.us.preheader:                            ; preds = %for.body.lr.ph
+  %wide.trip.count = zext nneg i32 %num_in to i64
+  br label %for.body.us
+
+for.body.us:                                      ; preds = %for.body.us.preheader, %for.cond10.for.cond.cleanup_crit_edge.us
+  %indvars.iv164 = phi i64 [ 0, %for.body.us.preheader ], [ %indvars.iv.next165, %for.cond10.for.cond.cleanup_crit_edge.us ]
+  %arrayidx.us = getelementptr inbounds ptr, ptr %w, i64 %indvars.iv164
+  %1 = load ptr, ptr %arrayidx.us, align 8
+  %2 = or disjoint i64 %indvars.iv164, 1
+  %arrayidx3.us = getelementptr inbounds ptr, ptr %w, i64 %2
+  %3 = load ptr, ptr %arrayidx3.us, align 8
+  %4 = or disjoint i64 %indvars.iv164, 2
+  %arrayidx6.us = getelementptr inbounds ptr, ptr %w, i64 %4
+  %5 = load ptr, ptr %arrayidx6.us, align 8
+  %6 = or disjoint i64 %indvars.iv164, 3
+  %arrayidx9.us = getelementptr inbounds ptr, ptr %w, i64 %6
+  %7 = load ptr, ptr %arrayidx9.us, align 8
+  %8 = call i64 @llvm.vscale.i64()
+  %9 = mul i64 %8, 16
+  %min.iters.check = icmp ult i64 %wide.trip.count, %9
+  br i1 %min.iters.check, label %scalar.ph, label %vector.ph
+
+vector.ph:                                        ; preds = %for.body.us
+  %10 = call i64 @llvm.vscale.i64()
+  %11 = mul i64 %10, 16
+  %n.mod.vf = urem i64 %wide.trip.count, %11
+  %n.vec = sub i64 %wide.trip.count, %n.mod.vf
+  %12 = call i64 @llvm.vscale.i64()
+  %13 = mul i64 %12, 16
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce181, %vector.body ]
+  %vec.phi172 = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce179, %vector.body ]
+  %vec.phi173 = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce177, %vector.body ]
+  %vec.phi174 = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce, %vector.body ]
+  %14 = add i64 %index, 0
+  %15 = getelementptr inbounds i8, ptr %1, i64 %14
+  %16 = getelementptr inbounds i8, ptr %15, i32 0
+  %wide.load = load <vscale x 16 x i8>, ptr %16, align 1
+  %17 = sext <vscale x 16 x i8> %wide.load to <vscale x 16 x i32>
+  %18 = getelementptr inbounds i8, ptr %u, i64 %14
+  %19 = getelementptr inbounds i8, ptr %18, i32 0
+  %wide.load175 = load <vscale x 16 x i8>, ptr %19, align 1
+  %20 = sext <vscale x 16 x i8> %wide.load175 to <vscale x 16 x i32>
+  %21 = mul nsw <vscale x 16 x i32> %20, %17
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi174, <vscale x 16 x i32> %21)
+  %22 = getelementptr inbounds i8, ptr %3, i64 %14
+  %23 = getelementptr inbounds i8, ptr %22, i32 0
+  %wide.load176 = load <vscale x 16 x i8>, ptr %23, align 1
+  %24 = sext <vscale x 16 x i8> %wide.load176 to <vscale x 16 x i32>
+  %25 = mul nsw <vscale x 16 x i32> %24, %20
+  %partial.reduce177 = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi173, <vscale x 16 x i32> %25)
+  %26 = getelementptr inbounds i8, ptr %5, i64 %14
+  %27 = getelementptr inbounds i8, ptr %26, i32 0
+  %wide.load178 = load <vscale x 16 x i8>, ptr %27, align 1
+  %28 = sext <vscale x 16 x i8> %wide.load178 to <vscale x 16 x i32>
+  %29 = mul nsw <vscale x 16 x i32> %28, %20
+  %partial.reduce179 = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi172, <vscale x 16 x i32> %29)
+  %30 = getelementptr inbounds i8, ptr %7, i64 %14
+  %31 = getelementptr inbounds i8, ptr %30, i32 0
+  %wide.load180 = load <vscale x 16 x i8>, ptr %31, align 1
+  %32 = sext <vscale x 16 x i8> %wide.load180 to <vscale x 16 x i32>
+  %33 = mul nsw <vscale x 16 x i32> %32, %20
+  %partial.reduce181 = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %33)
+  %index.next = add nuw i64 %index, %13
+  %34 = icmp eq i64 %index.next, %n.vec
+  br i1 %34, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %35 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce181)
+  %36 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce179)
+  %37 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce177)
+  %38 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce)
+  %cmp.n = icmp eq i64 %wide.trip.count, %n.vec
+  br i1 %cmp.n, label %for.cond10.for.cond.cleanup_crit_edge.us, label %scalar.ph
+
+scalar.ph:                                        ; preds = %middle.block, %for.body.us
+  %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.us ]
+  %bc.merge.rdx = phi i32 [ %35, %middle.block ], [ 0, %for.body.us ]
+  %bc.merge.rdx182 = phi i32 [ %36, %middle.block ], [ 0, %for.body.us ]
+  %bc.merge.rdx183 = phi i32 [ %37, %middle.block ], [ 0, %for.body.us ]
+  %bc.merge.rdx184 = phi i32 [ %38, %middle.block ], [ 0, %for.body.us ]
+  br label %for.body12.us
+
+for.body12.us:                                    ; preds = %scalar.ph, %for.body12.us
+  %indvars.iv161 = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next162, %for.body12.us ]
+  %total3.0149.us = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %add43.us, %for.body12.us ]
+  %total2.0148.us = phi i32 [ %bc.merge.rdx182, %scalar.ph ], [ %add35.us, %for.body12.us ]
+  %total1.0147.us = phi i32 [ %bc.merge.rdx183, %scalar.ph ], [ %add27.us, %for.body12.us ]
+  %total0.0146.us = phi i32 [ %bc.merge.rdx184, %scalar.ph ], [ %add19.us, %for.body12.us ]
+  %arrayidx14.us = getelementptr inbounds i8, ptr %1, i64 %indvars.iv161
+  %39 = load i8, ptr %arrayidx14.us, align 1
+  %conv.us = sext i8 %39 to i32
+  %arrayidx16.us = getelementptr inbounds i8, ptr %u, i64 %indvars.iv161
+  %40 = load i8, ptr %arrayidx16.us, align 1
+  %conv17.us = sext i8 %40 to i32
+  %mul18.us = mul nsw i32 %conv17.us, %conv.us
+  %add19.us = add nsw i32 %mul18.us, %total0.0146.us
+  %arrayidx21.us = getelementptr inbounds i8, ptr %3, i64 %indvars.iv161
+  %41 = load i8, ptr %arrayidx21.us, align 1
+  %conv22.us = sext i8 %41 to i32
+  %mul26.us = mul nsw i32 %conv22.us, %conv17.us
+  %add27.us = add nsw i32 %mul26.us, %total1.0147.us
+  %arrayidx29.us = getelementptr inbounds i8, ptr %5, i64 %indvars.iv161
+  %42 = load i8, ptr %arrayidx29.us, align 1
+  %conv30.us = sext i8 %42 to i32
+  %mul34.us = mul nsw i32 %conv30.us, %conv17.us
+  %add35.us = add nsw i32 %mul34.us, %total2.0148.us
+  %arrayidx37.us = getelementptr inbounds i8, ptr %7, i64 %indvars.iv161
+  %43 = load i8, ptr %arrayidx37.us, align 1
+  %conv38.us = sext i8 %43 to i32
+  %mul42.us = mul nsw i32 %conv38.us, %conv17.us
+  %add43.us = add nsw i32 %mul42.us, %total3.0149.us
+  %indvars.iv.next162 = add nuw nsw i64 %indvars.iv161, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next162, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond10.for.cond.cleanup_crit_edge.us, label %for.body12.us
+
+for.cond10.for.cond.cleanup_crit_edge.us:         ; preds = %middle.block, %for.body12.us
+  %add19.us.lcssa = phi i32 [ %add19.us, %for.body12.us ], [ %38, %middle.block ]
+  %add27.us.lcssa = phi i32 [ %add27.us, %for.body12.us ], [ %37, %middle.block ]
+  %add35.us.lcssa = phi i32 [ %add35.us, %for.body12.us ], [ %36, %middle.block ]
+  %add43.us.lcssa = phi i32 [ %add43.us, %for.body12.us ], [ %35, %middle.block ]
+  %arrayidx45.us = getelementptr inbounds i8, ptr %1, i64 %idxprom44
+  %44 = load i8, ptr %arrayidx45.us, align 1
+  %conv46.us = sext i8 %44 to i32
+  %mul47.us = mul nsw i32 %conv46.us, 127
+  %add48.us = add nsw i32 %mul47.us, %add19.us.lcssa
+  %conv49.us = sitofp i32 %add48.us to float
+  %arrayidx52.us = getelementptr inbounds float, ptr %scales, i64 %indvars.iv164
+  %45 = load float, ptr %arrayidx52.us, align 4
+  %mul53.us = fmul float %45, %conv49.us
+  %arrayidx56.us = getelementptr inbounds float, ptr %v, i64 %indvars.iv164
+  store float %mul53.us, ptr %arrayidx56.us, align 4
+  %arrayidx58.us = getelementptr inbounds i8, ptr %3, i64 %idxprom44
+  %46 = load i8, ptr %arrayidx58.us, align 1
+  %conv59.us = sext i8 %46 to i32
+  %mul60.us = mul nsw i32 %conv59.us, 127
+  %add61.us = add nsw i32 %mul60.us, %add27.us.lcssa
+  %conv62.us = sitofp i32 %add61.us to float
+  %arrayidx65.us = getelementptr inbounds float, ptr %scales, i64 %2
+  %47 = load float, ptr %arrayidx65.us, align 4
+  %mul66.us = fmul float %47, %conv62.us
+  %arrayidx69.us = getelementptr inbounds float, ptr %v, i64 %2
+  store float %mul66.us, ptr %arrayidx69.us, align 4
+  %arrayidx71.us = getelementptr inbounds i8, ptr %5, i64 %idxprom44
+  %48 = load i8, ptr %arrayidx71.us, align 1
+  %conv72.us = sext i8 %48 to i32
+  %mul73.us = mul nsw i32 %conv72.us, 127
+  %add74.us = add nsw i32 %mul73.us, %add35.us.lcssa
+  %conv75.us = sitofp i32 %add74.us to float
+  %arrayidx78.us = getelementptr inbounds float, ptr %scales, i64 %4
+  %49 = load float, ptr %arrayidx78.us, align 4
+  %mul79.us = fmul float %49, %conv75.us
+  %arrayidx82.us = getelementptr inbounds float, ptr %v, i64 %4
+  store float %mul79.us, ptr %arrayidx82.us, align 4
+  %arrayidx84.us = getelementptr inbounds i8, ptr %7, i64 %idxprom44
+  %50 = load i8, ptr %arrayidx84.us, align 1
+  %conv85.us = sext i8 %50 to i32
+  %mul86.us = mul nsw i32 %conv85.us, 127
+  %add87.us = add nsw i32 %mul86.us, %add43.us.lcssa
+  %conv88.us = sitofp i32 %add87.us to float
+  %arrayidx91.us = getelementptr inbounds float, ptr %scales, i64 %6
+  %51 = load float, ptr %arrayidx91.us, align 4
+  %mul92.us = fmul float %51, %conv88.us
+  %arrayidx95.us = getelementptr inbounds float, ptr %v, i64 %6
+  store float %mul92.us, ptr %arrayidx95.us, align 4
+  %indvars.iv.next165 = add nuw nsw i64 %indvars.iv164, 4
+  %cmp.us = icmp ult i64 %indvars.iv.next165, %0
+  br i1 %cmp.us, label %for.body.us, label %for.end98
+
+for.end98:                                        ; preds = %for.end98.loopexit171, %for.end98.loopexit, %entry
+  ret void
+}
+
+define i32 @not_dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @not_dotp_predicated(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[N]], 16
+; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[REM]], 0
+; CHECK-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REM]] to i64
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP7]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = sext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP10]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 8 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <vscale x 8 x i32> [[TMP11]], [[TMP8]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 8 x i32> [[TMP12]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP14]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i32> [[TMP13]], <vscale x 8 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 8 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP14]])
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+  %rem = srem i32 %N, 16
+  %cmp8 = icmp sgt i32 %rem, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext nneg i32 %rem to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %total.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %total.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %total.09 = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = sext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %conv3 = sext i8 %1 to i32
+  %mul = mul nsw i32 %conv3, %conv
+  %add = add nsw i32 %mul, %total.09
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define i32 @not_dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP10]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <vscale x 16 x i32> [[TMP11]], [[TMP8]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 16 x i32> [[TMP12]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP14]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP13]], <vscale x 16 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+  %cmp8.not = icmp eq i32 %N, 0
+  br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %total.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %total.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %total.09 = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = sext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %conv3 = sext i8 %1 to i32
+  %mul = mul nsw i32 %conv3, %conv
+  %add = add nsw i32 %mul, %total.09
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !7
+}
+
+!7 = distinct !{!7, !8, !9, !10}
+!8 = !{!"llvm.loop.mustprogress"}
+!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+!10 = !{!"llvm.loop.vectorize.enable", i1 true}
+attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
new file mode 100644
index 00000000000000..7fcb33b8584f33
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -0,0 +1,77 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+; Tests for printing VPlans that are enabled under AArch64
+
+define void @print_partial_reduction(ptr %a, ptr %b) {
+; CHECK-LABEL: Checking a loop in 'print_partial_reduction'
+; CHECK:      VPlan 'Initial VPlan for VF={2,4,8,16},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<0> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT:   EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
+; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<%add>
+; CHECK-NEXT:   vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:   CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<[[STEPS]]>
+; CHECK-NEXT:   vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:   WIDEN ir<%1> = load vp<%4>
+; CHECK-NEXT:   WIDEN-CAST ir<%conv> = zext ir<%1> to i32
+; CHECK-NEXT:   CLONE ir<%arrayidx2> = getelementptr ir<%b>, vp<[[STEPS]]>
+; CHECK-NEXT:   vp<%5> = vector-pointer ir<%arrayidx2>
+; CHECK-NEXT:   WIDEN ir<%2> = load vp<%5>
+; CHECK-NEXT:   WIDEN-CAST ir<%conv3> = zext ir<%2> to i32
+; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%conv3>, ir<%conv>
+; CHECK-NEXT:   PARTIAL-REDUCE ir<%add> = add ir<%mul>, ir<%acc.010>
+; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<%8> = compute-reduction-result ir<[[ACC]]>, ir<%add>
+; CHECK-NEXT:   EMIT vp<%9> = extract-from-end vp<%8>, ir<1>
+; CHECK-NEXT:   EMIT vp<%10> = icmp eq ir<0>, vp<%1>
+; CHECK-NEXT:   EMIT branch-on-cond vp<%10>
+; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%9>)
+; CHECK-NEXT:   IR   %0 = lshr i32 %add.lcssa, 0
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %0 = lshr i32 %add, 0
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %acc.010 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr i8, ptr %a, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %1 to i32
+  %arrayidx2 = getelementptr i8, ptr %b, i64 %indvars.iv
+  %2 = load i8, ptr %arrayidx2, align 1
+  %conv3 = zext i8 %2 to i32
+  %mul = mul i32 %conv3, %conv
+  %add = add i32 %mul, %acc.010
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 0
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}

>From 0b07c7d7c4007d9228f18f26b438ec806066f88a Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 10 Oct 2024 17:14:27 +0100
Subject: [PATCH 02/56] Use a different recipe for each VF range

---
 .../AArch64/AArch64TargetTransformInfo.h      |   26 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  259 ++--
 .../Transforms/Vectorize/VPRecipeBuilder.h    |   10 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   12 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |    2 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  |    1 +
 .../Transforms/Vectorize/VPlanTransforms.h    |    3 +
 .../AArch64/partial-reduce-dot-product.ll     | 1109 ++++++++---------
 .../LoopVectorize/AArch64/vplan-printing.ll   |   48 +-
 9 files changed, 690 insertions(+), 780 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a2b6fbbd6bb824..25517f1ccf42ce 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/InstructionCost.h"
 #include <cstdint>
 #include <optional>
 
@@ -348,7 +349,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
                           TTI::PartialReductionExtendKind OpAExtend,
                           TTI::PartialReductionExtendKind OpBExtend,
                           std::optional<unsigned> BinOp) const {
+
     InstructionCost Invalid = InstructionCost::getInvalid();
+    InstructionCost Cost(TTI::TCC_Basic);
 
     if (Opcode != Instruction::Add)
       return Invalid;
@@ -361,11 +364,28 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     if (VF.isFixed() && !ST->isNeonAvailable() && !ST->hasDotProd())
       return Invalid;
 
+    // FIXME: There should be a nicer way of doing this?
     if (InputEVT == MVT::i8) {
-      if (AccumEVT != MVT::i32)
+      switch (VF.getKnownMinValue()) {
+      default:
         return Invalid;
+      case 8:
+        if (AccumEVT == MVT::i32)
+          Cost *= 2;
+        else if (AccumEVT != MVT::i64)
+          return Invalid;
+        break;
+      case 16:
+        if (AccumEVT == MVT::i64)
+          Cost *= 2;
+        else if (AccumEVT != MVT::i32)
+          return Invalid;
+        break;
+      }
     } else if (InputEVT == MVT::i16) {
-      if (AccumEVT != MVT::i64)
+      // FIXME: Allow i32 accumulator but increase cost, as we would extend
+      //        it to i64.
+      if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64)
         return Invalid;
     } else
       return Invalid;
@@ -376,7 +396,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     if (!BinOp || (*BinOp) != Instruction::Mul)
       return Invalid;
 
-    return InstructionCost::getMin();
+    return Cost;
   }
 
   bool enableOrderedReductions() const { return true; }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0faea87decae80..d992927b5a9e15 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1573,100 +1573,6 @@ class LoopVectorizationCostModel {
     return PairIt->second;
   }
 
-  void removePartialReduction(Instruction *Instr) {
-    PartialReductionChains.erase(Instr);
-  }
-
-  void addPartialReductionIfSupported(Instruction *Instr, ElementCount VF) {
-
-    // Try to commutatively match:
-    // bin_op (one_use bin_op (z_or_sext, z_or_sext), phi)
-
-    auto *Root = dyn_cast<BinaryOperator>(Instr);
-    if (!Root)
-      return;
-
-    auto *BinOp = dyn_cast<BinaryOperator>(Root->getOperand(0));
-    auto *Phi = dyn_cast<PHINode>(Root->getOperand(1));
-    if (!BinOp) {
-      BinOp = dyn_cast<BinaryOperator>(Root->getOperand(1));
-      Phi = dyn_cast<PHINode>(Root->getOperand(0));
-    }
-    if (!BinOp || !BinOp->hasOneUse()) {
-      LLVM_DEBUG(
-          dbgs() << "Root was not a one-use binary operator, cannot create a "
-                    "partial reduction.\n");
-      return;
-    }
-    if (!Phi) {
-      LLVM_DEBUG(dbgs() << "Expected Phi node was not a phi, cannot create a "
-                           "partial reduction.\n");
-      return;
-    }
-
-    auto IsSextOrZext = [](Instruction *I) {
-      return I && (I->getOpcode() == Instruction::ZExt ||
-                   I->getOpcode() == Instruction::SExt);
-    };
-
-    auto *ExtA = dyn_cast<Instruction>(BinOp->getOperand(0));
-    auto *ExtB = dyn_cast<Instruction>(BinOp->getOperand(1));
-    if (!IsSextOrZext(ExtA) || !IsSextOrZext(ExtB)) {
-      LLVM_DEBUG(dbgs() << "Expected extends were not extends, cannot create a "
-                           "partial reduction.\n");
-      return;
-    }
-
-    Value *A = ExtA->getOperand(0);
-    Value *B = ExtB->getOperand(0);
-    // Check that the extends extend from the same type
-    if (A->getType() != B->getType()) {
-      LLVM_DEBUG(dbgs() << "Extends don't extend from the same type, cannot "
-                           "create a partial reduction.\n");
-      return;
-    }
-
-    // Check that the extends extend to the same type
-    if (ExtA->getType() != ExtB->getType()) {
-      LLVM_DEBUG(
-          dbgs() << "Extends don't extend to the same type, cannot create "
-                    "a partial reduction.\n");
-      return;
-    }
-
-    // Check that the second phi value is the instruction we're looking at
-    Instruction *MaybeAdd = dyn_cast<Instruction>(
-        Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
-    if (!MaybeAdd || MaybeAdd != Instr) {
-      LLVM_DEBUG(dbgs() << "Second PHI value is not the root binop, cannot "
-                           "create a partial reduction.\n");
-      return;
-    }
-
-    TTI::PartialReductionExtendKind OpAExtend =
-        TargetTransformInfo::getPartialReductionExtendKind(ExtA);
-    TTI::PartialReductionExtendKind OpBExtend =
-        TargetTransformInfo::getPartialReductionExtendKind(ExtB);
-    InstructionCost Cost = TTI.getPartialReductionCost(
-        Instr->getOpcode(), A->getType(), Phi->getType(), VF, OpAExtend,
-        OpBExtend, std::make_optional(BinOp->getOpcode()));
-    if (Cost == InstructionCost::getInvalid())
-      return;
-
-    PartialReductionChain Chain;
-    Chain.Reduction = Instr;
-    Chain.BinOp = BinOp;
-    Chain.ExtendA = ExtA;
-    Chain.ExtendB = ExtB;
-    Chain.Accumulator = Phi;
-
-    unsigned InputSizeBits = A->getType()->getScalarSizeInBits();
-    unsigned ResultSizeBits = Chain.Reduction->getType()->getScalarSizeInBits();
-    Chain.ScaleFactor = ResultSizeBits / InputSizeBits;
-
-    PartialReductionChains[Instr] = Chain;
-  }
-
 private:
   PartialReductionList PartialReductionChains;
 
@@ -7123,18 +7029,6 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   }
-
-  // Ignore any values that we know will be flattened
-  for (auto It : getPartialReductionChains()) {
-    PartialReductionChain Chain = It.second;
-    SmallVector<Value *> PartialReductionValues{Chain.Reduction, Chain.BinOp,
-                                                Chain.ExtendA, Chain.ExtendB,
-                                                Chain.Accumulator};
-    ValuesToIgnore.insert(PartialReductionValues.begin(),
-                          PartialReductionValues.end());
-    VecValuesToIgnore.insert(PartialReductionValues.begin(),
-                             PartialReductionValues.end());
-  }
 }
 
 void LoopVectorizationCostModel::collectInLoopReductions() {
@@ -7252,46 +7146,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
 
-  for (auto ReductionVar : Legal->getReductionVars()) {
-    auto *ReductionExitInstr = ReductionVar.second.getLoopExitInstr();
-    CM.addPartialReductionIfSupported(ReductionExitInstr, UserVF);
-  }
-
-  // Wider-than-legal vector types (coming from extends in partial reductions)
-  // should only be used by partial reductions so that they are lowered properly
-
-  // Build up a set of partial reduction bin ops for efficient use checking
-  SmallSet<Instruction *, 4> PartialReductionBinOps;
-  for (auto It : CM.getPartialReductionChains()) {
-    if (It.second.BinOp)
-      PartialReductionBinOps.insert(It.second.BinOp);
-  }
-
-  auto ExtendIsOnlyUsedByPartialReductions =
-      [PartialReductionBinOps](Instruction *Extend) {
-        for (auto *Use : Extend->users()) {
-          Instruction *UseInstr = dyn_cast<Instruction>(Use);
-          if (!PartialReductionBinOps.contains(UseInstr))
-            return false;
-        }
-        return true;
-      };
-
-  // Check if each use of a chain's two extends is a partial reduction
-  // and remove those that have non-partial reduction users
-  SmallSet<Instruction *, 4> PartialReductionsToRemove;
-  for (auto It : CM.getPartialReductionChains()) {
-    LoopVectorizationCostModel::PartialReductionChain Chain = It.second;
-    if (!ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) ||
-        !ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) {
-      PartialReductionsToRemove.insert(Chain.Reduction);
-      LLVM_DEBUG(dbgs() << "Removing the partial reduction for an instruction "
-                           "with an extend used by something other than a "
-                           "partial reduction "
-                        << *Chain.Reduction << "\n");
-    }
-  }
-
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
 
@@ -7316,23 +7170,6 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   if (CM.foldTailByMasking())
     Legal->prepareToFoldTailByMasking();
 
-  for (auto Pair : CM.getPartialReductionChains()) {
-    // TODO: Allow creating partial reductions when predicating. The select at
-    // the end of the loop chooses between the phi value and most recent partial
-    // reduction result, both of which have different VFs to the active lane
-    // mask.
-    Instruction *Instr = Pair.first;
-    if (CM.blockNeedsPredicationForAnyReason(Instr->getParent())) {
-      LLVM_DEBUG(dbgs() << "LV: Removing the partial reduction for an "
-                           "instruction in a predicated block: "
-                        << *Instr << "\n");
-      PartialReductionsToRemove.insert(Instr);
-    }
-  }
-
-  for (auto *Insn : PartialReductionsToRemove)
-    CM.removePartialReduction(Insn);
-
   ElementCount MaxUserVF =
       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
   if (UserVF) {
@@ -8870,16 +8707,74 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
   return Recipe;
 }
 
-unsigned getScaleFactorForReductionPhi(PHINode *Phi,
-                                       LoopVectorizationCostModel &CM) {
-  for (auto *User : Phi->users()) {
-    if (auto *I = dyn_cast<Instruction>(User)) {
-      if (auto Chain = CM.getInstructionsPartialReduction(I)) {
-        return Chain->ScaleFactor;
-      }
-    }
-  }
-  return 1;
+/// Examines reduction operations to see if the target can use a cheaper
+/// operation with a wider per-iteration input VF and narrower PHI VF.
+/// Returns the ratio between the two VFs (1 by default).
+static unsigned getReductionScaleFactor(PHINode *PHI,
+                                        const RecurrenceDescriptor &Rdx,
+                                        const TargetTransformInfo *TTI,
+                                        VFRange &Range,
+                                        LoopVectorizationCostModel &CM) {
+  // FIXME: Should we move this to VPRecipeBuilder and cache the values needed
+  //        for the TTI query?
+  unsigned DefaultScaleFactor = 1;
+
+  // TODO: Allow scaling reductions when predicating. The select at
+  // the end of the loop chooses between the phi value and most recent
+  // reduction result, both of which have different VFs to the active lane
+  // mask when scaling.
+  if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
+    return DefaultScaleFactor;
+
+  auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
+  if (!Update)
+    return DefaultScaleFactor;
+
+  Value *Op = Update->getOperand(0);
+  if (Op == PHI)
+    Op = Update->getOperand(1);
+
+  // Match dot product pattern
+  auto *BinOp = dyn_cast<BinaryOperator>(Op);
+  if (!BinOp || !BinOp->hasOneUse())
+    return DefaultScaleFactor;
+
+  auto IsSextOrZext = [](Instruction *I) {
+    return I && (I->getOpcode() == Instruction::ZExt ||
+                 I->getOpcode() == Instruction::SExt);
+  };
+
+  auto *ExtA = dyn_cast<Instruction>(BinOp->getOperand(0));
+  auto *ExtB = dyn_cast<Instruction>(BinOp->getOperand(1));
+  if (!IsSextOrZext(ExtA) || !IsSextOrZext(ExtB))
+    return DefaultScaleFactor;
+
+  Value *A = ExtA->getOperand(0);
+  Value *B = ExtB->getOperand(0);
+  // Check that the extends extend from the same type
+  if (A->getType() != B->getType())
+    return DefaultScaleFactor;
+
+  unsigned TargetScaleFactor =
+      PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
+          A->getType()->getPrimitiveSizeInBits());
+
+  TTI::PartialReductionExtendKind OpAExtend =
+      TargetTransformInfo::getPartialReductionExtendKind(ExtA);
+  TTI::PartialReductionExtendKind OpBExtend =
+      TargetTransformInfo::getPartialReductionExtendKind(ExtB);
+
+  if (LoopVectorizationPlanner::getDecisionAndClampRange(
+          [&](ElementCount VF) {
+            InstructionCost Cost = TTI->getPartialReductionCost(
+                Update->getOpcode(), A->getType(), PHI->getType(), VF,
+                OpAExtend, OpBExtend, std::make_optional(BinOp->getOpcode()));
+            return Cost.isValid();
+          },
+          Range))
+    return TargetScaleFactor;
+
+  return DefaultScaleFactor;
 }
 
 VPRecipeBase *
@@ -8908,7 +8803,11 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
 
       // If the PHI is used by a partial reduction, set the scale factor
-      unsigned ScaleFactor = getScaleFactorForReductionPhi(Phi, CM);
+      unsigned ScaleFactor =
+          getReductionScaleFactor(Phi, RdxDesc, TTI, Range, CM);
+      Instruction *ReductionInstr = RdxDesc.getLoopExitInstr();
+      if (ScaleFactor != 1)
+        Plan.addScaledReductionExitInstr(RdxDesc.getLoopExitInstr());
       PhiRecipe = new VPReductionPHIRecipe(
           Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
           CM.useOrderedReductions(RdxDesc), ScaleFactor);
@@ -8943,6 +8842,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
     return tryToWidenMemory(Instr, Operands, Range);
 
+  if (Plan.isScaledReductionExitInstr(Instr))
+    return tryToCreatePartialReduction(Instr, Operands);
+
   if (!shouldWiden(Instr, Range))
     return nullptr;
 
@@ -8965,7 +8867,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
 
 VPRecipeBase *
 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
-                                             unsigned ScaleFactor,
                                              ArrayRef<VPValue *> Operands) {
   assert(Operands.size() == 2 &&
          "Unexpected number of operands for partial reduction");
@@ -9281,7 +9182,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   bool HasNUW = Style == TailFoldingStyle::None;
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
 
-  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
+  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
+                                Builder);
 
   // ---------------------------------------------------------------------------
   // Pre-construction: record ingredients whose recipes we'll need to further
@@ -9373,10 +9275,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
 
       VPRecipeBase *Recipe = nullptr;
 
-      if (auto Chain = CM.getInstructionsPartialReduction(Instr))
-        Recipe = RecipeBuilder.tryToCreatePartialReduction(
-            Chain->Reduction, Chain->ScaleFactor, Operands);
-      else if (!Recipe)
+      if (!Recipe)
         Recipe =
             RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
       if (!Recipe)
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 136766eb455fde..b93b8df8653ef2 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -22,6 +22,7 @@ class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
 class TargetLibraryInfo;
 struct HistogramInfo;
+class TargetTransformInfo;
 
 /// Helper class to create VPRecipies from IR instructions.
 class VPRecipeBuilder {
@@ -34,6 +35,9 @@ class VPRecipeBuilder {
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
 
+  // Target Transform Info
+  const TargetTransformInfo *TTI;
+
   /// The legality analysis.
   LoopVectorizationLegality *Legal;
 
@@ -113,11 +117,12 @@ class VPRecipeBuilder {
 
 public:
   VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI,
+                  const TargetTransformInfo *TTI,
                   LoopVectorizationLegality *Legal,
                   LoopVectorizationCostModel &CM,
                   PredicatedScalarEvolution &PSE, VPBuilder &Builder)
-      : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM),
-        PSE(PSE), Builder(Builder) {}
+      : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal),
+        CM(CM), PSE(PSE), Builder(Builder) {}
 
   /// Create and return a widened recipe for \p I if one can be created within
   /// the given VF \p Range.
@@ -126,7 +131,6 @@ class VPRecipeBuilder {
                                        VFRange &Range, VPBasicBlock *VPBB);
 
   VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
-                                            unsigned ScaleFactor,
                                             ArrayRef<VPValue *> Operands);
 
   /// Set the recipe created for given ingredient.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index eb5e187a118272..6fe0f4006facac 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3803,6 +3803,10 @@ class VPlan {
   /// been modeled in VPlan directly.
   DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
 
+  /// Stores the set of reduction exit instructions that will be scaled to
+  /// a smaller VF in this plan via partial reductions.
+  SmallPtrSet<const Instruction *, 2> ScaledReductionExitInstrs;
+
 public:
   /// Construct a VPlan with original preheader \p Preheader, trip count \p TC,
   /// \p Entry to the plan and with \p ScalarHeader wrapping the original header
@@ -4022,6 +4026,14 @@ class VPlan {
   /// Clone the current VPlan, update all VPValues of the new VPlan and cloned
   /// recipes to refer to the clones, and return it.
   VPlan *duplicate();
+
+  void addScaledReductionExitInstr(const Instruction *ExitInst) {
+    ScaledReductionExitInstrs.insert(ExitInst);
+  }
+
+  bool isScaledReductionExitInstr(const Instruction *ExitInst) {
+    return ScaledReductionExitInstrs.contains(ExitInst);
+  }
 };
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b8efa10f2f9af7..dd719b3973b01e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3456,6 +3456,8 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
   printAsOperand(O, SlotTracker);
   O << " = phi ";
   printOperands(O, SlotTracker);
+  if (VFScaleFactor != 1)
+    O << " (VF scaled by 1/" << VFScaleFactor << ")";
 }
 #endif
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ea8845eaa75d4d..9ea5e1b5ed097c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/PatternMatch.h"
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 11e094db6294f6..54af9a7c84585a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -15,10 +15,12 @@
 
 #include "VPlan.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 
 namespace llvm {
 
 class InductionDescriptor;
+class RecurrenceDescriptor;
 class Instruction;
 class PHINode;
 class ScalarEvolution;
@@ -26,6 +28,7 @@ class PredicatedScalarEvolution;
 class TargetLibraryInfo;
 class VPBuilder;
 class VPRecipeBuilder;
+class TargetTransformInfo;
 
 struct VPlanTransforms {
   /// Replaces the VPInstructions in \p Plan with corresponding
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index bcac4d674123d8..9d7dda55a45c41 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVE1
-; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVED
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVE1
+; RUN: opt -passes=loop-vectorize -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVED
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"
@@ -8,91 +8,151 @@ target triple = "aarch64-none-unknown-elf"
 define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-LABEL: define i32 @dotp(
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-INTERLEAVE1-NEXT:  entry:
+; CHECK-INTERLEAVE1-NEXT:  iter.check:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY1:%.*]]
+; CHECK-INTERLEAVE1:       vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = mul <16 x i32> [[TMP23]], [[TMP19]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1:       middle.block:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE1]])
+; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVE1:       vec.epilog.iter.check:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVE1:       vec.epilog.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP28]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVE1:       vector.body:
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1:       vec.epilog.vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP13]])
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = mul <vscale x 4 x i32> [[TMP25]], [[TMP22]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = add <vscale x 4 x i32> [[TMP26]], [[VEC_PHI3]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-INTERLEAVE1:       middle.block:
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVE1:       vec.epilog.middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
 ; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[SCALAR_PH]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-INTERLEAVED-NEXT:  entry:
+; CHECK-INTERLEAVED-NEXT:  iter.check:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY1:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[TMP20]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP28]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul <16 x i32> [[TMP11]], [[TMP25]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = mul <16 x i32> [[TMP12]], [[TMP26]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP13]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP29]])
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32
+; CHECK-INTERLEAVED-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED:       middle.block:
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX1:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX1]])
+; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVED:       vec.epilog.iter.check:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVED-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP18]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVED:       vec.epilog.ph:
+; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP19]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP33:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVED:       vector.body:
-; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED:       vec.epilog.vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI7:%.*]] = phi <vscale x 4 x i32> [ [[TMP33]], [[VEC_EPILOG_PH]] ], [ [[BIN_RDX:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 16
-; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 16
-; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP15]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP18]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = mul <vscale x 16 x i32> [[TMP19]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = mul <vscale x 16 x i32> [[TMP20]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP21]])
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP22]])
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP15]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = mul <vscale x 4 x i32> [[TMP30]], [[TMP27]]
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX]] = add <vscale x 4 x i32> [[TMP31]], [[VEC_PHI7]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-INTERLEAVED:       middle.block:
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVED:       vec.epilog.middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[SCALAR_PH]]
 ;
 entry:
   br label %for.body
@@ -120,7 +180,12 @@ for.body:                                         ; preds = %for.body, %entry
 define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_different_types(
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVE1-NEXT:  entry:
+; CHECK-INTERLEAVE1-NEXT:  iter.check:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP72:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP73:%.*]] = mul i64 [[TMP72]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP73]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVE1:       vector.main.loop.iter.check:
 ; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -200,14 +265,89 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
-; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVE1:       vec.epilog.iter.check:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP74:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP74]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP75]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1:       vec.epilog.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP76:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP77:%.*]] = mul i64 [[TMP76]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP77]]
+; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP78:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP79:%.*]] = mul i64 [[TMP78]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
+; CHECK-INTERLEAVE1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-INTERLEAVE1-NEXT:    [[TMP80:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP81:%.*]] = add <vscale x 4 x i64> [[TMP80]], zeroinitializer
+; CHECK-INTERLEAVE1-NEXT:    [[TMP82:%.*]] = mul <vscale x 4 x i64> [[TMP81]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-INTERLEAVE1-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT]], [[TMP82]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP83:%.*]] = mul i64 1, [[TMP79]]
+; CHECK-INTERLEAVE1-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP83]], i64 0
+; CHECK-INTERLEAVE1-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-INTERLEAVE1-NEXT:    [[TMP84:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       vec.epilog.vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI5:%.*]] = phi <vscale x 4 x i32> [ [[TMP84]], [[SCALAR_PH]] ], [ [[TMP92:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP85:%.*]] = add i64 [[INDEX2]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP86:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP85]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP87:%.*]] = getelementptr i8, ptr [[TMP86]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP87]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP88:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD6]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP89:%.*]] = getelementptr i8, ptr [[B]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP89]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
+; CHECK-INTERLEAVE1-NEXT:    [[TMP90:%.*]] = zext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP91:%.*]] = mul <vscale x 4 x i32> [[TMP90]], [[TMP88]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP92]] = add <vscale x 4 x i32> [[TMP91]], [[VEC_PHI5]]
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT7]] = add nuw i64 [[INDEX2]], [[TMP79]]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT4]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP93:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP93]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVE1:       vec.epilog.middle.block:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP94:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP92]])
+; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-INTERLEAVE1:       vec.epilog.scalar.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX8:%.*]] = phi i32 [ [[TMP94]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       for.cond.cleanup.loopexit:
+; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ], [ [[TMP94]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP95:%.*]] = lshr i32 [[ADD_LCSSA]], 0
+; CHECK-INTERLEAVE1-NEXT:    ret void
+; CHECK-INTERLEAVE1:       for.body:
+; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX8]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define void @not_dotp_different_types(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVED-NEXT:  entry:
+; CHECK-INTERLEAVED-NEXT:  iter.check:
+; CHECK-INTERLEAVED-NEXT:    [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP38:%.*]] = mul i64 [[TMP37]], 4
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP38]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVED:       vector.main.loop.iter.check:
 ; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -254,54 +394,54 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP69:%.*]] = load i16, ptr [[TMP37]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP70:%.*]] = load i16, ptr [[TMP38]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP71:%.*]] = load i16, ptr [[TMP39]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP72:%.*]] = load i16, ptr [[TMP40]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP73:%.*]] = load i16, ptr [[TMP41]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP74:%.*]] = load i16, ptr [[TMP42]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP75:%.*]] = load i16, ptr [[TMP43]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP76:%.*]] = load i16, ptr [[TMP44]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP77:%.*]] = load i16, ptr [[TMP45]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP78:%.*]] = load i16, ptr [[TMP46]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP79:%.*]] = load i16, ptr [[TMP47]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP80:%.*]] = load i16, ptr [[TMP48]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP81:%.*]] = load i16, ptr [[TMP49]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP141:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP142:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP69:%.*]] = load i16, ptr [[TMP39]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP70:%.*]] = load i16, ptr [[TMP40]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP71:%.*]] = load i16, ptr [[TMP41]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP72:%.*]] = load i16, ptr [[TMP42]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP73:%.*]] = load i16, ptr [[TMP43]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP74:%.*]] = load i16, ptr [[TMP44]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP75:%.*]] = load i16, ptr [[TMP45]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP76:%.*]] = load i16, ptr [[TMP46]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP77:%.*]] = load i16, ptr [[TMP47]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP78:%.*]] = load i16, ptr [[TMP48]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP79:%.*]] = load i16, ptr [[TMP49]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP80:%.*]] = load i16, ptr [[TMP50]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP81:%.*]] = load i16, ptr [[TMP51]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP82:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP83:%.*]] = load i16, ptr [[TMP53]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP84:%.*]] = load i16, ptr [[TMP54]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
@@ -318,22 +458,22 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
 ; CHECK-INTERLEAVED-NEXT:    [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
 ; CHECK-INTERLEAVED-NEXT:    [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
-; CHECK-INTERLEAVED-NEXT:    [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP104:%.*]] = load i16, ptr [[TMP56]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP105:%.*]] = load i16, ptr [[TMP57]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP106:%.*]] = load i16, ptr [[TMP58]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP107:%.*]] = load i16, ptr [[TMP59]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP108:%.*]] = load i16, ptr [[TMP60]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP109:%.*]] = load i16, ptr [[TMP61]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP110:%.*]] = load i16, ptr [[TMP62]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP111:%.*]] = load i16, ptr [[TMP63]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP112:%.*]] = load i16, ptr [[TMP64]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP113:%.*]] = load i16, ptr [[TMP65]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP101:%.*]] = load i16, ptr [[TMP55]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP102:%.*]] = load i16, ptr [[TMP56]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP103:%.*]] = load i16, ptr [[TMP57]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP104:%.*]] = load i16, ptr [[TMP58]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP105:%.*]] = load i16, ptr [[TMP59]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP106:%.*]] = load i16, ptr [[TMP60]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP107:%.*]] = load i16, ptr [[TMP61]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP108:%.*]] = load i16, ptr [[TMP62]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP109:%.*]] = load i16, ptr [[TMP63]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP110:%.*]] = load i16, ptr [[TMP64]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP111:%.*]] = load i16, ptr [[TMP65]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP112:%.*]] = load i16, ptr [[TMP66]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP113:%.*]] = load i16, ptr [[TMP67]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP114:%.*]] = load i16, ptr [[TMP68]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP115:%.*]] = load i16, ptr [[TMP141]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP116:%.*]] = load i16, ptr [[TMP142]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
@@ -358,11 +498,11 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP140:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ;
 entry:
   br label %for.body
@@ -392,88 +532,88 @@ define void @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1)
-; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]]
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP16]] = mul <vscale x 16 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> [[VECTOR_RECUR]], <vscale x 16 x i32> [[TMP16]], i32 -1)
+; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = add <vscale x 16 x i32> [[TMP16]], [[TMP17]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_loop_carried(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1)
-; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = mul <vscale x 16 x i32> [[TMP22]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP25]] = mul <vscale x 16 x i32> [[TMP23]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> [[TMP24]], <vscale x 16 x i32> [[TMP25]], i32 -1)
+; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = add <vscale x 16 x i32> [[TMP25]], [[TMP26]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -503,81 +643,81 @@ define void @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP17]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = mul <vscale x 16 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP17]] = add <vscale x 16 x i32> [[TMP16]], [[TMP15]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_phi(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP20]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP23]] = add <vscale x 8 x i32> [[TMP22]], [[TMP21]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = mul <vscale x 16 x i32> [[TMP21]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP23]] = add <vscale x 16 x i32> [[TMP22]], [[TMP21]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -603,370 +743,165 @@ for.body:                                         ; preds = %for.body, %entry
 }
 
 define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %w, ptr %scales, ptr %u, ptr %v) #0 {
-; CHECK-INTERLEAVE1-LABEL: define void @dotp_unrolled(
-; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[W:%.*]], ptr [[SCALES:%.*]], ptr [[U:%.*]], ptr [[V:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVE1-NEXT:  entry:
-; CHECK-INTERLEAVE1-NEXT:    [[CMP154:%.*]] = icmp sgt i32 [[NUM_OUT]], 3
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP154]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END98:%.*]]
-; CHECK-INTERLEAVE1:       for.body.lr.ph:
-; CHECK-INTERLEAVE1-NEXT:    [[DIV:%.*]] = sdiv i32 [[NUM_OUT]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = shl nsw i32 [[DIV]], 2
-; CHECK-INTERLEAVE1-NEXT:    [[CMP11145:%.*]] = icmp sgt i32 [[NUM_IN]], 0
-; CHECK-INTERLEAVE1-NEXT:    [[IDXPROM44:%.*]] = sext i32 [[NUM_IN]] to i64
-; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP11145]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK-INTERLEAVE1:       for.body.preheader:
-; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_END98]]
-; CHECK-INTERLEAVE1:       for.body.us.preheader:
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
-; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY_US:%.*]]
-; CHECK-INTERLEAVE1:       for.body.us:
-; CHECK-INTERLEAVE1-NEXT:    [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT165:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[INDVARS_IV164]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
-; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[INDVARS_IV164]], 1
-; CHECK-INTERLEAVE1-NEXT:    [[ARRAYIDX3_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP2]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3_US]], align 8
-; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[INDVARS_IV164]], 2
-; CHECK-INTERLEAVE1-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP4]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX6_US]], align 8
-; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[INDVARS_IV164]], 3
-; CHECK-INTERLEAVE1-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP6]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX9_US]], align 8
-; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVE1:       vector.ph:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
-; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
-; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVE1:       vector.body:
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE181:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI172:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE179:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI173:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE177:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI174:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD175:%.*]] = load <vscale x 16 x i8>, ptr [[TMP19]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD175]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP17]]
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI174]], <vscale x 16 x i32> [[TMP21]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD176:%.*]] = load <vscale x 16 x i8>, ptr [[TMP23]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD176]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP20]]
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE177]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI173]], <vscale x 16 x i32> [[TMP25]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD178:%.*]] = load <vscale x 16 x i8>, ptr [[TMP27]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP28:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD178]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP28]], [[TMP20]]
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE179]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI172]], <vscale x 16 x i32> [[TMP29]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD180:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD180]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP32]], [[TMP20]]
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE181]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP33]])
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
-; CHECK-INTERLEAVE1:       middle.block:
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE177_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE177]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE179_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE179]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE181_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE181]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE181_LCSSA]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE179_LCSSA]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE177_LCSSA]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_LCSSA]])
-; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[SCALAR_PH]]
-; CHECK-INTERLEAVE1:       scalar.ph:
-; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX182:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX183:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX184:%.*]] = phi i32 [ [[TMP38]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP39:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[BC_RESUME_VAL]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP40]], 16
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP39]], [[TMP41]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label [[SCALAR_PH6:%.*]], label [[VECTOR_PH8:%.*]]
-; CHECK-INTERLEAVE1:       vector.ph8:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 16
-; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF9:%.*]] = urem i64 [[TMP39]], [[TMP43]]
-; CHECK-INTERLEAVE1-NEXT:    [[N_VEC10:%.*]] = sub i64 [[TMP39]], [[N_MOD_VF9]]
-; CHECK-INTERLEAVE1-NEXT:    [[IND_END:%.*]] = add i64 [[BC_RESUME_VAL]], [[N_VEC10]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 16
-; CHECK-INTERLEAVE1-NEXT:    [[TMP46:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[TMP47:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX182]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[TMP48:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX183]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[TMP49:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX184]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY12:%.*]]
-; CHECK-INTERLEAVE1:       vector.body12:
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX13:%.*]] = phi i64 [ 0, [[VECTOR_PH8]] ], [ [[INDEX_NEXT27:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI14:%.*]] = phi <vscale x 4 x i32> [ [[TMP46]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE26:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI15:%.*]] = phi <vscale x 4 x i32> [ [[TMP47]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE24:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI16:%.*]] = phi <vscale x 4 x i32> [ [[TMP48]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE22:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI17:%.*]] = phi <vscale x 4 x i32> [ [[TMP49]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE20:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[BC_RESUME_VAL]], [[INDEX13]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-INTERLEAVE1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP50]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP51]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD18:%.*]] = load <vscale x 16 x i8>, ptr [[TMP52]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP53:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD18]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP50]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD19:%.*]] = load <vscale x 16 x i8>, ptr [[TMP55]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP56:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD19]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP57:%.*]] = mul nsw <vscale x 16 x i32> [[TMP56]], [[TMP53]]
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE20]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI17]], <vscale x 16 x i32> [[TMP57]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP50]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD21:%.*]] = load <vscale x 16 x i8>, ptr [[TMP59]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP60:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD21]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP61:%.*]] = mul nsw <vscale x 16 x i32> [[TMP60]], [[TMP56]]
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE22]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI16]], <vscale x 16 x i32> [[TMP61]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP50]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[TMP62]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD23:%.*]] = load <vscale x 16 x i8>, ptr [[TMP63]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP64:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD23]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP65:%.*]] = mul nsw <vscale x 16 x i32> [[TMP64]], [[TMP56]]
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE24]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI15]], <vscale x 16 x i32> [[TMP65]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP50]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds i8, ptr [[TMP66]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD25:%.*]] = load <vscale x 16 x i8>, ptr [[TMP67]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP68:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD25]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP69:%.*]] = mul nsw <vscale x 16 x i32> [[TMP68]], [[TMP56]]
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE26]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI14]], <vscale x 16 x i32> [[TMP69]])
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT27]] = add nuw i64 [[INDEX13]], [[TMP45]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC10]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP70]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY12]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-INTERLEAVE1:       middle.block5:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE26]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE24]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE22]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP74:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE20]])
-; CHECK-INTERLEAVE1-NEXT:    [[CMP_N28:%.*]] = icmp eq i64 [[TMP39]], [[N_VEC10]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N28]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US_LOOPEXIT:%.*]], label [[SCALAR_PH6]]
-;
-; CHECK-INTERLEAVED-LABEL: define void @dotp_unrolled(
-; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[W:%.*]], ptr [[SCALES:%.*]], ptr [[U:%.*]], ptr [[V:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVED-NEXT:  entry:
-; CHECK-INTERLEAVED-NEXT:    [[CMP154:%.*]] = icmp sgt i32 [[NUM_OUT]], 3
-; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP154]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END98:%.*]]
-; CHECK-INTERLEAVED:       for.body.lr.ph:
-; CHECK-INTERLEAVED-NEXT:    [[DIV:%.*]] = sdiv i32 [[NUM_OUT]], 4
-; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = shl nsw i32 [[DIV]], 2
-; CHECK-INTERLEAVED-NEXT:    [[CMP11145:%.*]] = icmp sgt i32 [[NUM_IN]], 0
-; CHECK-INTERLEAVED-NEXT:    [[IDXPROM44:%.*]] = sext i32 [[NUM_IN]] to i64
-; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
-; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP11145]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK-INTERLEAVED:       for.body.preheader:
-; CHECK-INTERLEAVED-NEXT:    br label [[FOR_END98]]
-; CHECK-INTERLEAVED:       for.body.us.preheader:
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
-; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY_US:%.*]]
-; CHECK-INTERLEAVED:       for.body.us:
-; CHECK-INTERLEAVED-NEXT:    [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT165:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
-; CHECK-INTERLEAVED-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[INDVARS_IV164]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
-; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[INDVARS_IV164]], 1
-; CHECK-INTERLEAVED-NEXT:    [[ARRAYIDX3_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP2]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3_US]], align 8
-; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[INDVARS_IV164]], 2
-; CHECK-INTERLEAVED-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP4]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX6_US]], align 8
-; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[INDVARS_IV164]], 3
-; CHECK-INTERLEAVED-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP6]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX9_US]], align 8
-; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
-; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP9]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVED:       vector.ph:
-; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
-; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
-; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVED:       vector.body:
-; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE181:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI172:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE179:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI173:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE177:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI174:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD175:%.*]] = load <vscale x 16 x i8>, ptr [[TMP19]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD175]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP17]]
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI174]], <vscale x 16 x i32> [[TMP21]])
-; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD176:%.*]] = load <vscale x 16 x i8>, ptr [[TMP23]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD176]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP20]]
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE177]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI173]], <vscale x 16 x i32> [[TMP25]])
-; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD178:%.*]] = load <vscale x 16 x i8>, ptr [[TMP27]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD178]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP28]], [[TMP20]]
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE179]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI172]], <vscale x 16 x i32> [[TMP29]])
-; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD180:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD180]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP32]], [[TMP20]]
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE181]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP33]])
-; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
-; CHECK-INTERLEAVED:       middle.block:
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE177_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE177]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE179_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE179]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE181_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE181]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE181_LCSSA]])
-; CHECK-INTERLEAVED-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE179_LCSSA]])
-; CHECK-INTERLEAVED-NEXT:    [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE177_LCSSA]])
-; CHECK-INTERLEAVED-NEXT:    [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_LCSSA]])
-; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[SCALAR_PH]]
-; CHECK-INTERLEAVED:       scalar.ph:
-; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX182:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX183:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX184:%.*]] = phi i32 [ [[TMP38]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP39:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[BC_RESUME_VAL]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP40]], 32
-; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP39]], [[TMP41]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label [[SCALAR_PH6:%.*]], label [[VECTOR_PH8:%.*]]
-; CHECK-INTERLEAVED:       vector.ph8:
-; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 32
-; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF9:%.*]] = urem i64 [[TMP39]], [[TMP43]]
-; CHECK-INTERLEAVED-NEXT:    [[N_VEC10:%.*]] = sub i64 [[TMP39]], [[N_MOD_VF9]]
-; CHECK-INTERLEAVED-NEXT:    [[IND_END:%.*]] = add i64 [[BC_RESUME_VAL]], [[N_VEC10]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP45:%.*]] = mul i64 [[TMP44]], 32
-; CHECK-INTERLEAVED-NEXT:    [[TMP46:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP47:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX182]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP48:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX183]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP49:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX184]], i32 0
-; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY12:%.*]]
-; CHECK-INTERLEAVED:       vector.body12:
-; CHECK-INTERLEAVED-NEXT:    [[INDEX13:%.*]] = phi i64 [ 0, [[VECTOR_PH8]] ], [ [[INDEX_NEXT40:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI14:%.*]] = phi <vscale x 4 x i32> [ [[TMP46]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE38:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI15:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE39:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI16:%.*]] = phi <vscale x 4 x i32> [ [[TMP47]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE34:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI17:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE35:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI18:%.*]] = phi <vscale x 4 x i32> [ [[TMP48]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE30:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI19:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE31:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI20:%.*]] = phi <vscale x 4 x i32> [ [[TMP49]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE26:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI21:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE27:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[BC_RESUME_VAL]], [[INDEX13]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP50]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP51]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP53:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP54:%.*]] = mul i64 [[TMP53]], 16
-; CHECK-INTERLEAVED-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP51]], i64 [[TMP54]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD22:%.*]] = load <vscale x 16 x i8>, ptr [[TMP52]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD23:%.*]] = load <vscale x 16 x i8>, ptr [[TMP55]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP56:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD22]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP57:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD23]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP50]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP60:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP61:%.*]] = mul i64 [[TMP60]], 16
-; CHECK-INTERLEAVED-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i64 [[TMP61]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD24:%.*]] = load <vscale x 16 x i8>, ptr [[TMP59]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD25:%.*]] = load <vscale x 16 x i8>, ptr [[TMP62]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP63:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD24]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP64:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD25]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP65:%.*]] = mul nsw <vscale x 16 x i32> [[TMP63]], [[TMP56]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP66:%.*]] = mul nsw <vscale x 16 x i32> [[TMP64]], [[TMP57]]
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE26]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI20]], <vscale x 16 x i32> [[TMP65]])
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE27]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI21]], <vscale x 16 x i32> [[TMP66]])
-; CHECK-INTERLEAVED-NEXT:    [[TMP67:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP50]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[TMP67]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP70:%.*]] = mul i64 [[TMP69]], 16
-; CHECK-INTERLEAVED-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i8, ptr [[TMP67]], i64 [[TMP70]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD28:%.*]] = load <vscale x 16 x i8>, ptr [[TMP68]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD29:%.*]] = load <vscale x 16 x i8>, ptr [[TMP71]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP72:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD28]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP73:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD29]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP74:%.*]] = mul nsw <vscale x 16 x i32> [[TMP72]], [[TMP63]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP75:%.*]] = mul nsw <vscale x 16 x i32> [[TMP73]], [[TMP64]]
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE30]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI18]], <vscale x 16 x i32> [[TMP74]])
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE31]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI19]], <vscale x 16 x i32> [[TMP75]])
-; CHECK-INTERLEAVED-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP50]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP77:%.*]] = getelementptr inbounds i8, ptr [[TMP76]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP78:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP79:%.*]] = mul i64 [[TMP78]], 16
-; CHECK-INTERLEAVED-NEXT:    [[TMP80:%.*]] = getelementptr inbounds i8, ptr [[TMP76]], i64 [[TMP79]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD32:%.*]] = load <vscale x 16 x i8>, ptr [[TMP77]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD33:%.*]] = load <vscale x 16 x i8>, ptr [[TMP80]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP81:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD32]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP82:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD33]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP83:%.*]] = mul nsw <vscale x 16 x i32> [[TMP81]], [[TMP63]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP84:%.*]] = mul nsw <vscale x 16 x i32> [[TMP82]], [[TMP64]]
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE34]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI16]], <vscale x 16 x i32> [[TMP83]])
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE35]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI17]], <vscale x 16 x i32> [[TMP84]])
-; CHECK-INTERLEAVED-NEXT:    [[TMP85:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP50]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP86:%.*]] = getelementptr inbounds i8, ptr [[TMP85]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP87:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP88:%.*]] = mul i64 [[TMP87]], 16
-; CHECK-INTERLEAVED-NEXT:    [[TMP89:%.*]] = getelementptr inbounds i8, ptr [[TMP85]], i64 [[TMP88]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD36:%.*]] = load <vscale x 16 x i8>, ptr [[TMP86]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD37:%.*]] = load <vscale x 16 x i8>, ptr [[TMP89]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP90:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD36]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP91:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD37]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP92:%.*]] = mul nsw <vscale x 16 x i32> [[TMP90]], [[TMP63]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP93:%.*]] = mul nsw <vscale x 16 x i32> [[TMP91]], [[TMP64]]
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE38]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI14]], <vscale x 16 x i32> [[TMP92]])
-; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE39]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI15]], <vscale x 16 x i32> [[TMP93]])
-; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT40]] = add nuw i64 [[INDEX13]], [[TMP45]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP94:%.*]] = icmp eq i64 [[INDEX_NEXT40]], [[N_VEC10]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP94]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY12]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-INTERLEAVED:       middle.block5:
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE39]], [[PARTIAL_REDUCE38]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP95:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX41:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE35]], [[PARTIAL_REDUCE34]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP96:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX41]])
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX42:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE31]], [[PARTIAL_REDUCE30]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP97:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX42]])
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX43:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE27]], [[PARTIAL_REDUCE26]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP98:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX43]])
-; CHECK-INTERLEAVED-NEXT:    [[CMP_N44:%.*]] = icmp eq i64 [[TMP39]], [[N_VEC10]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N44]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US_LOOPEXIT:%.*]], label [[SCALAR_PH6]]
+; CHECK-LABEL: define void @dotp_unrolled(
+; CHECK-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[W:%.*]], ptr [[SCALES:%.*]], ptr [[U:%.*]], ptr [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP154:%.*]] = icmp sgt i32 [[NUM_OUT]], 3
+; CHECK-NEXT:    br i1 [[CMP154]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END98:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[NUM_OUT]], 4
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[DIV]], 2
+; CHECK-NEXT:    [[CMP11145:%.*]] = icmp sgt i32 [[NUM_IN]], 0
+; CHECK-NEXT:    [[IDXPROM44:%.*]] = sext i32 [[NUM_IN]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
+; CHECK-NEXT:    br i1 [[CMP11145]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_END98]]
+; CHECK:       for.body.us.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
+; CHECK:       for.body.us:
+; CHECK-NEXT:    [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT165:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[INDVARS_IV164]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[INDVARS_IV164]], 1
+; CHECK-NEXT:    [[ARRAYIDX3_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3_US]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[INDVARS_IV164]], 2
+; CHECK-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX6_US]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[INDVARS_IV164]], 3
+; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX9_US]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[ITER_CHECK:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE181:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI172:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE179:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI173:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE177:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI174:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD175:%.*]] = load <vscale x 16 x i8>, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD175]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP17]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI174]], <vscale x 16 x i32> [[TMP21]])
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD176:%.*]] = load <vscale x 16 x i8>, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD176]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP20]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE177]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI173]], <vscale x 16 x i32> [[TMP25]])
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD178:%.*]] = load <vscale x 16 x i8>, ptr [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD178]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP28]], [[TMP20]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE179]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI172]], <vscale x 16 x i32> [[TMP29]])
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD180:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD180]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP32]], [[TMP20]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE181]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP33]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[PARTIAL_REDUCE_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[PARTIAL_REDUCE177_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE177]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[PARTIAL_REDUCE179_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE179]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[PARTIAL_REDUCE181_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE181]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE181_LCSSA]])
+; CHECK-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE179_LCSSA]])
+; CHECK-NEXT:    [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE177_LCSSA]])
+; CHECK-NEXT:    [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_LCSSA]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[ITER_CHECK]]
+; CHECK:       iter.check:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX182:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX183:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX184:%.*]] = phi i32 [ [[TMP38]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-NEXT:    [[TMP39:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[BC_RESUME_VAL]]
+; CHECK-NEXT:    [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP40]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP39]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK9:%.*]] = icmp ult i64 [[TMP39]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK9]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH10:%.*]]
+; CHECK:       vector.ph10:
+; CHECK-NEXT:    [[N_MOD_VF11:%.*]] = urem i64 [[TMP39]], 16
+; CHECK-NEXT:    [[N_VEC12:%.*]] = sub i64 [[TMP39]], [[N_MOD_VF11]]
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX182]], i32 0
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX183]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX184]], i32 0
+; CHECK-NEXT:    br label [[VECTOR_BODY13:%.*]]
+; CHECK:       vector.body13:
+; CHECK-NEXT:    [[INDEX14:%.*]] = phi i64 [ 0, [[VECTOR_PH10]] ], [ [[INDEX_NEXT28:%.*]], [[VECTOR_BODY13]] ]
+; CHECK-NEXT:    [[VEC_PHI15:%.*]] = phi <4 x i32> [ [[TMP42]], [[VECTOR_PH10]] ], [ [[PARTIAL_REDUCE27:%.*]], [[VECTOR_BODY13]] ]
+; CHECK-NEXT:    [[VEC_PHI16:%.*]] = phi <4 x i32> [ [[TMP43]], [[VECTOR_PH10]] ], [ [[PARTIAL_REDUCE25:%.*]], [[VECTOR_BODY13]] ]
+; CHECK-NEXT:    [[VEC_PHI17:%.*]] = phi <4 x i32> [ [[TMP44]], [[VECTOR_PH10]] ], [ [[PARTIAL_REDUCE23:%.*]], [[VECTOR_BODY13]] ]
+; CHECK-NEXT:    [[VEC_PHI18:%.*]] = phi <4 x i32> [ [[TMP45]], [[VECTOR_PH10]] ], [ [[PARTIAL_REDUCE21:%.*]], [[VECTOR_BODY13]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[BC_RESUME_VAL]], [[INDEX14]]
+; CHECK-NEXT:    [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP46]]
+; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[TMP47]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP48]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP46]]
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP50]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP52:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
+; CHECK-NEXT:    [[TMP53:%.*]] = mul nsw <16 x i32> [[TMP52]], [[TMP49]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI18]], <16 x i32> [[TMP53]])
+; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP46]]
+; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = load <16 x i8>, ptr [[TMP55]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD22]] to <16 x i32>
+; CHECK-NEXT:    [[TMP57:%.*]] = mul nsw <16 x i32> [[TMP56]], [[TMP52]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE23]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI17]], <16 x i32> [[TMP57]])
+; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP46]]
+; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP59]], align 1
+; CHECK-NEXT:    [[TMP60:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = mul nsw <16 x i32> [[TMP60]], [[TMP52]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE25]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI16]], <16 x i32> [[TMP61]])
+; CHECK-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP46]]
+; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[TMP62]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP63]], align 1
+; CHECK-NEXT:    [[TMP64:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
+; CHECK-NEXT:    [[TMP65:%.*]] = mul nsw <16 x i32> [[TMP64]], [[TMP52]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE27]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI15]], <16 x i32> [[TMP65]])
+; CHECK-NEXT:    [[INDEX_NEXT28]] = add nuw i64 [[INDEX14]], 16
+; CHECK-NEXT:    [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC12]]
+; CHECK-NEXT:    br i1 [[TMP66]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY13]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       middle.block5:
+; CHECK-NEXT:    [[TMP67:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE27]])
+; CHECK-NEXT:    [[TMP68:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE25]])
+; CHECK-NEXT:    [[TMP69:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE23]])
+; CHECK-NEXT:    [[TMP70:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
+; CHECK-NEXT:    [[CMP_N29:%.*]] = icmp eq i64 [[TMP39]], [[N_VEC12]]
+; CHECK-NEXT:    br i1 [[CMP_N29]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ;
 entry:
   %cmp154 = icmp sgt i32 %num_out, 3
@@ -1161,7 +1096,7 @@ for.end98:                                        ; preds = %for.end98.loopexit1
 
 define i32 @not_dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: define i32 @not_dotp_predicated(
-; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[N]], 16
 ; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[REM]], 0
@@ -1170,39 +1105,34 @@ define i32 @not_dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REM]] to i64
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 15
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP7]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; CHECK-NEXT:    [[TMP8:%.*]] = sext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i32>
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP10]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 8 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <vscale x 8 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 8 x i32> [[TMP12]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP14]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i32> [[TMP13]], <vscale x 8 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP15:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 8 x i1> [[TMP15]], i32 0
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP10]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i32> [[TMP17]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP15]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP8]], <16 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP18:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP18]], i32 0
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP14]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
@@ -1248,39 +1178,34 @@ define i32 @not_dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 15
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
-; CHECK-NEXT:    [[TMP8:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP10]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <vscale x 16 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 16 x i32> [[TMP12]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP14]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP13]], <vscale x 16 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP15:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[TMP15]], i32 0
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP10]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i32> [[TMP17]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP15]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP8]], <16 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP18:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP18]], i32 0
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 7fcb33b8584f33..48eaa2b280624a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -8,7 +8,7 @@ target triple = "aarch64-none-unknown-elf"
 
 define void @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-LABEL: Checking a loop in 'print_partial_reduction'
-; CHECK:      VPlan 'Initial VPlan for VF={2,4,8,16},UF>=1' {
+; CHECK:      VPlan 'Initial VPlan for VF={2,4},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: Live-in ir<0> = original trip-count
@@ -30,7 +30,51 @@ define void @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT:   WIDEN ir<%2> = load vp<%5>
 ; CHECK-NEXT:   WIDEN-CAST ir<%conv3> = zext ir<%2> to i32
 ; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%conv3>, ir<%conv>
-; CHECK-NEXT:   PARTIAL-REDUCE ir<%add> = add ir<%mul>, ir<%acc.010>
+; CHECK-NEXT:   WIDEN ir<%add> = add ir<%mul>, ir<[[ACC]]>
+; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<%8> = compute-reduction-result ir<[[ACC]]>, ir<%add>
+; CHECK-NEXT:   EMIT vp<%9> = extract-from-end vp<%8>, ir<1>
+; CHECK-NEXT:   EMIT vp<%10> = icmp eq ir<0>, vp<%1>
+; CHECK-NEXT:   EMIT branch-on-cond vp<%10>
+; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%9>)
+; CHECK-NEXT:   IR   %0 = lshr i32 %add.lcssa, 0
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK:      VPlan 'Initial VPlan for VF={8,16},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<0> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT:   EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
+; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<%add> (VF scaled by 1/4)
+; CHECK-NEXT:   vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:   CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<[[STEPS]]>
+; CHECK-NEXT:   vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:   WIDEN ir<%1> = load vp<%4>
+; CHECK-NEXT:   WIDEN-CAST ir<%conv> = zext ir<%1> to i32
+; CHECK-NEXT:   CLONE ir<%arrayidx2> = getelementptr ir<%b>, vp<[[STEPS]]>
+; CHECK-NEXT:   vp<%5> = vector-pointer ir<%arrayidx2>
+; CHECK-NEXT:   WIDEN ir<%2> = load vp<%5>
+; CHECK-NEXT:   WIDEN-CAST ir<%conv3> = zext ir<%2> to i32
+; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%conv3>, ir<%conv>
+; CHECK-NEXT:   PARTIAL-REDUCE ir<%add> = add ir<%mul>, ir<[[ACC]]>
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors

>From e573e856e2b0ca2af5f0eae6b8d56efdcdee7b7b Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Fri, 11 Oct 2024 15:57:19 +0100
Subject: [PATCH 03/56] Get recipe-based version working with multiple-use
 extends

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 80 +++++++++++--------
 llvm/lib/Transforms/Vectorize/VPlan.h         | 70 ++++++++++++++--
 2 files changed, 110 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d992927b5a9e15..ffb364af03057f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1574,8 +1574,6 @@ class LoopVectorizationCostModel {
   }
 
 private:
-  PartialReductionList PartialReductionChains;
-
   unsigned NumPredStores = 0;
 
   /// \return An upper bound for the vectorization factors for both
@@ -4700,11 +4698,6 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
         return false;
   }
 
-  // Prevent epilogue vectorization if a partial reduction is involved
-  // TODO Is there a cleaner way to check this?
-  if (CM.getPartialReductionChains().size() > 0)
-    return false;
-
   // Epilogue vectorization code has not been auditted to ensure it handles
   // non-latch exits properly.  It may be fine, but it needs auditted and
   // tested.
@@ -7145,7 +7138,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
 
 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
-
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
 
@@ -8709,26 +8701,24 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
 
 /// Examines reduction operations to see if the target can use a cheaper
 /// operation with a wider per-iteration input VF and narrower PHI VF.
-/// Returns the ratio between the two VFs (1 by default).
-static unsigned getReductionScaleFactor(PHINode *PHI,
-                                        const RecurrenceDescriptor &Rdx,
-                                        const TargetTransformInfo *TTI,
-                                        VFRange &Range,
-                                        LoopVectorizationCostModel &CM) {
+/// Returns a struct containing the ratio between the two VFs and other cached
+/// information, or null if no scalable reduction was found.
+static std::optional<PartialReductionChain>
+getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx,
+                   const TargetTransformInfo *TTI, VFRange &Range,
+                   LoopVectorizationCostModel &CM) {
   // FIXME: Should we move this to VPRecipeBuilder and cache the values needed
   //        for the TTI query?
-  unsigned DefaultScaleFactor = 1;
-
   // TODO: Allow scaling reductions when predicating. The select at
   // the end of the loop chooses between the phi value and most recent
   // reduction result, both of which have different VFs to the active lane
   // mask when scaling.
   if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
-    return DefaultScaleFactor;
+    return std::nullopt;
 
   auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
   if (!Update)
-    return DefaultScaleFactor;
+    return std::nullopt;
 
   Value *Op = Update->getOperand(0);
   if (Op == PHI)
@@ -8737,7 +8727,7 @@ static unsigned getReductionScaleFactor(PHINode *PHI,
   // Match dot product pattern
   auto *BinOp = dyn_cast<BinaryOperator>(Op);
   if (!BinOp || !BinOp->hasOneUse())
-    return DefaultScaleFactor;
+    return std::nullopt;
 
   auto IsSextOrZext = [](Instruction *I) {
     return I && (I->getOpcode() == Instruction::ZExt ||
@@ -8747,13 +8737,13 @@ static unsigned getReductionScaleFactor(PHINode *PHI,
   auto *ExtA = dyn_cast<Instruction>(BinOp->getOperand(0));
   auto *ExtB = dyn_cast<Instruction>(BinOp->getOperand(1));
   if (!IsSextOrZext(ExtA) || !IsSextOrZext(ExtB))
-    return DefaultScaleFactor;
+    return std::nullopt;
 
   Value *A = ExtA->getOperand(0);
   Value *B = ExtB->getOperand(0);
   // Check that the extends extend from the same type
   if (A->getType() != B->getType())
-    return DefaultScaleFactor;
+    return std::nullopt;
 
   unsigned TargetScaleFactor =
       PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
@@ -8764,6 +8754,13 @@ static unsigned getReductionScaleFactor(PHINode *PHI,
   TTI::PartialReductionExtendKind OpBExtend =
       TargetTransformInfo::getPartialReductionExtendKind(ExtB);
 
+  PartialReductionChain Chain;
+  Chain.Reduction = Rdx.getLoopExitInstr();
+  Chain.ExtendA = ExtA;
+  Chain.ExtendB = ExtB;
+  Chain.ScaleFactor = TargetScaleFactor;
+  Chain.BinOp = dyn_cast<Instruction>(Op);
+
   if (LoopVectorizationPlanner::getDecisionAndClampRange(
           [&](ElementCount VF) {
             InstructionCost Cost = TTI->getPartialReductionCost(
@@ -8772,9 +8769,9 @@ static unsigned getReductionScaleFactor(PHINode *PHI,
             return Cost.isValid();
           },
           Range))
-    return TargetScaleFactor;
+    return Chain;
 
-  return DefaultScaleFactor;
+  return std::nullopt;
 }
 
 VPRecipeBase *
@@ -8803,11 +8800,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
 
       // If the PHI is used by a partial reduction, set the scale factor
-      unsigned ScaleFactor =
-          getReductionScaleFactor(Phi, RdxDesc, TTI, Range, CM);
-      Instruction *ReductionInstr = RdxDesc.getLoopExitInstr();
-      if (ScaleFactor != 1)
-        Plan.addScaledReductionExitInstr(RdxDesc.getLoopExitInstr());
+      std::optional<PartialReductionChain> Chain =
+          getScaledReduction(Phi, RdxDesc, TTI, Range, CM);
+      unsigned ScaleFactor = Chain ? Chain->ScaleFactor : 1;
       PhiRecipe = new VPReductionPHIRecipe(
           Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
           CM.useOrderedReductions(RdxDesc), ScaleFactor);
@@ -8842,7 +8837,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
     return tryToWidenMemory(Instr, Operands, Range);
 
-  if (Plan.isScaledReductionExitInstr(Instr))
+  if (Plan.getScaledReductionForInstr(Instr))
     return tryToCreatePartialReduction(Instr, Operands);
 
   if (!shouldWiden(Instr, Range))
@@ -9229,8 +9224,28 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
         return Legal->blockNeedsPredication(BB) || NeedsBlends;
       });
+
+  // Cache the partial reductions up front so we can remove the invalid ones
+  // before creating the recipes
+  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+    for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
+      Instruction *Instr = &I;
+      auto *Phi = dyn_cast<PHINode>(Instr);
+      if (!Phi || !Legal->isReductionVariable(Phi))
+        continue;
+      const RecurrenceDescriptor &RdxDesc =
+          Legal->getReductionVars().find(Phi)->second;
+      std::optional<PartialReductionChain> Chain =
+          getScaledReduction(Phi, RdxDesc, &TTI, Range, CM);
+      if (Chain.has_value())
+        Plan->addScaledReductionExitInstr(*Chain);
+    }
+  }
+  Plan->removeInvalidScaledReductionExitInstrs();
+
   auto *MiddleVPBB = Plan->getMiddleBlock();
   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
+
   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
     // Relevant instructions from basic block BB will be grouped into VPRecipe
     // ingredients and fill a new VPBasicBlock.
@@ -9273,11 +9288,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         continue;
       }
 
-      VPRecipeBase *Recipe = nullptr;
-
-      if (!Recipe)
-        Recipe =
-            RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
+      VPRecipeBase *Recipe =
+          RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
       if (!Recipe)
         Recipe = RecipeBuilder.handleReplication(Instr, Range);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 6fe0f4006facac..5352b46992d619 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2408,7 +2408,6 @@ class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
   VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
   /// Generate the reduction in the loop
   void execute(VPTransformState &State) override;
-  unsigned getOpcode() { return Opcode; }
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -3742,6 +3741,24 @@ class VPRegionBlock : public VPBlockBase {
   VPRegionBlock *clone() override;
 };
 
+/// A chain of instructions that form a partial reduction.
+/// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))),
+/// accumulator)
+struct PartialReductionChain {
+  /// The top-level binary operation that forms the reduction to a scalar
+  /// after the loop body
+  Instruction *Reduction;
+  /// The extension of each of the inner binary operation's operands
+  Instruction *ExtendA;
+  Instruction *ExtendB;
+
+  Instruction *BinOp;
+
+  /// The scaling factor between the size of the reduction type and the
+  /// (possibly extended) inputs
+  unsigned ScaleFactor;
+};
+
 /// VPlan models a candidate for vectorization, encoding various decisions take
 /// to produce efficient output IR, including which branches, basic-blocks and
 /// output IR instructions to generate, and their cost. VPlan holds a
@@ -3805,7 +3822,8 @@ class VPlan {
 
   /// Stores the set of reduction exit instructions that will be scaled to
   /// a smaller VF in this plan via partial reductions.
-  SmallPtrSet<const Instruction *, 2> ScaledReductionExitInstrs;
+  DenseMap<const Instruction *, PartialReductionChain>
+      ScaledReductionExitInstrs;
 
 public:
   /// Construct a VPlan with original preheader \p Preheader, trip count \p TC,
@@ -4027,12 +4045,52 @@ class VPlan {
   /// recipes to refer to the clones, and return it.
   VPlan *duplicate();
 
-  void addScaledReductionExitInstr(const Instruction *ExitInst) {
-    ScaledReductionExitInstrs.insert(ExitInst);
+  void addScaledReductionExitInstr(PartialReductionChain Chain) {
+    ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Chain));
   }
 
-  bool isScaledReductionExitInstr(const Instruction *ExitInst) {
-    return ScaledReductionExitInstrs.contains(ExitInst);
+  std::optional<PartialReductionChain>
+  getScaledReductionForInstr(const Instruction *ExitInst) {
+    auto It = ScaledReductionExitInstrs.find(ExitInst);
+    return It == ScaledReductionExitInstrs.end()
+               ? std::nullopt
+               : std::make_optional(It->second);
+  }
+
+  void removeInvalidScaledReductionExitInstrs() {
+    // A partial reduction is invalid if any of its extends are used by
+    // something that isn't another partial reduction. This is because the
+    // extends are intended to be lowered along with the reduction itself.
+
+    // Build up a set of partial reduction bin ops for efficient use checking
+    SmallSet<Instruction *, 4> PartialReductionBinOps;
+    for (auto It : ScaledReductionExitInstrs) {
+      if (It.second.BinOp)
+        PartialReductionBinOps.insert(It.second.BinOp);
+    }
+
+    auto ExtendIsOnlyUsedByPartialReductions =
+        [PartialReductionBinOps](Instruction *Extend) {
+          for (auto *Use : Extend->users()) {
+            Instruction *UseInstr = dyn_cast<Instruction>(Use);
+            if (!PartialReductionBinOps.contains(UseInstr))
+              return false;
+          }
+          return true;
+        };
+
+    // Check if each use of a chain's two extends is a partial reduction
+    // and remove those that have non-partial reduction users
+    SmallSet<Instruction *, 4> PartialReductionsToRemove;
+    for (auto It : ScaledReductionExitInstrs) {
+      PartialReductionChain Chain = It.second;
+      if (!ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) ||
+          !ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
+        PartialReductionsToRemove.insert(Chain.Reduction);
+    }
+
+    for (auto *Instr : PartialReductionsToRemove)
+      ScaledReductionExitInstrs.erase(Instr);
   }
 };
 

>From ee88fc80774f4c0eb6c722a7b5d1f5842bd04483 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 15 Oct 2024 11:41:05 +0100
Subject: [PATCH 04/56] Use cached partial reduction when getting the scale
 factor for phi

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ffb364af03057f..833a542211228b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8801,7 +8801,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
 
       // If the PHI is used by a partial reduction, set the scale factor
       std::optional<PartialReductionChain> Chain =
-          getScaledReduction(Phi, RdxDesc, TTI, Range, CM);
+          Plan.getScaledReductionForInstr(RdxDesc.getLoopExitInstr());
       unsigned ScaleFactor = Chain ? Chain->ScaleFactor : 1;
       PhiRecipe = new VPReductionPHIRecipe(
           Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),

>From fa67602f2f580903233376163f2a664249d506ab Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 16 Oct 2024 16:16:34 +0100
Subject: [PATCH 05/56] Fix partial-reduce-sdot.ll test

---
 .../CodeGen/AArch64/partial-reduce-sdot.ll    | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
index fc6e3239a1b43c..3c82a4868f659d 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
@@ -9,41 +9,41 @@ define void @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP19:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP27:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
-; CHECK-NEXT:    [[TMP29:%.*]] = mul <vscale x 16 x i32> [[TMP27]], [[TMP19]]
-; CHECK-NEXT:    [[TMP14]] = add <vscale x 16 x i32> [[TMP29]], [[VEC_PHI]]
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14]] = add <vscale x 4 x i32> [[TMP15]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]

>From 973f076a2b319cd9266666eedd2baf3c21fd4eeb Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 16 Oct 2024 14:39:15 +0100
Subject: [PATCH 06/56] Support predicated loops

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  2 --
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  9 +++++++
 .../AArch64/partial-reduce-dot-product.ll     | 26 ++++++++++---------
 3 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 833a542211228b..18c6f823722bdb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8713,8 +8713,6 @@ getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx,
   // the end of the loop chooses between the phi value and most recent
   // reduction result, both of which have different VFs to the active lane
   // mask when scaling.
-  if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
-    return std::nullopt;
 
   auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
   if (!Update)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index dd719b3973b01e..82fdf3c1c678b8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -451,6 +451,15 @@ Value *VPInstruction::generate(VPTransformState &State) {
     Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
     Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
     Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
+    auto *CondVec = dyn_cast<VectorType>(Cond->getType());
+    auto *Op1Vec = dyn_cast<VectorType>(Op1->getType());
+    // If the condition and values differ in their element count (as can happen
+    // in the case of predicated partial reductions) then reduce the condition
+    // to a single value and select based on that instead.
+    if (CondVec && Op1Vec &&
+        CondVec->getElementCount() != Op1Vec->getElementCount() &&
+        CondVec->getElementType() == IntegerType::getInt1Ty(Cond->getContext()))
+      Cond = Builder.CreateOrReduce(Cond);
     return Builder.CreateSelect(Cond, Op1, Op2, Name);
   }
   case VPInstruction::ActiveLaneMask: {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 9d7dda55a45c41..b3a090e9534bee 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1094,8 +1094,8 @@ for.end98:                                        ; preds = %for.end98.loopexit1
   ret void
 }
 
-define i32 @not_dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @not_dotp_predicated(
+define i32 @dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @dotp_predicated(
 ; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[N]], 16
@@ -1113,7 +1113,7 @@ define i32 @not_dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
@@ -1124,15 +1124,16 @@ define i32 @not_dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP10]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
 ; CHECK-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i32> [[TMP17]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP15]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP8]], <16 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP17]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP15]] = select i1 [[TMP8]], <4 x i32> [[PARTIAL_REDUCE]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    [[TMP18:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP18]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
@@ -1168,8 +1169,8 @@ for.body:                                         ; preds = %for.body.preheader,
   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
 }
 
-define i32 @not_dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @not_dotp_predicated_pragma(
+define i32 @dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @dotp_predicated_pragma(
 ; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i32 [[N]], 0
@@ -1186,7 +1187,7 @@ define i32 @not_dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
@@ -1197,15 +1198,16 @@ define i32 @not_dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP10]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
 ; CHECK-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i32> [[TMP17]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP15]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP8]], <16 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP17]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP15]] = select i1 [[TMP8]], <4 x i32> [[PARTIAL_REDUCE]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    [[TMP18:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP18]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:

>From dc6fb36d8230dc340385b61631643cd3e9afd3c8 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Thu, 24 Oct 2024 14:50:30 +0100
Subject: [PATCH 07/56] Loop over reduction vars

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 18c6f823722bdb..16ade431be764e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9225,25 +9225,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
 
   // Cache the partial reductions up front so we can remove the invalid ones
   // before creating the recipes
-  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
-    for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
-      Instruction *Instr = &I;
-      auto *Phi = dyn_cast<PHINode>(Instr);
-      if (!Phi || !Legal->isReductionVariable(Phi))
-        continue;
-      const RecurrenceDescriptor &RdxDesc =
-          Legal->getReductionVars().find(Phi)->second;
-      std::optional<PartialReductionChain> Chain =
-          getScaledReduction(Phi, RdxDesc, &TTI, Range, CM);
-      if (Chain.has_value())
-        Plan->addScaledReductionExitInstr(*Chain);
-    }
+  for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
+    std::optional<PartialReductionChain> Chain =
+        getScaledReduction(Phi, RdxDesc, &TTI, Range, CM);
+    if (Chain.has_value())
+      Plan->addScaledReductionExitInstr(*Chain);
   }
   Plan->removeInvalidScaledReductionExitInstrs();
 
   auto *MiddleVPBB = Plan->getMiddleBlock();
   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
-
   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
     // Relevant instructions from basic block BB will be grouped into VPRecipe
     // ingredients and fill a new VPBasicBlock.

>From 63797c572ae0b410e440dac81383c4bf0288ff4c Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Thu, 24 Oct 2024 14:50:45 +0100
Subject: [PATCH 08/56] Simplify state get and set calls

---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 82fdf3c1c678b8..57c08dc200e92e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -309,8 +309,8 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
 
   assert(Opcode == Instruction::Add && "Unhandled partial reduction opcode");
 
-  Value *BinOpVal = State.get(getOperand(0), 0);
-  Value *PhiVal = State.get(getOperand(1), 0);
+  Value *BinOpVal = State.get(getOperand(0));
+  Value *PhiVal = State.get(getOperand(1));
   assert(PhiVal && BinOpVal && "Phi and Mul must be set");
 
   Type *RetTy = PhiVal->getType();
@@ -320,7 +320,7 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
       {PhiVal, BinOpVal}, nullptr, Twine("partial.reduce"));
 
   // Use this vector value for all users of the original instruction.
-  State.set(this, V, 0);
+  State.set(this, V);
   State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
 }
 

>From 0ebd52e400b1695c063c14e31874a30a764bba79 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Thu, 24 Oct 2024 15:08:51 +0100
Subject: [PATCH 09/56] Remove already vectorised code from unroll test

---
 .../AArch64/partial-reduce-dot-product.ll     | 419 +++++++++---------
 1 file changed, 219 insertions(+), 200 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index b3a090e9534bee..25ad742e9ecdda 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -759,8 +759,8 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %w, ptr %scales, ptr %
 ; CHECK-NEXT:    br label [[FOR_END98]]
 ; CHECK:       for.body.us.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
-; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
-; CHECK:       for.body.us:
+; CHECK-NEXT:    br label [[ITER_CHECK:%.*]]
+; CHECK:       iter.check:
 ; CHECK-NEXT:    [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT165:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
 ; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[INDVARS_IV164]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
@@ -774,134 +774,224 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %w, ptr %scales, ptr %
 ; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX9_US]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP9]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[ITER_CHECK:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE181:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI172:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE179:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI173:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE177:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI174:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP17:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP14]]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP13]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP17]])
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD175:%.*]] = load <vscale x 16 x i8>, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD175]] to <vscale x 16 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP17]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI174]], <vscale x 16 x i32> [[TMP21]])
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP14]]
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP16]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP21]])
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD176:%.*]] = load <vscale x 16 x i8>, ptr [[TMP23]], align 1
-; CHECK-NEXT:    [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD176]] to <vscale x 16 x i32>
-; CHECK-NEXT:    [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP20]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE177]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI173]], <vscale x 16 x i32> [[TMP25]])
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP14]]
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP16]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP25]])
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD178:%.*]] = load <vscale x 16 x i8>, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP28:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD178]] to <vscale x 16 x i32>
-; CHECK-NEXT:    [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP28]], [[TMP20]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE179]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI172]], <vscale x 16 x i32> [[TMP29]])
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD180:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
-; CHECK-NEXT:    [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD180]] to <vscale x 16 x i32>
-; CHECK-NEXT:    [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP32]], [[TMP20]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE181]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP33]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
-; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP16]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP29]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[PARTIAL_REDUCE_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[PARTIAL_REDUCE177_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE177]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[PARTIAL_REDUCE179_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE179]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[PARTIAL_REDUCE181_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE181]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE181_LCSSA]])
-; CHECK-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE179_LCSSA]])
-; CHECK-NEXT:    [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE177_LCSSA]])
-; CHECK-NEXT:    [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_LCSSA]])
+; CHECK-NEXT:    [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE11]])
+; CHECK-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE9]])
+; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
+; CHECK-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[ITER_CHECK]]
-; CHECK:       iter.check:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX182:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX183:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX184:%.*]] = phi i32 [ [[TMP38]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-NEXT:    [[TMP39:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[BC_RESUME_VAL]]
-; CHECK-NEXT:    [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP40]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP39]], [[TMP41]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK9:%.*]] = icmp ult i64 [[TMP39]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK9]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH10:%.*]]
-; CHECK:       vector.ph10:
-; CHECK-NEXT:    [[N_MOD_VF11:%.*]] = urem i64 [[TMP39]], 16
-; CHECK-NEXT:    [[N_VEC12:%.*]] = sub i64 [[TMP39]], [[N_MOD_VF11]]
-; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX182]], i32 0
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX183]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX184]], i32 0
-; CHECK-NEXT:    br label [[VECTOR_BODY13:%.*]]
-; CHECK:       vector.body13:
-; CHECK-NEXT:    [[INDEX14:%.*]] = phi i64 [ 0, [[VECTOR_PH10]] ], [ [[INDEX_NEXT28:%.*]], [[VECTOR_BODY13]] ]
-; CHECK-NEXT:    [[VEC_PHI15:%.*]] = phi <4 x i32> [ [[TMP42]], [[VECTOR_PH10]] ], [ [[PARTIAL_REDUCE27:%.*]], [[VECTOR_BODY13]] ]
-; CHECK-NEXT:    [[VEC_PHI16:%.*]] = phi <4 x i32> [ [[TMP43]], [[VECTOR_PH10]] ], [ [[PARTIAL_REDUCE25:%.*]], [[VECTOR_BODY13]] ]
-; CHECK-NEXT:    [[VEC_PHI17:%.*]] = phi <4 x i32> [ [[TMP44]], [[VECTOR_PH10]] ], [ [[PARTIAL_REDUCE23:%.*]], [[VECTOR_BODY13]] ]
-; CHECK-NEXT:    [[VEC_PHI18:%.*]] = phi <4 x i32> [ [[TMP45]], [[VECTOR_PH10]] ], [ [[PARTIAL_REDUCE21:%.*]], [[VECTOR_BODY13]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[BC_RESUME_VAL]], [[INDEX14]]
-; CHECK-NEXT:    [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP46]]
-; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[TMP47]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP48]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
-; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP46]]
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP50]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP51]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
-; CHECK-NEXT:    [[TMP53:%.*]] = mul nsw <16 x i32> [[TMP52]], [[TMP49]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI18]], <16 x i32> [[TMP53]])
-; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP46]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP36:%.*]] = mul i64 [[TMP35]], 4
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP36]]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP31]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX12:%.*]] = phi i32 [ [[TMP32]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP33]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP34]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP38:%.*]] = mul i64 [[TMP37]], 4
+; CHECK-NEXT:    [[N_MOD_VF15:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP38]]
+; CHECK-NEXT:    [[N_VEC16:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF15]]
+; CHECK-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 4
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX12]], i32 0
+; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX13]], i32 0
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX14]], i32 0
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK-NEXT:    [[INDEX17:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI18:%.*]] = phi <vscale x 4 x i32> [ [[TMP41]], [[VEC_EPILOG_PH]] ], [ [[TMP68:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI19:%.*]] = phi <vscale x 4 x i32> [ [[TMP42]], [[VEC_EPILOG_PH]] ], [ [[TMP63:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI20:%.*]] = phi <vscale x 4 x i32> [ [[TMP43]], [[VEC_EPILOG_PH]] ], [ [[TMP58:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI21:%.*]] = phi <vscale x 4 x i32> [ [[TMP44]], [[VEC_EPILOG_PH]] ], [ [[TMP53:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP45:%.*]] = add i64 [[INDEX17]], 0
+; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP45]]
+; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[TMP46]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = load <vscale x 4 x i8>, ptr [[TMP47]], align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD22]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP45]]
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[TMP49]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD23:%.*]] = load <vscale x 4 x i8>, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD23]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = mul nsw <vscale x 4 x i32> [[TMP51]], [[TMP48]]
+; CHECK-NEXT:    [[TMP53]] = add <vscale x 4 x i32> [[TMP52]], [[VEC_PHI21]]
+; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP45]]
 ; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = load <16 x i8>, ptr [[TMP55]], align 1
-; CHECK-NEXT:    [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD22]] to <16 x i32>
-; CHECK-NEXT:    [[TMP57:%.*]] = mul nsw <16 x i32> [[TMP56]], [[TMP52]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE23]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI17]], <16 x i32> [[TMP57]])
-; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP46]]
-; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP59]], align 1
-; CHECK-NEXT:    [[TMP60:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
-; CHECK-NEXT:    [[TMP61:%.*]] = mul nsw <16 x i32> [[TMP60]], [[TMP52]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE25]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI16]], <16 x i32> [[TMP61]])
-; CHECK-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP46]]
-; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[TMP62]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP63]], align 1
-; CHECK-NEXT:    [[TMP64:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
-; CHECK-NEXT:    [[TMP65:%.*]] = mul nsw <16 x i32> [[TMP64]], [[TMP52]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE27]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI15]], <16 x i32> [[TMP65]])
-; CHECK-NEXT:    [[INDEX_NEXT28]] = add nuw i64 [[INDEX14]], 16
-; CHECK-NEXT:    [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC12]]
-; CHECK-NEXT:    br i1 [[TMP66]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY13]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       middle.block5:
-; CHECK-NEXT:    [[TMP67:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE27]])
-; CHECK-NEXT:    [[TMP68:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE25]])
-; CHECK-NEXT:    [[TMP69:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE23]])
-; CHECK-NEXT:    [[TMP70:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
-; CHECK-NEXT:    [[CMP_N29:%.*]] = icmp eq i64 [[TMP39]], [[N_VEC12]]
-; CHECK-NEXT:    br i1 [[CMP_N29]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-NEXT:    [[WIDE_LOAD24:%.*]] = load <vscale x 4 x i8>, ptr [[TMP55]], align 1
+; CHECK-NEXT:    [[TMP56:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD24]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP57:%.*]] = mul nsw <vscale x 4 x i32> [[TMP56]], [[TMP51]]
+; CHECK-NEXT:    [[TMP58]] = add <vscale x 4 x i32> [[TMP57]], [[VEC_PHI20]]
+; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP45]]
+; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[TMP59]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = load <vscale x 4 x i8>, ptr [[TMP60]], align 1
+; CHECK-NEXT:    [[TMP61:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD25]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP62:%.*]] = mul nsw <vscale x 4 x i32> [[TMP61]], [[TMP51]]
+; CHECK-NEXT:    [[TMP63]] = add <vscale x 4 x i32> [[TMP62]], [[VEC_PHI19]]
+; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP45]]
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i8, ptr [[TMP64]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD26:%.*]] = load <vscale x 4 x i8>, ptr [[TMP65]], align 1
+; CHECK-NEXT:    [[TMP66:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD26]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP67:%.*]] = mul nsw <vscale x 4 x i32> [[TMP66]], [[TMP51]]
+; CHECK-NEXT:    [[TMP68]] = add <vscale x 4 x i32> [[TMP67]], [[VEC_PHI18]]
+; CHECK-NEXT:    [[INDEX_NEXT27]] = add nuw i64 [[INDEX17]], [[TMP40]]
+; CHECK-NEXT:    [[TMP69:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC16]]
+; CHECK-NEXT:    br i1 [[TMP69]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       vec.epilog.middle.block:
+; CHECK-NEXT:    [[TMP70:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP68]])
+; CHECK-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP63]])
+; CHECK-NEXT:    [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP58]])
+; CHECK-NEXT:    [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP53]])
+; CHECK-NEXT:    [[CMP_N28:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC16]]
+; CHECK-NEXT:    br i1 [[CMP_N28]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       vec.epilog.scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX29:%.*]] = phi i32 [ [[TMP70]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP31]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX30:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP32]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX31:%.*]] = phi i32 [ [[TMP72]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP33]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX32:%.*]] = phi i32 [ [[TMP73]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP34]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY12_US:%.*]]
+; CHECK:       for.body12.us:
+; CHECK-NEXT:    [[INDVARS_IV161:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT162:%.*]], [[FOR_BODY12_US]] ]
+; CHECK-NEXT:    [[TOTAL3_0149_US:%.*]] = phi i32 [ [[BC_MERGE_RDX29]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD43_US:%.*]], [[FOR_BODY12_US]] ]
+; CHECK-NEXT:    [[TOTAL2_0148_US:%.*]] = phi i32 [ [[BC_MERGE_RDX30]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD35_US:%.*]], [[FOR_BODY12_US]] ]
+; CHECK-NEXT:    [[TOTAL1_0147_US:%.*]] = phi i32 [ [[BC_MERGE_RDX31]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD27_US:%.*]], [[FOR_BODY12_US]] ]
+; CHECK-NEXT:    [[TOTAL0_0146_US:%.*]] = phi i32 [ [[BC_MERGE_RDX32]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD19_US:%.*]], [[FOR_BODY12_US]] ]
+; CHECK-NEXT:    [[ARRAYIDX14_US:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDVARS_IV161]]
+; CHECK-NEXT:    [[TMP74:%.*]] = load i8, ptr [[ARRAYIDX14_US]], align 1
+; CHECK-NEXT:    [[CONV_US:%.*]] = sext i8 [[TMP74]] to i32
+; CHECK-NEXT:    [[ARRAYIDX16_US:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[INDVARS_IV161]]
+; CHECK-NEXT:    [[TMP75:%.*]] = load i8, ptr [[ARRAYIDX16_US]], align 1
+; CHECK-NEXT:    [[CONV17_US:%.*]] = sext i8 [[TMP75]] to i32
+; CHECK-NEXT:    [[MUL18_US:%.*]] = mul nsw i32 [[CONV17_US]], [[CONV_US]]
+; CHECK-NEXT:    [[ADD19_US]] = add nsw i32 [[MUL18_US]], [[TOTAL0_0146_US]]
+; CHECK-NEXT:    [[ARRAYIDX21_US:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[INDVARS_IV161]]
+; CHECK-NEXT:    [[TMP76:%.*]] = load i8, ptr [[ARRAYIDX21_US]], align 1
+; CHECK-NEXT:    [[CONV22_US:%.*]] = sext i8 [[TMP76]] to i32
+; CHECK-NEXT:    [[MUL26_US:%.*]] = mul nsw i32 [[CONV22_US]], [[CONV17_US]]
+; CHECK-NEXT:    [[ADD27_US]] = add nsw i32 [[MUL26_US]], [[TOTAL1_0147_US]]
+; CHECK-NEXT:    [[ARRAYIDX29_US:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[INDVARS_IV161]]
+; CHECK-NEXT:    [[TMP77:%.*]] = load i8, ptr [[ARRAYIDX29_US]], align 1
+; CHECK-NEXT:    [[CONV30_US:%.*]] = sext i8 [[TMP77]] to i32
+; CHECK-NEXT:    [[MUL34_US:%.*]] = mul nsw i32 [[CONV30_US]], [[CONV17_US]]
+; CHECK-NEXT:    [[ADD35_US]] = add nsw i32 [[MUL34_US]], [[TOTAL2_0148_US]]
+; CHECK-NEXT:    [[ARRAYIDX37_US:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[INDVARS_IV161]]
+; CHECK-NEXT:    [[TMP78:%.*]] = load i8, ptr [[ARRAYIDX37_US]], align 1
+; CHECK-NEXT:    [[CONV38_US:%.*]] = sext i8 [[TMP78]] to i32
+; CHECK-NEXT:    [[MUL42_US:%.*]] = mul nsw i32 [[CONV38_US]], [[CONV17_US]]
+; CHECK-NEXT:    [[ADD43_US]] = add nsw i32 [[MUL42_US]], [[TOTAL3_0149_US]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT162]] = add nuw nsw i64 [[INDVARS_IV161]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT162]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[FOR_BODY12_US]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       for.cond10.for.cond.cleanup_crit_edge.us:
+; CHECK-NEXT:    [[ADD19_US_LCSSA:%.*]] = phi i32 [ [[ADD19_US]], [[FOR_BODY12_US]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ], [ [[TMP73]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD27_US_LCSSA:%.*]] = phi i32 [ [[ADD27_US]], [[FOR_BODY12_US]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ [[TMP72]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD35_US_LCSSA:%.*]] = phi i32 [ [[ADD35_US]], [[FOR_BODY12_US]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ], [ [[TMP71]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD43_US_LCSSA:%.*]] = phi i32 [ [[ADD43_US]], [[FOR_BODY12_US]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ], [ [[TMP70]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ARRAYIDX45_US:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[IDXPROM44]]
+; CHECK-NEXT:    [[TMP79:%.*]] = load i8, ptr [[ARRAYIDX45_US]], align 1
+; CHECK-NEXT:    [[CONV46_US:%.*]] = sext i8 [[TMP79]] to i32
+; CHECK-NEXT:    [[MUL47_US:%.*]] = mul nsw i32 [[CONV46_US]], 127
+; CHECK-NEXT:    [[ADD48_US:%.*]] = add nsw i32 [[MUL47_US]], [[ADD19_US_LCSSA]]
+; CHECK-NEXT:    [[CONV49_US:%.*]] = sitofp i32 [[ADD48_US]] to float
+; CHECK-NEXT:    [[ARRAYIDX52_US:%.*]] = getelementptr inbounds float, ptr [[SCALES]], i64 [[INDVARS_IV164]]
+; CHECK-NEXT:    [[TMP80:%.*]] = load float, ptr [[ARRAYIDX52_US]], align 4
+; CHECK-NEXT:    [[MUL53_US:%.*]] = fmul float [[TMP80]], [[CONV49_US]]
+; CHECK-NEXT:    [[ARRAYIDX56_US:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[INDVARS_IV164]]
+; CHECK-NEXT:    store float [[MUL53_US]], ptr [[ARRAYIDX56_US]], align 4
+; CHECK-NEXT:    [[ARRAYIDX58_US:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[IDXPROM44]]
+; CHECK-NEXT:    [[TMP81:%.*]] = load i8, ptr [[ARRAYIDX58_US]], align 1
+; CHECK-NEXT:    [[CONV59_US:%.*]] = sext i8 [[TMP81]] to i32
+; CHECK-NEXT:    [[MUL60_US:%.*]] = mul nsw i32 [[CONV59_US]], 127
+; CHECK-NEXT:    [[ADD61_US:%.*]] = add nsw i32 [[MUL60_US]], [[ADD27_US_LCSSA]]
+; CHECK-NEXT:    [[CONV62_US:%.*]] = sitofp i32 [[ADD61_US]] to float
+; CHECK-NEXT:    [[ARRAYIDX65_US:%.*]] = getelementptr inbounds float, ptr [[SCALES]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP82:%.*]] = load float, ptr [[ARRAYIDX65_US]], align 4
+; CHECK-NEXT:    [[MUL66_US:%.*]] = fmul float [[TMP82]], [[CONV62_US]]
+; CHECK-NEXT:    [[ARRAYIDX69_US:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP2]]
+; CHECK-NEXT:    store float [[MUL66_US]], ptr [[ARRAYIDX69_US]], align 4
+; CHECK-NEXT:    [[ARRAYIDX71_US:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[IDXPROM44]]
+; CHECK-NEXT:    [[TMP83:%.*]] = load i8, ptr [[ARRAYIDX71_US]], align 1
+; CHECK-NEXT:    [[CONV72_US:%.*]] = sext i8 [[TMP83]] to i32
+; CHECK-NEXT:    [[MUL73_US:%.*]] = mul nsw i32 [[CONV72_US]], 127
+; CHECK-NEXT:    [[ADD74_US:%.*]] = add nsw i32 [[MUL73_US]], [[ADD35_US_LCSSA]]
+; CHECK-NEXT:    [[CONV75_US:%.*]] = sitofp i32 [[ADD74_US]] to float
+; CHECK-NEXT:    [[ARRAYIDX78_US:%.*]] = getelementptr inbounds float, ptr [[SCALES]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP84:%.*]] = load float, ptr [[ARRAYIDX78_US]], align 4
+; CHECK-NEXT:    [[MUL79_US:%.*]] = fmul float [[TMP84]], [[CONV75_US]]
+; CHECK-NEXT:    [[ARRAYIDX82_US:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP4]]
+; CHECK-NEXT:    store float [[MUL79_US]], ptr [[ARRAYIDX82_US]], align 4
+; CHECK-NEXT:    [[ARRAYIDX84_US:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[IDXPROM44]]
+; CHECK-NEXT:    [[TMP85:%.*]] = load i8, ptr [[ARRAYIDX84_US]], align 1
+; CHECK-NEXT:    [[CONV85_US:%.*]] = sext i8 [[TMP85]] to i32
+; CHECK-NEXT:    [[MUL86_US:%.*]] = mul nsw i32 [[CONV85_US]], 127
+; CHECK-NEXT:    [[ADD87_US:%.*]] = add nsw i32 [[MUL86_US]], [[ADD43_US_LCSSA]]
+; CHECK-NEXT:    [[CONV88_US:%.*]] = sitofp i32 [[ADD87_US]] to float
+; CHECK-NEXT:    [[ARRAYIDX91_US:%.*]] = getelementptr inbounds float, ptr [[SCALES]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP86:%.*]] = load float, ptr [[ARRAYIDX91_US]], align 4
+; CHECK-NEXT:    [[MUL92_US:%.*]] = fmul float [[TMP86]], [[CONV88_US]]
+; CHECK-NEXT:    [[ARRAYIDX95_US:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP6]]
+; CHECK-NEXT:    store float [[MUL92_US]], ptr [[ARRAYIDX95_US]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT165]] = add nuw nsw i64 [[INDVARS_IV164]], 4
+; CHECK-NEXT:    [[CMP_US:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT165]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[CMP_US]], label [[ITER_CHECK]], label [[FOR_END98_LOOPEXIT:%.*]]
+; CHECK:       for.end98.loopexit:
+; CHECK-NEXT:    br label [[FOR_END98]]
+; CHECK:       for.end98:
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cmp154 = icmp sgt i32 %num_out, 3
@@ -935,81 +1025,14 @@ for.body.us:                                      ; preds = %for.body.us.prehead
   %6 = or disjoint i64 %indvars.iv164, 3
   %arrayidx9.us = getelementptr inbounds ptr, ptr %w, i64 %6
   %7 = load ptr, ptr %arrayidx9.us, align 8
-  %8 = call i64 @llvm.vscale.i64()
-  %9 = mul i64 %8, 16
-  %min.iters.check = icmp ult i64 %wide.trip.count, %9
-  br i1 %min.iters.check, label %scalar.ph, label %vector.ph
-
-vector.ph:                                        ; preds = %for.body.us
-  %10 = call i64 @llvm.vscale.i64()
-  %11 = mul i64 %10, 16
-  %n.mod.vf = urem i64 %wide.trip.count, %11
-  %n.vec = sub i64 %wide.trip.count, %n.mod.vf
-  %12 = call i64 @llvm.vscale.i64()
-  %13 = mul i64 %12, 16
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce181, %vector.body ]
-  %vec.phi172 = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce179, %vector.body ]
-  %vec.phi173 = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce177, %vector.body ]
-  %vec.phi174 = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce, %vector.body ]
-  %14 = add i64 %index, 0
-  %15 = getelementptr inbounds i8, ptr %1, i64 %14
-  %16 = getelementptr inbounds i8, ptr %15, i32 0
-  %wide.load = load <vscale x 16 x i8>, ptr %16, align 1
-  %17 = sext <vscale x 16 x i8> %wide.load to <vscale x 16 x i32>
-  %18 = getelementptr inbounds i8, ptr %u, i64 %14
-  %19 = getelementptr inbounds i8, ptr %18, i32 0
-  %wide.load175 = load <vscale x 16 x i8>, ptr %19, align 1
-  %20 = sext <vscale x 16 x i8> %wide.load175 to <vscale x 16 x i32>
-  %21 = mul nsw <vscale x 16 x i32> %20, %17
-  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi174, <vscale x 16 x i32> %21)
-  %22 = getelementptr inbounds i8, ptr %3, i64 %14
-  %23 = getelementptr inbounds i8, ptr %22, i32 0
-  %wide.load176 = load <vscale x 16 x i8>, ptr %23, align 1
-  %24 = sext <vscale x 16 x i8> %wide.load176 to <vscale x 16 x i32>
-  %25 = mul nsw <vscale x 16 x i32> %24, %20
-  %partial.reduce177 = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi173, <vscale x 16 x i32> %25)
-  %26 = getelementptr inbounds i8, ptr %5, i64 %14
-  %27 = getelementptr inbounds i8, ptr %26, i32 0
-  %wide.load178 = load <vscale x 16 x i8>, ptr %27, align 1
-  %28 = sext <vscale x 16 x i8> %wide.load178 to <vscale x 16 x i32>
-  %29 = mul nsw <vscale x 16 x i32> %28, %20
-  %partial.reduce179 = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi172, <vscale x 16 x i32> %29)
-  %30 = getelementptr inbounds i8, ptr %7, i64 %14
-  %31 = getelementptr inbounds i8, ptr %30, i32 0
-  %wide.load180 = load <vscale x 16 x i8>, ptr %31, align 1
-  %32 = sext <vscale x 16 x i8> %wide.load180 to <vscale x 16 x i32>
-  %33 = mul nsw <vscale x 16 x i32> %32, %20
-  %partial.reduce181 = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %33)
-  %index.next = add nuw i64 %index, %13
-  %34 = icmp eq i64 %index.next, %n.vec
-  br i1 %34, label %middle.block, label %vector.body
-
-middle.block:                                     ; preds = %vector.body
-  %35 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce181)
-  %36 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce179)
-  %37 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce177)
-  %38 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce)
-  %cmp.n = icmp eq i64 %wide.trip.count, %n.vec
-  br i1 %cmp.n, label %for.cond10.for.cond.cleanup_crit_edge.us, label %scalar.ph
-
-scalar.ph:                                        ; preds = %middle.block, %for.body.us
-  %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.us ]
-  %bc.merge.rdx = phi i32 [ %35, %middle.block ], [ 0, %for.body.us ]
-  %bc.merge.rdx182 = phi i32 [ %36, %middle.block ], [ 0, %for.body.us ]
-  %bc.merge.rdx183 = phi i32 [ %37, %middle.block ], [ 0, %for.body.us ]
-  %bc.merge.rdx184 = phi i32 [ %38, %middle.block ], [ 0, %for.body.us ]
   br label %for.body12.us
 
-for.body12.us:                                    ; preds = %scalar.ph, %for.body12.us
-  %indvars.iv161 = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next162, %for.body12.us ]
-  %total3.0149.us = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %add43.us, %for.body12.us ]
-  %total2.0148.us = phi i32 [ %bc.merge.rdx182, %scalar.ph ], [ %add35.us, %for.body12.us ]
-  %total1.0147.us = phi i32 [ %bc.merge.rdx183, %scalar.ph ], [ %add27.us, %for.body12.us ]
-  %total0.0146.us = phi i32 [ %bc.merge.rdx184, %scalar.ph ], [ %add19.us, %for.body12.us ]
+for.body12.us:                                    ; preds = %for.body.us, %for.body12.us
+  %indvars.iv161 = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next162, %for.body12.us ]
+  %total3.0149.us = phi i32 [ 0, %for.body.us ], [ %add43.us, %for.body12.us ]
+  %total2.0148.us = phi i32 [ 0, %for.body.us ], [ %add35.us, %for.body12.us ]
+  %total1.0147.us = phi i32 [ 0, %for.body.us ], [ %add27.us, %for.body12.us ]
+  %total0.0146.us = phi i32 [ 0, %for.body.us ], [ %add19.us, %for.body12.us ]
   %arrayidx14.us = getelementptr inbounds i8, ptr %1, i64 %indvars.iv161
   %39 = load i8, ptr %arrayidx14.us, align 1
   %conv.us = sext i8 %39 to i32
@@ -1037,16 +1060,12 @@ for.body12.us:                                    ; preds = %scalar.ph, %for.bod
   %exitcond.not = icmp eq i64 %indvars.iv.next162, %wide.trip.count
   br i1 %exitcond.not, label %for.cond10.for.cond.cleanup_crit_edge.us, label %for.body12.us
 
-for.cond10.for.cond.cleanup_crit_edge.us:         ; preds = %middle.block, %for.body12.us
-  %add19.us.lcssa = phi i32 [ %add19.us, %for.body12.us ], [ %38, %middle.block ]
-  %add27.us.lcssa = phi i32 [ %add27.us, %for.body12.us ], [ %37, %middle.block ]
-  %add35.us.lcssa = phi i32 [ %add35.us, %for.body12.us ], [ %36, %middle.block ]
-  %add43.us.lcssa = phi i32 [ %add43.us, %for.body12.us ], [ %35, %middle.block ]
+for.cond10.for.cond.cleanup_crit_edge.us:         ; preds = %for.body12.us
   %arrayidx45.us = getelementptr inbounds i8, ptr %1, i64 %idxprom44
   %44 = load i8, ptr %arrayidx45.us, align 1
   %conv46.us = sext i8 %44 to i32
   %mul47.us = mul nsw i32 %conv46.us, 127
-  %add48.us = add nsw i32 %mul47.us, %add19.us.lcssa
+  %add48.us = add nsw i32 %mul47.us, %add19.us
   %conv49.us = sitofp i32 %add48.us to float
   %arrayidx52.us = getelementptr inbounds float, ptr %scales, i64 %indvars.iv164
   %45 = load float, ptr %arrayidx52.us, align 4
@@ -1057,7 +1076,7 @@ for.cond10.for.cond.cleanup_crit_edge.us:         ; preds = %middle.block, %for.
   %46 = load i8, ptr %arrayidx58.us, align 1
   %conv59.us = sext i8 %46 to i32
   %mul60.us = mul nsw i32 %conv59.us, 127
-  %add61.us = add nsw i32 %mul60.us, %add27.us.lcssa
+  %add61.us = add nsw i32 %mul60.us, %add27.us
   %conv62.us = sitofp i32 %add61.us to float
   %arrayidx65.us = getelementptr inbounds float, ptr %scales, i64 %2
   %47 = load float, ptr %arrayidx65.us, align 4
@@ -1068,7 +1087,7 @@ for.cond10.for.cond.cleanup_crit_edge.us:         ; preds = %middle.block, %for.
   %48 = load i8, ptr %arrayidx71.us, align 1
   %conv72.us = sext i8 %48 to i32
   %mul73.us = mul nsw i32 %conv72.us, 127
-  %add74.us = add nsw i32 %mul73.us, %add35.us.lcssa
+  %add74.us = add nsw i32 %mul73.us, %add35.us
   %conv75.us = sitofp i32 %add74.us to float
   %arrayidx78.us = getelementptr inbounds float, ptr %scales, i64 %4
   %49 = load float, ptr %arrayidx78.us, align 4
@@ -1079,7 +1098,7 @@ for.cond10.for.cond.cleanup_crit_edge.us:         ; preds = %middle.block, %for.
   %50 = load i8, ptr %arrayidx84.us, align 1
   %conv85.us = sext i8 %50 to i32
   %mul86.us = mul nsw i32 %conv85.us, 127
-  %add87.us = add nsw i32 %mul86.us, %add43.us.lcssa
+  %add87.us = add nsw i32 %mul86.us, %add43.us
   %conv88.us = sitofp i32 %add87.us to float
   %arrayidx91.us = getelementptr inbounds float, ptr %scales, i64 %6
   %51 = load float, ptr %arrayidx91.us, align 4

>From 1ef66abf4c7153d7c6038982f48dc78e6bb29884 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Thu, 24 Oct 2024 15:36:49 +0100
Subject: [PATCH 10/56] Revert "Support predicated loops"

This reverts commit cd7272dbfc9e2a32f1ef760fd25756864fac34f6.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  2 ++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  9 -------
 .../AArch64/partial-reduce-dot-product.ll     | 26 +++++++++----------
 3 files changed, 14 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 16ade431be764e..1eb1786af01d64 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8713,6 +8713,8 @@ getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx,
   // the end of the loop chooses between the phi value and most recent
   // reduction result, both of which have different VFs to the active lane
   // mask when scaling.
+  if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
+    return std::nullopt;
 
   auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
   if (!Update)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 57c08dc200e92e..f610d232c44a31 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -451,15 +451,6 @@ Value *VPInstruction::generate(VPTransformState &State) {
     Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
     Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
     Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
-    auto *CondVec = dyn_cast<VectorType>(Cond->getType());
-    auto *Op1Vec = dyn_cast<VectorType>(Op1->getType());
-    // If the condition and values differ in their element count (as can happen
-    // in the case of predicated partial reductions) then reduce the condition
-    // to a single value and select based on that instead.
-    if (CondVec && Op1Vec &&
-        CondVec->getElementCount() != Op1Vec->getElementCount() &&
-        CondVec->getElementType() == IntegerType::getInt1Ty(Cond->getContext()))
-      Cond = Builder.CreateOrReduce(Cond);
     return Builder.CreateSelect(Cond, Op1, Op2, Name);
   }
   case VPInstruction::ActiveLaneMask: {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 25ad742e9ecdda..e4be826ce71699 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1113,8 +1113,8 @@ for.end98:                                        ; preds = %for.end98.loopexit1
   ret void
 }
 
-define i32 @dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @dotp_predicated(
+define i32 @not_dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @not_dotp_predicated(
 ; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[N]], 16
@@ -1132,7 +1132,7 @@ define i32 @dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
@@ -1143,16 +1143,15 @@ define i32 @dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP10]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
 ; CHECK-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP3]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP17]])
-; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP15]] = select i1 [[TMP8]], <4 x i32> [[PARTIAL_REDUCE]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i32> [[TMP17]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP15]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP8]], <16 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    [[TMP18:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP18]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
@@ -1188,8 +1187,8 @@ for.body:                                         ; preds = %for.body.preheader,
   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
 }
 
-define i32 @dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @dotp_predicated_pragma(
+define i32 @not_dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @not_dotp_predicated_pragma(
 ; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i32 [[N]], 0
@@ -1206,7 +1205,7 @@ define i32 @dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
@@ -1217,16 +1216,15 @@ define i32 @dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP10]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
 ; CHECK-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP3]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP17]])
-; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP15]] = select i1 [[TMP8]], <4 x i32> [[PARTIAL_REDUCE]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i32> [[TMP17]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP15]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP8]], <16 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    [[TMP18:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP18]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:

>From 73ff592f93d457c541b4ba5517b463679e6f7b6b Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 29 Oct 2024 16:12:07 +0000
Subject: [PATCH 11/56] Collapse Chain declaration and if statement

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1eb1786af01d64..4b09b44c4b71d5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9227,12 +9227,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
 
   // Cache the partial reductions up front so we can remove the invalid ones
   // before creating the recipes
-  for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
-    std::optional<PartialReductionChain> Chain =
-        getScaledReduction(Phi, RdxDesc, &TTI, Range, CM);
-    if (Chain.has_value())
+  for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
+    if (std::optional<PartialReductionChain> Chain =
+        getScaledReduction(Phi, RdxDesc, &TTI, Range, CM))
       Plan->addScaledReductionExitInstr(*Chain);
-  }
   Plan->removeInvalidScaledReductionExitInstrs();
 
   auto *MiddleVPBB = Plan->getMiddleBlock();

>From 959925b124e01141243c611c5e77857b712551ec Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 30 Oct 2024 10:16:36 +0000
Subject: [PATCH 12/56] Move scaled reduction list to VPRecipeBuilder

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 46 ++--------
 .../Transforms/Vectorize/VPRecipeBuilder.h    | 73 ++++++++++++++++
 llvm/lib/Transforms/Vectorize/VPlan.h         | 86 ++-----------------
 .../AArch64/partial-reduce-dot-product.ll     | 76 ++++++++--------
 .../LoopVectorize/AArch64/vplan-printing.ll   | 14 +--
 5 files changed, 127 insertions(+), 168 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4b09b44c4b71d5..fced747fd90308 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1537,42 +1537,6 @@ class LoopVectorizationCostModel {
   /// trivially hoistable.
   bool shouldConsiderInvariant(Value *Op);
 
-  /// A chain of instructions that form a partial reduction.
-  /// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))),
-  /// accumulator)
-  struct PartialReductionChain {
-    /// The top-level binary operation that forms the reduction to a scalar
-    /// after the loop body
-    Instruction *Reduction;
-    /// The inner binary operation that forms the reduction to a vector value
-    /// within the loop body
-    Instruction *BinOp;
-    /// The extension of each of the inner binary operation's operands
-    Instruction *ExtendA;
-    Instruction *ExtendB;
-
-    /// The accumulator that is reduced to a scalar after the loop body
-    Value *Accumulator;
-
-    /// The scaling factor between the size of the reduction type and the
-    /// (possibly extended) inputs
-    unsigned ScaleFactor;
-  };
-
-  using PartialReductionList = DenseMap<Instruction *, PartialReductionChain>;
-
-  PartialReductionList getPartialReductionChains() {
-    return PartialReductionChains;
-  }
-
-  std::optional<PartialReductionChain>
-  getInstructionsPartialReduction(Instruction *I) const {
-    auto PairIt = PartialReductionChains.find(I);
-    if (PairIt == PartialReductionChains.end())
-      return std::nullopt;
-    return PairIt->second;
-  }
-
 private:
   unsigned NumPredStores = 0;
 
@@ -8801,7 +8765,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
 
       // If the PHI is used by a partial reduction, set the scale factor
       std::optional<PartialReductionChain> Chain =
-          Plan.getScaledReductionForInstr(RdxDesc.getLoopExitInstr());
+          getScaledReductionForInstr(RdxDesc.getLoopExitInstr());
       unsigned ScaleFactor = Chain ? Chain->ScaleFactor : 1;
       PhiRecipe = new VPReductionPHIRecipe(
           Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
@@ -8837,7 +8801,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
     return tryToWidenMemory(Instr, Operands, Range);
 
-  if (Plan.getScaledReductionForInstr(Instr))
+  if (getScaledReductionForInstr(Instr))
     return tryToCreatePartialReduction(Instr, Operands);
 
   if (!shouldWiden(Instr, Range))
@@ -8874,7 +8838,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
 
   SmallVector<VPValue *, 2> OrderedOperands = {BinOp, Phi};
   return new VPPartialReductionRecipe(
-      *Reduction, make_range(OrderedOperands.begin(), OrderedOperands.end()));
+      Reduction->getOpcode(), make_range(OrderedOperands.begin(), OrderedOperands.end()));
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9230,8 +9194,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
     if (std::optional<PartialReductionChain> Chain =
         getScaledReduction(Phi, RdxDesc, &TTI, Range, CM))
-      Plan->addScaledReductionExitInstr(*Chain);
-  Plan->removeInvalidScaledReductionExitInstrs();
+      RecipeBuilder.addScaledReductionExitInstr(*Chain);
+  RecipeBuilder.removeInvalidScaledReductionExitInstrs();
 
   auto *MiddleVPBB = Plan->getMiddleBlock();
   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b93b8df8653ef2..e25979badf79e8 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -24,6 +24,25 @@ class TargetLibraryInfo;
 struct HistogramInfo;
 class TargetTransformInfo;
 
+/// A chain of instructions that form a partial reduction.
+/// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))),
+/// accumulator)
+struct PartialReductionChain {
+  /// The top-level binary operation that forms the reduction to a scalar
+  /// after the loop body
+  Instruction *Reduction;
+  /// The extension of each of the inner binary operation's operands
+  Instruction *ExtendA;
+  Instruction *ExtendB;
+
+  Instruction *BinOp;
+
+  /// The scaling factor between the size of the reduction type and the
+  /// (possibly extended) inputs
+  unsigned ScaleFactor;
+};
+
+
 /// Helper class to create VPRecipies from IR instructions.
 class VPRecipeBuilder {
   /// The VPlan new recipes are added to.
@@ -67,6 +86,11 @@ class VPRecipeBuilder {
   /// created.
   SmallVector<VPHeaderPHIRecipe *, 4> PhisToFix;
 
+  /// The set of reduction exit instructions that will be scaled to
+  /// a smaller VF via partial reductions.
+  DenseMap<const Instruction *, PartialReductionChain>
+      ScaledReductionExitInstrs;
+
   /// Check if \p I can be widened at the start of \p Range and possibly
   /// decrease the range such that the returned value holds for the entire \p
   /// Range. The function should not be called for memory instructions or calls.
@@ -124,6 +148,55 @@ class VPRecipeBuilder {
       : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal),
         CM(CM), PSE(PSE), Builder(Builder) {}
 
+  void addScaledReductionExitInstr(PartialReductionChain Chain) {
+    ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Chain));
+  }
+
+  std::optional<PartialReductionChain>
+  getScaledReductionForInstr(const Instruction *ExitInst) {
+    auto It = ScaledReductionExitInstrs.find(ExitInst);
+    return It == ScaledReductionExitInstrs.end()
+               ? std::nullopt
+               : std::make_optional(It->second);
+  }
+
+  void removeInvalidScaledReductionExitInstrs() {
+    // A partial reduction is invalid if any of its extends are used by
+    // something that isn't another partial reduction. This is because the
+    // extends are intended to be lowered along with the reduction itself.
+
+    // Build up a set of partial reduction bin ops for efficient use checking
+    SmallSet<Instruction *, 4> PartialReductionBinOps;
+    for (auto It : ScaledReductionExitInstrs) {
+      if (It.second.BinOp)
+        PartialReductionBinOps.insert(It.second.BinOp);
+    }
+
+    auto ExtendIsOnlyUsedByPartialReductions =
+        [PartialReductionBinOps](Instruction *Extend) {
+          for (auto *Use : Extend->users()) {
+            Instruction *UseInstr = dyn_cast<Instruction>(Use);
+            if (!PartialReductionBinOps.contains(UseInstr))
+              return false;
+          }
+          return true;
+        };
+
+    // Check if each use of a chain's two extends is a partial reduction
+    // and remove those that have non-partial reduction users
+    SmallSet<Instruction *, 4> PartialReductionsToRemove;
+    for (auto It : ScaledReductionExitInstrs) {
+      PartialReductionChain Chain = It.second;
+      if (!ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) ||
+          !ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
+        PartialReductionsToRemove.insert(Chain.Reduction);
+    }
+
+    for (auto *Instr : PartialReductionsToRemove)
+      ScaledReductionExitInstrs.erase(Instr);
+  }
+
+
   /// Create and return a widened recipe for \p I if one can be created within
   /// the given VF \p Range.
   VPRecipeBase *tryToCreateWidenRecipe(Instruction *Instr,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5352b46992d619..290546adc59359 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2389,25 +2389,26 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
 /// scalar value.
 class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
   unsigned Opcode;
-  Instruction &Reduction;
 
 public:
   template <typename IterT>
-  VPPartialReductionRecipe(Instruction &I, iterator_range<IterT> Operands)
-      : VPRecipeWithIRFlags(VPDef::VPPartialReductionSC, Operands, I),
-        Opcode(I.getOpcode()), Reduction(I) {
+  VPPartialReductionRecipe(unsigned ReductionOpcode, iterator_range<IterT> Operands)
+      : VPRecipeWithIRFlags(VPDef::VPPartialReductionSC, Operands),
+        Opcode(ReductionOpcode) {
     assert(isa<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe()) &&
            "Unexpected operand order for partial reduction recipe");
   }
   ~VPPartialReductionRecipe() override = default;
   VPPartialReductionRecipe *clone() override {
-    auto Ops = operands();
-    return new VPPartialReductionRecipe(Reduction,
-                                        make_range(Ops.begin(), Ops.end()));
+    return new VPPartialReductionRecipe(Opcode,
+                                        operands());
   }
+
   VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
+
   /// Generate the reduction in the loop
   void execute(VPTransformState &State) override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -3741,24 +3742,6 @@ class VPRegionBlock : public VPBlockBase {
   VPRegionBlock *clone() override;
 };
 
-/// A chain of instructions that form a partial reduction.
-/// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))),
-/// accumulator)
-struct PartialReductionChain {
-  /// The top-level binary operation that forms the reduction to a scalar
-  /// after the loop body
-  Instruction *Reduction;
-  /// The extension of each of the inner binary operation's operands
-  Instruction *ExtendA;
-  Instruction *ExtendB;
-
-  Instruction *BinOp;
-
-  /// The scaling factor between the size of the reduction type and the
-  /// (possibly extended) inputs
-  unsigned ScaleFactor;
-};
-
 /// VPlan models a candidate for vectorization, encoding various decisions take
 /// to produce efficient output IR, including which branches, basic-blocks and
 /// output IR instructions to generate, and their cost. VPlan holds a
@@ -3820,11 +3803,6 @@ class VPlan {
   /// been modeled in VPlan directly.
   DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
 
-  /// Stores the set of reduction exit instructions that will be scaled to
-  /// a smaller VF in this plan via partial reductions.
-  DenseMap<const Instruction *, PartialReductionChain>
-      ScaledReductionExitInstrs;
-
 public:
   /// Construct a VPlan with original preheader \p Preheader, trip count \p TC,
   /// \p Entry to the plan and with \p ScalarHeader wrapping the original header
@@ -4044,54 +4022,6 @@ class VPlan {
   /// Clone the current VPlan, update all VPValues of the new VPlan and cloned
   /// recipes to refer to the clones, and return it.
   VPlan *duplicate();
-
-  void addScaledReductionExitInstr(PartialReductionChain Chain) {
-    ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Chain));
-  }
-
-  std::optional<PartialReductionChain>
-  getScaledReductionForInstr(const Instruction *ExitInst) {
-    auto It = ScaledReductionExitInstrs.find(ExitInst);
-    return It == ScaledReductionExitInstrs.end()
-               ? std::nullopt
-               : std::make_optional(It->second);
-  }
-
-  void removeInvalidScaledReductionExitInstrs() {
-    // A partial reduction is invalid if any of its extends are used by
-    // something that isn't another partial reduction. This is because the
-    // extends are intended to be lowered along with the reduction itself.
-
-    // Build up a set of partial reduction bin ops for efficient use checking
-    SmallSet<Instruction *, 4> PartialReductionBinOps;
-    for (auto It : ScaledReductionExitInstrs) {
-      if (It.second.BinOp)
-        PartialReductionBinOps.insert(It.second.BinOp);
-    }
-
-    auto ExtendIsOnlyUsedByPartialReductions =
-        [PartialReductionBinOps](Instruction *Extend) {
-          for (auto *Use : Extend->users()) {
-            Instruction *UseInstr = dyn_cast<Instruction>(Use);
-            if (!PartialReductionBinOps.contains(UseInstr))
-              return false;
-          }
-          return true;
-        };
-
-    // Check if each use of a chain's two extends is a partial reduction
-    // and remove those that have non-partial reduction users
-    SmallSet<Instruction *, 4> PartialReductionsToRemove;
-    for (auto It : ScaledReductionExitInstrs) {
-      PartialReductionChain Chain = It.second;
-      if (!ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) ||
-          !ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
-        PartialReductionsToRemove.insert(Chain.Reduction);
-    }
-
-    for (auto *Instr : PartialReductionsToRemove)
-      ScaledReductionExitInstrs.erase(Instr);
-  }
 };
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index e4be826ce71699..45c8bdf2bc6c88 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -773,9 +773,7 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %w, ptr %scales, ptr %
 ; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[INDVARS_IV164]], 3
 ; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX9_US]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP9]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
@@ -831,9 +829,7 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %w, ptr %scales, ptr %
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-NEXT:    [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP36:%.*]] = mul i64 [[TMP35]], 4
-; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP36]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP31]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -841,60 +837,56 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %w, ptr %scales, ptr %
 ; CHECK-NEXT:    [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP33]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP34]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP38:%.*]] = mul i64 [[TMP37]], 4
-; CHECK-NEXT:    [[N_MOD_VF15:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP38]]
+; CHECK-NEXT:    [[N_MOD_VF15:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
 ; CHECK-NEXT:    [[N_VEC16:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF15]]
-; CHECK-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], 4
-; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX12]], i32 0
-; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX13]], i32 0
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX14]], i32 0
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX12]], i32 0
+; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX13]], i32 0
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX14]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX17:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI18:%.*]] = phi <vscale x 4 x i32> [ [[TMP41]], [[VEC_EPILOG_PH]] ], [ [[TMP68:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI19:%.*]] = phi <vscale x 4 x i32> [ [[TMP42]], [[VEC_EPILOG_PH]] ], [ [[TMP63:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI20:%.*]] = phi <vscale x 4 x i32> [ [[TMP43]], [[VEC_EPILOG_PH]] ], [ [[TMP58:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI21:%.*]] = phi <vscale x 4 x i32> [ [[TMP44]], [[VEC_EPILOG_PH]] ], [ [[TMP53:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI18:%.*]] = phi <2 x i32> [ [[TMP37]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE30:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI19:%.*]] = phi <2 x i32> [ [[TMP38]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI20:%.*]] = phi <2 x i32> [ [[TMP35]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI21:%.*]] = phi <2 x i32> [ [[TMP36]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE24:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP45:%.*]] = add i64 [[INDEX17]], 0
 ; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP45]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[TMP46]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = load <vscale x 4 x i8>, ptr [[TMP47]], align 1
-; CHECK-NEXT:    [[TMP48:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD22]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i8>, ptr [[TMP47]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = sext <8 x i8> [[WIDE_LOAD22]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP45]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[TMP49]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD23:%.*]] = load <vscale x 4 x i8>, ptr [[TMP50]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD23]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP52:%.*]] = mul nsw <vscale x 4 x i32> [[TMP51]], [[TMP48]]
-; CHECK-NEXT:    [[TMP53]] = add <vscale x 4 x i32> [[TMP52]], [[VEC_PHI21]]
+; CHECK-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x i8>, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = sext <8 x i8> [[WIDE_LOAD23]] to <8 x i32>
+; CHECK-NEXT:    [[TMP44:%.*]] = mul nsw <8 x i32> [[TMP43]], [[TMP40]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE24]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI21]], <8 x i32> [[TMP44]])
 ; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP45]]
 ; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD24:%.*]] = load <vscale x 4 x i8>, ptr [[TMP55]], align 1
-; CHECK-NEXT:    [[TMP56:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD24]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP57:%.*]] = mul nsw <vscale x 4 x i32> [[TMP56]], [[TMP51]]
-; CHECK-NEXT:    [[TMP58]] = add <vscale x 4 x i32> [[TMP57]], [[VEC_PHI20]]
+; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = load <8 x i8>, ptr [[TMP55]], align 1
+; CHECK-NEXT:    [[TMP53:%.*]] = sext <8 x i8> [[WIDE_LOAD25]] to <8 x i32>
+; CHECK-NEXT:    [[TMP48:%.*]] = mul nsw <8 x i32> [[TMP53]], [[TMP43]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE26]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI20]], <8 x i32> [[TMP48]])
 ; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP45]]
 ; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[TMP59]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = load <vscale x 4 x i8>, ptr [[TMP60]], align 1
-; CHECK-NEXT:    [[TMP61:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD25]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP62:%.*]] = mul nsw <vscale x 4 x i32> [[TMP61]], [[TMP51]]
-; CHECK-NEXT:    [[TMP63]] = add <vscale x 4 x i32> [[TMP62]], [[VEC_PHI19]]
+; CHECK-NEXT:    [[WIDE_LOAD27:%.*]] = load <8 x i8>, ptr [[TMP60]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = sext <8 x i8> [[WIDE_LOAD27]] to <8 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = mul nsw <8 x i32> [[TMP51]], [[TMP43]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE28]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI19]], <8 x i32> [[TMP52]])
 ; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP45]]
 ; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i8, ptr [[TMP64]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD26:%.*]] = load <vscale x 4 x i8>, ptr [[TMP65]], align 1
-; CHECK-NEXT:    [[TMP66:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD26]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP67:%.*]] = mul nsw <vscale x 4 x i32> [[TMP66]], [[TMP51]]
-; CHECK-NEXT:    [[TMP68]] = add <vscale x 4 x i32> [[TMP67]], [[VEC_PHI18]]
-; CHECK-NEXT:    [[INDEX_NEXT27]] = add nuw i64 [[INDEX17]], [[TMP40]]
+; CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <8 x i8>, ptr [[TMP65]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = sext <8 x i8> [[WIDE_LOAD29]] to <8 x i32>
+; CHECK-NEXT:    [[TMP56:%.*]] = mul nsw <8 x i32> [[TMP57]], [[TMP43]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE30]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI18]], <8 x i32> [[TMP56]])
+; CHECK-NEXT:    [[INDEX_NEXT27]] = add nuw i64 [[INDEX17]], 8
 ; CHECK-NEXT:    [[TMP69:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC16]]
 ; CHECK-NEXT:    br i1 [[TMP69]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP70:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP68]])
-; CHECK-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP63]])
-; CHECK-NEXT:    [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP58]])
-; CHECK-NEXT:    [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP53]])
+; CHECK-NEXT:    [[TMP70:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE30]])
+; CHECK-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE28]])
+; CHECK-NEXT:    [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE26]])
+; CHECK-NEXT:    [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE24]])
 ; CHECK-NEXT:    [[CMP_N28:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC16]]
 ; CHECK-NEXT:    br i1 [[CMP_N28]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 48eaa2b280624a..d6e61e8734f9cf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -63,7 +63,7 @@ define void @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: <x1> vector loop: {
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<%add> (VF scaled by 1/4)
+; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, vp<%6> (VF scaled by 1/4)
 ; CHECK-NEXT:   vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT:   CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<[[STEPS]]>
 ; CHECK-NEXT:   vp<%4> = vector-pointer ir<%arrayidx>
@@ -74,7 +74,7 @@ define void @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT:   WIDEN ir<%2> = load vp<%5>
 ; CHECK-NEXT:   WIDEN-CAST ir<%conv3> = zext ir<%2> to i32
 ; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%conv3>, ir<%conv>
-; CHECK-NEXT:   PARTIAL-REDUCE ir<%add> = add ir<%mul>, ir<[[ACC]]>
+; CHECK-NEXT:   PARTIAL-REDUCE vp<%6> = add ir<%mul>, ir<[[ACC]]>
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -82,14 +82,14 @@ define void @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<%8> = compute-reduction-result ir<[[ACC]]>, ir<%add>
-; CHECK-NEXT:   EMIT vp<%9> = extract-from-end vp<%8>, ir<1>
-; CHECK-NEXT:   EMIT vp<%10> = icmp eq ir<0>, vp<%1>
-; CHECK-NEXT:   EMIT branch-on-cond vp<%10>
+; CHECK-NEXT:   EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, vp<%6>
+; CHECK-NEXT:   EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<%1>
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
-; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%9>)
+; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]>)
 ; CHECK-NEXT:   IR   %0 = lshr i32 %add.lcssa, 0
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:

>From 4afbd7363008176e64b0263e83040edb21973c22 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 30 Oct 2024 10:20:01 +0000
Subject: [PATCH 13/56] Remove redundant comment

---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f610d232c44a31..8d51a36d518ac0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -319,7 +319,6 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
       RetTy, Intrinsic::experimental_vector_partial_reduce_add,
       {PhiVal, BinOpVal}, nullptr, Twine("partial.reduce"));
 
-  // Use this vector value for all users of the original instruction.
   State.set(this, V);
   State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
 }

>From 09af7190a8f4d986d55a9497f407acafa5878e31 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 30 Oct 2024 13:51:49 +0000
Subject: [PATCH 14/56] Improve IR names in test

---
 .../AArch64/partial-reduce-dot-product.ll     | 494 +++++++++---------
 1 file changed, 247 insertions(+), 247 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 45c8bdf2bc6c88..102e0699ab831d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -232,14 +232,14 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2
-; CHECK-INTERLEAVE1-NEXT:    [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2
-; CHECK-INTERLEAVE1-NEXT:    [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2
-; CHECK-INTERLEAVE1-NEXT:    [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2
-; CHECK-INTERLEAVE1-NEXT:    [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2
-; CHECK-INTERLEAVE1-NEXT:    [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2
-; CHECK-INTERLEAVE1-NEXT:    [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2
-; CHECK-INTERLEAVE1-NEXT:    [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2
-; CHECK-INTERLEAVE1-NEXT:    [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP41:%.*]] = load i16, ptr [[TMP23]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP42:%.*]] = load i16, ptr [[TMP24]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP43:%.*]] = load i16, ptr [[TMP25]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP44:%.*]] = load i16, ptr [[TMP26]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP45:%.*]] = load i16, ptr [[TMP27]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP46:%.*]] = load i16, ptr [[TMP28]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP55:%.*]] = load i16, ptr [[TMP29]], align 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP56:%.*]] = load i16, ptr [[TMP30]], align 2
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2
@@ -248,14 +248,14 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3
-; CHECK-INTERLEAVE1-NEXT:    [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4
-; CHECK-INTERLEAVE1-NEXT:    [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5
-; CHECK-INTERLEAVE1-NEXT:    [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6
-; CHECK-INTERLEAVE1-NEXT:    [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7
-; CHECK-INTERLEAVE1-NEXT:    [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8
-; CHECK-INTERLEAVE1-NEXT:    [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9
-; CHECK-INTERLEAVE1-NEXT:    [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10
-; CHECK-INTERLEAVE1-NEXT:    [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11
+; CHECK-INTERLEAVE1-NEXT:    [[TMP57:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP41]], i32 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 5
+; CHECK-INTERLEAVE1-NEXT:    [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 6
+; CHECK-INTERLEAVE1-NEXT:    [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 7
+; CHECK-INTERLEAVE1-NEXT:    [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP96:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 9
+; CHECK-INTERLEAVE1-NEXT:    [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP55]], i32 10
+; CHECK-INTERLEAVE1-NEXT:    [[TMP62:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP56]], i32 11
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14
@@ -424,8 +424,8 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP141:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP142:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP139:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP140:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP69:%.*]] = load i16, ptr [[TMP39]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP70:%.*]] = load i16, ptr [[TMP40]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP71:%.*]] = load i16, ptr [[TMP41]], align 2
@@ -472,8 +472,8 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP112:%.*]] = load i16, ptr [[TMP66]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP113:%.*]] = load i16, ptr [[TMP67]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP114:%.*]] = load i16, ptr [[TMP68]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP115:%.*]] = load i16, ptr [[TMP141]], align 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP116:%.*]] = load i16, ptr [[TMP142]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP115:%.*]] = load i16, ptr [[TMP139]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP116:%.*]] = load i16, ptr [[TMP140]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
@@ -497,11 +497,11 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-INTERLEAVED-NEXT:    [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP140:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ;
 entry:
@@ -742,37 +742,37 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
 }
 
-define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %w, ptr %scales, ptr %u, ptr %v) #0 {
+define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b, ptr %c, ptr %d) #0 {
 ; CHECK-LABEL: define void @dotp_unrolled(
-; CHECK-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[W:%.*]], ptr [[SCALES:%.*]], ptr [[U:%.*]], ptr [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP154:%.*]] = icmp sgt i32 [[NUM_OUT]], 3
 ; CHECK-NEXT:    br i1 [[CMP154]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END98:%.*]]
 ; CHECK:       for.body.lr.ph:
 ; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[NUM_OUT]], 4
 ; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[DIV]], 2
-; CHECK-NEXT:    [[CMP11145:%.*]] = icmp sgt i32 [[NUM_IN]], 0
-; CHECK-NEXT:    [[IDXPROM44:%.*]] = sext i32 [[NUM_IN]] to i64
+; CHECK-NEXT:    [[CMP111LOAD_B:%.*]] = icmp sgt i32 [[NUM_IN]], 0
+; CHECK-NEXT:    [[IDXPROMLOAD_PROM_A0:%.*]] = sext i32 [[NUM_IN]] to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
-; CHECK-NEXT:    br i1 [[CMP11145]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK-NEXT:    br i1 [[CMP111LOAD_B]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    br label [[FOR_END98]]
 ; CHECK:       for.body.us.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
 ; CHECK-NEXT:    br label [[ITER_CHECK:%.*]]
 ; CHECK:       iter.check:
-; CHECK-NEXT:    [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT165:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
-; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[INDVARS_IV164]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECK-NEXT:    [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[IV_NEXT2:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
+; CHECK-NEXT:    [[GEP_A0:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[INDVARS_IV164]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[GEP_A0]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[INDVARS_IV164]], 1
-; CHECK-NEXT:    [[ARRAYIDX3_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3_US]], align 8
+; CHECK-NEXT:    [[GEP_A1:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[GEP_A1]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[INDVARS_IV164]], 2
-; CHECK-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX6_US]], align 8
+; CHECK-NEXT:    [[GEP_A2:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[GEP_A2]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[INDVARS_IV164]], 3
-; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX9_US]], align 8
+; CHECK-NEXT:    [[GEP_A3:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[GEP_A3]], align 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
@@ -793,7 +793,7 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %w, ptr %scales, ptr %
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
@@ -850,34 +850,34 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %w, ptr %scales, ptr %
 ; CHECK-NEXT:    [[VEC_PHI19:%.*]] = phi <2 x i32> [ [[TMP38]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI20:%.*]] = phi <2 x i32> [ [[TMP35]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI21:%.*]] = phi <2 x i32> [ [[TMP36]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE24:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP45:%.*]] = add i64 [[INDEX17]], 0
-; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP45]]
-; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[TMP46]], i32 0
+; CHECK-NEXT:    [[TMP39:%.*]] = add i64 [[INDEX17]], 0
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP39]]
+; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[TMP40]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i8>, ptr [[TMP47]], align 1
-; CHECK-NEXT:    [[TMP40:%.*]] = sext <8 x i8> [[WIDE_LOAD22]] to <8 x i32>
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP45]]
+; CHECK-NEXT:    [[TMP41:%.*]] = sext <8 x i8> [[WIDE_LOAD22]] to <8 x i32>
+; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP39]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[TMP49]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x i8>, ptr [[TMP50]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = sext <8 x i8> [[WIDE_LOAD23]] to <8 x i32>
-; CHECK-NEXT:    [[TMP44:%.*]] = mul nsw <8 x i32> [[TMP43]], [[TMP40]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE24]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI21]], <8 x i32> [[TMP44]])
-; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP45]]
+; CHECK-NEXT:    [[TMP42:%.*]] = sext <8 x i8> [[WIDE_LOAD23]] to <8 x i32>
+; CHECK-NEXT:    [[TMP43:%.*]] = mul nsw <8 x i32> [[TMP42]], [[TMP41]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE24]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI21]], <8 x i32> [[TMP43]])
+; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP39]]
 ; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = load <8 x i8>, ptr [[TMP55]], align 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <8 x i8> [[WIDE_LOAD25]] to <8 x i32>
-; CHECK-NEXT:    [[TMP48:%.*]] = mul nsw <8 x i32> [[TMP53]], [[TMP43]]
+; CHECK-NEXT:    [[TMP48:%.*]] = mul nsw <8 x i32> [[TMP53]], [[TMP42]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE26]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI20]], <8 x i32> [[TMP48]])
-; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP45]]
+; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP39]]
 ; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[TMP59]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD27:%.*]] = load <8 x i8>, ptr [[TMP60]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = sext <8 x i8> [[WIDE_LOAD27]] to <8 x i32>
-; CHECK-NEXT:    [[TMP52:%.*]] = mul nsw <8 x i32> [[TMP51]], [[TMP43]]
+; CHECK-NEXT:    [[TMP52:%.*]] = mul nsw <8 x i32> [[TMP51]], [[TMP42]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE28]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI19]], <8 x i32> [[TMP52]])
-; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP45]]
+; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP39]]
 ; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i8, ptr [[TMP64]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <8 x i8>, ptr [[TMP65]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = sext <8 x i8> [[WIDE_LOAD29]] to <8 x i32>
-; CHECK-NEXT:    [[TMP56:%.*]] = mul nsw <8 x i32> [[TMP57]], [[TMP43]]
+; CHECK-NEXT:    [[TMP56:%.*]] = mul nsw <8 x i32> [[TMP57]], [[TMP42]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE30]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI18]], <8 x i32> [[TMP56]])
 ; CHECK-NEXT:    [[INDEX_NEXT27]] = add nuw i64 [[INDEX17]], 8
 ; CHECK-NEXT:    [[TMP69:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC16]]
@@ -897,88 +897,88 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %w, ptr %scales, ptr %
 ; CHECK-NEXT:    [[BC_MERGE_RDX32:%.*]] = phi i32 [ [[TMP73]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP34]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY12_US:%.*]]
 ; CHECK:       for.body12.us:
-; CHECK-NEXT:    [[INDVARS_IV161:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT162:%.*]], [[FOR_BODY12_US]] ]
-; CHECK-NEXT:    [[TOTAL3_0149_US:%.*]] = phi i32 [ [[BC_MERGE_RDX29]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD43_US:%.*]], [[FOR_BODY12_US]] ]
-; CHECK-NEXT:    [[TOTAL2_0148_US:%.*]] = phi i32 [ [[BC_MERGE_RDX30]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD35_US:%.*]], [[FOR_BODY12_US]] ]
-; CHECK-NEXT:    [[TOTAL1_0147_US:%.*]] = phi i32 [ [[BC_MERGE_RDX31]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD27_US:%.*]], [[FOR_BODY12_US]] ]
-; CHECK-NEXT:    [[TOTAL0_0146_US:%.*]] = phi i32 [ [[BC_MERGE_RDX32]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD19_US:%.*]], [[FOR_BODY12_US]] ]
-; CHECK-NEXT:    [[ARRAYIDX14_US:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDVARS_IV161]]
-; CHECK-NEXT:    [[TMP74:%.*]] = load i8, ptr [[ARRAYIDX14_US]], align 1
-; CHECK-NEXT:    [[CONV_US:%.*]] = sext i8 [[TMP74]] to i32
-; CHECK-NEXT:    [[ARRAYIDX16_US:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[INDVARS_IV161]]
-; CHECK-NEXT:    [[TMP75:%.*]] = load i8, ptr [[ARRAYIDX16_US]], align 1
-; CHECK-NEXT:    [[CONV17_US:%.*]] = sext i8 [[TMP75]] to i32
-; CHECK-NEXT:    [[MUL18_US:%.*]] = mul nsw i32 [[CONV17_US]], [[CONV_US]]
-; CHECK-NEXT:    [[ADD19_US]] = add nsw i32 [[MUL18_US]], [[TOTAL0_0146_US]]
-; CHECK-NEXT:    [[ARRAYIDX21_US:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[INDVARS_IV161]]
-; CHECK-NEXT:    [[TMP76:%.*]] = load i8, ptr [[ARRAYIDX21_US]], align 1
-; CHECK-NEXT:    [[CONV22_US:%.*]] = sext i8 [[TMP76]] to i32
-; CHECK-NEXT:    [[MUL26_US:%.*]] = mul nsw i32 [[CONV22_US]], [[CONV17_US]]
-; CHECK-NEXT:    [[ADD27_US]] = add nsw i32 [[MUL26_US]], [[TOTAL1_0147_US]]
-; CHECK-NEXT:    [[ARRAYIDX29_US:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[INDVARS_IV161]]
-; CHECK-NEXT:    [[TMP77:%.*]] = load i8, ptr [[ARRAYIDX29_US]], align 1
-; CHECK-NEXT:    [[CONV30_US:%.*]] = sext i8 [[TMP77]] to i32
-; CHECK-NEXT:    [[MUL34_US:%.*]] = mul nsw i32 [[CONV30_US]], [[CONV17_US]]
-; CHECK-NEXT:    [[ADD35_US]] = add nsw i32 [[MUL34_US]], [[TOTAL2_0148_US]]
-; CHECK-NEXT:    [[ARRAYIDX37_US:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[INDVARS_IV161]]
-; CHECK-NEXT:    [[TMP78:%.*]] = load i8, ptr [[ARRAYIDX37_US]], align 1
-; CHECK-NEXT:    [[CONV38_US:%.*]] = sext i8 [[TMP78]] to i32
-; CHECK-NEXT:    [[MUL42_US:%.*]] = mul nsw i32 [[CONV38_US]], [[CONV17_US]]
-; CHECK-NEXT:    [[ADD43_US]] = add nsw i32 [[MUL42_US]], [[TOTAL3_0149_US]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT162]] = add nuw nsw i64 [[INDVARS_IV161]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT162]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY12_US]] ]
+; CHECK-NEXT:    [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX29]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY12_US]] ]
+; CHECK-NEXT:    [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX30]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY12_US]] ]
+; CHECK-NEXT:    [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX31]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY12_US]] ]
+; CHECK-NEXT:    [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX32]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY12_US]] ]
+; CHECK-NEXT:    [[GEP_IDX_A0:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD_A0:%.*]] = load i8, ptr [[GEP_IDX_A0]], align 1
+; CHECK-NEXT:    [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32
+; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD_C:%.*]] = load i8, ptr [[GEP_C]], align 1
+; CHECK-NEXT:    [[EXT_C:%.*]] = sext i8 [[LOAD_C]] to i32
+; CHECK-NEXT:    [[MUL_A0:%.*]] = mul nsw i32 [[EXT_C]], [[EXT_A0]]
+; CHECK-NEXT:    [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]]
+; CHECK-NEXT:    [[GEP_IDX_A1:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD_A1:%.*]] = load i8, ptr [[GEP_IDX_A1]], align 1
+; CHECK-NEXT:    [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32
+; CHECK-NEXT:    [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_C]]
+; CHECK-NEXT:    [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]]
+; CHECK-NEXT:    [[GEP_IDX_A2:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD_A2:%.*]] = load i8, ptr [[GEP_IDX_A2]], align 1
+; CHECK-NEXT:    [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32
+; CHECK-NEXT:    [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_C]]
+; CHECK-NEXT:    [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]]
+; CHECK-NEXT:    [[GEP_IDX_A3:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD_A3:%.*]] = load i8, ptr [[GEP_IDX_A3]], align 1
+; CHECK-NEXT:    [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32
+; CHECK-NEXT:    [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_C]]
+; CHECK-NEXT:    [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[FOR_BODY12_US]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       for.cond10.for.cond.cleanup_crit_edge.us:
-; CHECK-NEXT:    [[ADD19_US_LCSSA:%.*]] = phi i32 [ [[ADD19_US]], [[FOR_BODY12_US]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ], [ [[TMP73]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[ADD27_US_LCSSA:%.*]] = phi i32 [ [[ADD27_US]], [[FOR_BODY12_US]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ [[TMP72]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[ADD35_US_LCSSA:%.*]] = phi i32 [ [[ADD35_US]], [[FOR_BODY12_US]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ], [ [[TMP71]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[ADD43_US_LCSSA:%.*]] = phi i32 [ [[ADD43_US]], [[FOR_BODY12_US]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ], [ [[TMP70]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[ARRAYIDX45_US:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[IDXPROM44]]
-; CHECK-NEXT:    [[TMP79:%.*]] = load i8, ptr [[ARRAYIDX45_US]], align 1
-; CHECK-NEXT:    [[CONV46_US:%.*]] = sext i8 [[TMP79]] to i32
-; CHECK-NEXT:    [[MUL47_US:%.*]] = mul nsw i32 [[CONV46_US]], 127
-; CHECK-NEXT:    [[ADD48_US:%.*]] = add nsw i32 [[MUL47_US]], [[ADD19_US_LCSSA]]
-; CHECK-NEXT:    [[CONV49_US:%.*]] = sitofp i32 [[ADD48_US]] to float
-; CHECK-NEXT:    [[ARRAYIDX52_US:%.*]] = getelementptr inbounds float, ptr [[SCALES]], i64 [[INDVARS_IV164]]
-; CHECK-NEXT:    [[TMP80:%.*]] = load float, ptr [[ARRAYIDX52_US]], align 4
-; CHECK-NEXT:    [[MUL53_US:%.*]] = fmul float [[TMP80]], [[CONV49_US]]
-; CHECK-NEXT:    [[ARRAYIDX56_US:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[INDVARS_IV164]]
-; CHECK-NEXT:    store float [[MUL53_US]], ptr [[ARRAYIDX56_US]], align 4
-; CHECK-NEXT:    [[ARRAYIDX58_US:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[IDXPROM44]]
-; CHECK-NEXT:    [[TMP81:%.*]] = load i8, ptr [[ARRAYIDX58_US]], align 1
-; CHECK-NEXT:    [[CONV59_US:%.*]] = sext i8 [[TMP81]] to i32
-; CHECK-NEXT:    [[MUL60_US:%.*]] = mul nsw i32 [[CONV59_US]], 127
-; CHECK-NEXT:    [[ADD61_US:%.*]] = add nsw i32 [[MUL60_US]], [[ADD27_US_LCSSA]]
-; CHECK-NEXT:    [[CONV62_US:%.*]] = sitofp i32 [[ADD61_US]] to float
-; CHECK-NEXT:    [[ARRAYIDX65_US:%.*]] = getelementptr inbounds float, ptr [[SCALES]], i64 [[TMP2]]
+; CHECK-NEXT:    [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY12_US]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ], [ [[TMP73]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY12_US]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ [[TMP72]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY12_US]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ], [ [[TMP71]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY12_US]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ], [ [[TMP70]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[GEP_IDXPROM_A0:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[IDXPROMLOAD_PROM_A0]]
+; CHECK-NEXT:    [[LOAD_PROM_A0:%.*]] = load i8, ptr [[GEP_IDXPROM_A0]], align 1
+; CHECK-NEXT:    [[EXT_PROM_A0:%.*]] = sext i8 [[LOAD_PROM_A0]] to i32
+; CHECK-NEXT:    [[MUL_PROM_A0:%.*]] = mul nsw i32 [[EXT_PROM_A0]], 127
+; CHECK-NEXT:    [[ADD_PROM_A0:%.*]] = add nsw i32 [[MUL_PROM_A0]], [[ADD_A0_LCSSA]]
+; CHECK-NEXT:    [[FPEXT_PROM_A0:%.*]] = sitofp i32 [[ADD_PROM_A0]] to float
+; CHECK-NEXT:    [[ARRAYIDX65_US:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV164]]
 ; CHECK-NEXT:    [[TMP82:%.*]] = load float, ptr [[ARRAYIDX65_US]], align 4
-; CHECK-NEXT:    [[MUL66_US:%.*]] = fmul float [[TMP82]], [[CONV62_US]]
-; CHECK-NEXT:    [[ARRAYIDX69_US:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP2]]
-; CHECK-NEXT:    store float [[MUL66_US]], ptr [[ARRAYIDX69_US]], align 4
-; CHECK-NEXT:    [[ARRAYIDX71_US:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[IDXPROM44]]
-; CHECK-NEXT:    [[TMP83:%.*]] = load i8, ptr [[ARRAYIDX71_US]], align 1
-; CHECK-NEXT:    [[CONV72_US:%.*]] = sext i8 [[TMP83]] to i32
-; CHECK-NEXT:    [[MUL73_US:%.*]] = mul nsw i32 [[CONV72_US]], 127
-; CHECK-NEXT:    [[ADD74_US:%.*]] = add nsw i32 [[MUL73_US]], [[ADD35_US_LCSSA]]
-; CHECK-NEXT:    [[CONV75_US:%.*]] = sitofp i32 [[ADD74_US]] to float
-; CHECK-NEXT:    [[ARRAYIDX78_US:%.*]] = getelementptr inbounds float, ptr [[SCALES]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP84:%.*]] = load float, ptr [[ARRAYIDX78_US]], align 4
-; CHECK-NEXT:    [[MUL79_US:%.*]] = fmul float [[TMP84]], [[CONV75_US]]
-; CHECK-NEXT:    [[ARRAYIDX82_US:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP4]]
-; CHECK-NEXT:    store float [[MUL79_US]], ptr [[ARRAYIDX82_US]], align 4
-; CHECK-NEXT:    [[ARRAYIDX84_US:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[IDXPROM44]]
-; CHECK-NEXT:    [[TMP85:%.*]] = load i8, ptr [[ARRAYIDX84_US]], align 1
-; CHECK-NEXT:    [[CONV85_US:%.*]] = sext i8 [[TMP85]] to i32
-; CHECK-NEXT:    [[MUL86_US:%.*]] = mul nsw i32 [[CONV85_US]], 127
-; CHECK-NEXT:    [[ADD87_US:%.*]] = add nsw i32 [[MUL86_US]], [[ADD43_US_LCSSA]]
-; CHECK-NEXT:    [[CONV88_US:%.*]] = sitofp i32 [[ADD87_US]] to float
-; CHECK-NEXT:    [[ARRAYIDX91_US:%.*]] = getelementptr inbounds float, ptr [[SCALES]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP86:%.*]] = load float, ptr [[ARRAYIDX91_US]], align 4
-; CHECK-NEXT:    [[MUL92_US:%.*]] = fmul float [[TMP86]], [[CONV88_US]]
-; CHECK-NEXT:    [[ARRAYIDX95_US:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP6]]
-; CHECK-NEXT:    store float [[MUL92_US]], ptr [[ARRAYIDX95_US]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT165]] = add nuw nsw i64 [[INDVARS_IV164]], 4
-; CHECK-NEXT:    [[CMP_US:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT165]], [[TMP0]]
+; CHECK-NEXT:    [[FMUL_B:%.*]] = fmul float [[TMP82]], [[FPEXT_PROM_A0]]
+; CHECK-NEXT:    [[GEP_D:%.*]] = getelementptr inbounds float, ptr [[D]], i64 [[INDVARS_IV164]]
+; CHECK-NEXT:    store float [[FMUL_B]], ptr [[GEP_D]], align 4
+; CHECK-NEXT:    [[GEP_IDXPROM_A1:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[IDXPROMLOAD_PROM_A0]]
+; CHECK-NEXT:    [[LOAD_PROM_A1:%.*]] = load i8, ptr [[GEP_IDXPROM_A1]], align 1
+; CHECK-NEXT:    [[EXT_PROM_A1:%.*]] = sext i8 [[LOAD_PROM_A1]] to i32
+; CHECK-NEXT:    [[MUL_PROM_A1:%.*]] = mul nsw i32 [[EXT_PROM_A1]], 127
+; CHECK-NEXT:    [[ADD_PROM_A1:%.*]] = add nsw i32 [[MUL_PROM_A1]], [[ADD_A1_LCSSA]]
+; CHECK-NEXT:    [[FPEXT_PROM_A1:%.*]] = sitofp i32 [[ADD_PROM_A1]] to float
+; CHECK-NEXT:    [[GEP_B1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[LOAD_B1:%.*]] = load float, ptr [[GEP_B1]], align 4
+; CHECK-NEXT:    [[FMUL_B1:%.*]] = fmul float [[LOAD_B1]], [[FPEXT_PROM_A1]]
+; CHECK-NEXT:    [[GEP_D1:%.*]] = getelementptr inbounds float, ptr [[D]], i64 [[TMP2]]
+; CHECK-NEXT:    store float [[FMUL_B1]], ptr [[GEP_D1]], align 4
+; CHECK-NEXT:    [[GEP_IDXPROM_A2:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[IDXPROMLOAD_PROM_A0]]
+; CHECK-NEXT:    [[LOAD_PROM_A2:%.*]] = load i8, ptr [[GEP_IDXPROM_A2]], align 1
+; CHECK-NEXT:    [[EXT_PROM_A2:%.*]] = sext i8 [[LOAD_PROM_A2]] to i32
+; CHECK-NEXT:    [[MUL_PROM_A2:%.*]] = mul nsw i32 [[EXT_PROM_A2]], 127
+; CHECK-NEXT:    [[ADD_PROM_A2:%.*]] = add nsw i32 [[MUL_PROM_A2]], [[ADD_A2_LCSSA]]
+; CHECK-NEXT:    [[FPEXT_PROM_A2:%.*]] = sitofp i32 [[ADD_PROM_A2]] to float
+; CHECK-NEXT:    [[GEP_B2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
+; CHECK-NEXT:    [[LOAD_B2:%.*]] = load float, ptr [[GEP_B2]], align 4
+; CHECK-NEXT:    [[FMUL_B2:%.*]] = fmul float [[LOAD_B2]], [[FPEXT_PROM_A2]]
+; CHECK-NEXT:    [[GEP_D2:%.*]] = getelementptr inbounds float, ptr [[D]], i64 [[TMP4]]
+; CHECK-NEXT:    store float [[FMUL_B2]], ptr [[GEP_D2]], align 4
+; CHECK-NEXT:    [[GEP_IDXPROM_A3:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[IDXPROMLOAD_PROM_A0]]
+; CHECK-NEXT:    [[LOAD_PROM_A3:%.*]] = load i8, ptr [[GEP_IDXPROM_A3]], align 1
+; CHECK-NEXT:    [[EXT_PROM_A3:%.*]] = sext i8 [[LOAD_PROM_A3]] to i32
+; CHECK-NEXT:    [[MUL_PROM_A3:%.*]] = mul nsw i32 [[EXT_PROM_A3]], 127
+; CHECK-NEXT:    [[ADD_PROM_A3:%.*]] = add nsw i32 [[MUL_PROM_A3]], [[ADD_A3_LCSSA]]
+; CHECK-NEXT:    [[FPEXT_PROM_A3:%.*]] = sitofp i32 [[ADD_PROM_A3]] to float
+; CHECK-NEXT:    [[GEP_B3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT:    [[LOAD_B3:%.*]] = load float, ptr [[GEP_B3]], align 4
+; CHECK-NEXT:    [[MUL_B3:%.*]] = fmul float [[LOAD_B3]], [[FPEXT_PROM_A3]]
+; CHECK-NEXT:    [[GEP_D3:%.*]] = getelementptr inbounds float, ptr [[D]], i64 [[TMP6]]
+; CHECK-NEXT:    store float [[MUL_B3]], ptr [[GEP_D3]], align 4
+; CHECK-NEXT:    [[IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV164]], 4
+; CHECK-NEXT:    [[CMP_US:%.*]] = icmp ult i64 [[IV_NEXT2]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_US]], label [[ITER_CHECK]], label [[FOR_END98_LOOPEXIT:%.*]]
 ; CHECK:       for.end98.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END98]]
@@ -992,10 +992,10 @@ entry:
 for.body.lr.ph:                                   ; preds = %entry
   %div = sdiv i32 %num_out, 4
   %mul = shl nsw i32 %div, 2
-  %cmp11145 = icmp sgt i32 %num_in, 0
-  %idxprom44 = sext i32 %num_in to i64
-  %0 = zext nneg i32 %mul to i64
-  br i1 %cmp11145, label %for.body.us.preheader, label %for.body.preheader
+  %cmp.num_in = icmp sgt i32 %num_in, 0
+  %ext.num_in = sext i32 %num_in to i64
+  %max_iv = zext nneg i32 %mul to i64
+  br i1 %cmp.num_in, label %for.body.us.preheader, label %for.body.preheader
 
 for.body.preheader:                               ; preds = %for.body.lr.ph
   br label %for.end98
@@ -1005,100 +1005,100 @@ for.body.us.preheader:                            ; preds = %for.body.lr.ph
   br label %for.body.us
 
 for.body.us:                                      ; preds = %for.body.us.preheader, %for.cond10.for.cond.cleanup_crit_edge.us
-  %indvars.iv164 = phi i64 [ 0, %for.body.us.preheader ], [ %indvars.iv.next165, %for.cond10.for.cond.cleanup_crit_edge.us ]
-  %arrayidx.us = getelementptr inbounds ptr, ptr %w, i64 %indvars.iv164
-  %1 = load ptr, ptr %arrayidx.us, align 8
-  %2 = or disjoint i64 %indvars.iv164, 1
-  %arrayidx3.us = getelementptr inbounds ptr, ptr %w, i64 %2
-  %3 = load ptr, ptr %arrayidx3.us, align 8
-  %4 = or disjoint i64 %indvars.iv164, 2
-  %arrayidx6.us = getelementptr inbounds ptr, ptr %w, i64 %4
-  %5 = load ptr, ptr %arrayidx6.us, align 8
-  %6 = or disjoint i64 %indvars.iv164, 3
-  %arrayidx9.us = getelementptr inbounds ptr, ptr %w, i64 %6
-  %7 = load ptr, ptr %arrayidx9.us, align 8
+  %iv2 = phi i64 [ 0, %for.body.us.preheader ], [ %iv.next2, %for.cond10.for.cond.cleanup_crit_edge.us ]
+  %gep.a0 = getelementptr inbounds ptr, ptr %a, i64 %iv2
+  %ptr.a = load ptr, ptr %gep.a0, align 8
+  %offset.1 = or disjoint i64 %iv2, 1
+  %gep.a1 = getelementptr inbounds ptr, ptr %a, i64 %offset.1
+  %ptr.a1 = load ptr, ptr %gep.a1, align 8
+  %offset.2 = or disjoint i64 %iv2, 2
+  %gep.a2 = getelementptr inbounds ptr, ptr %a, i64 %offset.2
+  %ptr.a2 = load ptr, ptr %gep.a2, align 8
+  %offset.3 = or disjoint i64 %iv2, 3
+  %gep.a3 = getelementptr inbounds ptr, ptr %a, i64 %offset.3
+  %ptr.a3 = load ptr, ptr %gep.a3, align 8
   br label %for.body12.us
 
 for.body12.us:                                    ; preds = %for.body.us, %for.body12.us
-  %indvars.iv161 = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next162, %for.body12.us ]
-  %total3.0149.us = phi i32 [ 0, %for.body.us ], [ %add43.us, %for.body12.us ]
-  %total2.0148.us = phi i32 [ 0, %for.body.us ], [ %add35.us, %for.body12.us ]
-  %total1.0147.us = phi i32 [ 0, %for.body.us ], [ %add27.us, %for.body12.us ]
-  %total0.0146.us = phi i32 [ 0, %for.body.us ], [ %add19.us, %for.body12.us ]
-  %arrayidx14.us = getelementptr inbounds i8, ptr %1, i64 %indvars.iv161
-  %39 = load i8, ptr %arrayidx14.us, align 1
-  %conv.us = sext i8 %39 to i32
-  %arrayidx16.us = getelementptr inbounds i8, ptr %u, i64 %indvars.iv161
-  %40 = load i8, ptr %arrayidx16.us, align 1
-  %conv17.us = sext i8 %40 to i32
-  %mul18.us = mul nsw i32 %conv17.us, %conv.us
-  %add19.us = add nsw i32 %mul18.us, %total0.0146.us
-  %arrayidx21.us = getelementptr inbounds i8, ptr %3, i64 %indvars.iv161
-  %41 = load i8, ptr %arrayidx21.us, align 1
-  %conv22.us = sext i8 %41 to i32
-  %mul26.us = mul nsw i32 %conv22.us, %conv17.us
-  %add27.us = add nsw i32 %mul26.us, %total1.0147.us
-  %arrayidx29.us = getelementptr inbounds i8, ptr %5, i64 %indvars.iv161
-  %42 = load i8, ptr %arrayidx29.us, align 1
-  %conv30.us = sext i8 %42 to i32
-  %mul34.us = mul nsw i32 %conv30.us, %conv17.us
-  %add35.us = add nsw i32 %mul34.us, %total2.0148.us
-  %arrayidx37.us = getelementptr inbounds i8, ptr %7, i64 %indvars.iv161
-  %43 = load i8, ptr %arrayidx37.us, align 1
-  %conv38.us = sext i8 %43 to i32
-  %mul42.us = mul nsw i32 %conv38.us, %conv17.us
-  %add43.us = add nsw i32 %mul42.us, %total3.0149.us
-  %indvars.iv.next162 = add nuw nsw i64 %indvars.iv161, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next162, %wide.trip.count
+  %iv = phi i64 [ 0, %for.body.us ], [ %iv.next, %for.body12.us ]
+  %accum3 = phi i32 [ 0, %for.body.us ], [ %add.a3, %for.body12.us ]
+  %accum2 = phi i32 [ 0, %for.body.us ], [ %add.a2, %for.body12.us ]
+  %accum1 = phi i32 [ 0, %for.body.us ], [ %add.a1, %for.body12.us ]
+  %accum0 = phi i32 [ 0, %for.body.us ], [ %add.a0, %for.body12.us ]
+  %gep.idx.a0 = getelementptr inbounds i8, ptr %ptr.a, i64 %iv
+  %load.a0 = load i8, ptr %gep.idx.a0, align 1
+  %ext.a0 = sext i8 %load.a0 to i32
+  %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv
+  %load.c = load i8, ptr %gep.c, align 1
+  %ext.c = sext i8 %load.c to i32
+  %mul.a0 = mul nsw i32 %ext.c, %ext.a0
+  %add.a0 = add nsw i32 %mul.a0, %accum0
+  %gep.idx.a1 = getelementptr inbounds i8, ptr %ptr.a1, i64 %iv
+  %load.a1 = load i8, ptr %gep.idx.a1, align 1
+  %ext.a1 = sext i8 %load.a1 to i32
+  %mul.a1 = mul nsw i32 %ext.a1, %ext.c
+  %add.a1 = add nsw i32 %mul.a1, %accum1
+  %gep.idx.a2 = getelementptr inbounds i8, ptr %ptr.a2, i64 %iv
+  %load.a2 = load i8, ptr %gep.idx.a2, align 1
+  %ext.a2 = sext i8 %load.a2 to i32
+  %mul.a2 = mul nsw i32 %ext.a2, %ext.c
+  %add.a2 = add nsw i32 %mul.a2, %accum2
+  %gep.idx.a3 = getelementptr inbounds i8, ptr %ptr.a3, i64 %iv
+  %load.a3 = load i8, ptr %gep.idx.a3, align 1
+  %ext.a3 = sext i8 %load.a3 to i32
+  %mul.a3 = mul nsw i32 %ext.a3, %ext.c
+  %add.a3 = add nsw i32 %mul.a3, %accum3
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.cond10.for.cond.cleanup_crit_edge.us, label %for.body12.us
 
 for.cond10.for.cond.cleanup_crit_edge.us:         ; preds = %for.body12.us
-  %arrayidx45.us = getelementptr inbounds i8, ptr %1, i64 %idxprom44
-  %44 = load i8, ptr %arrayidx45.us, align 1
-  %conv46.us = sext i8 %44 to i32
-  %mul47.us = mul nsw i32 %conv46.us, 127
-  %add48.us = add nsw i32 %mul47.us, %add19.us
-  %conv49.us = sitofp i32 %add48.us to float
-  %arrayidx52.us = getelementptr inbounds float, ptr %scales, i64 %indvars.iv164
-  %45 = load float, ptr %arrayidx52.us, align 4
-  %mul53.us = fmul float %45, %conv49.us
-  %arrayidx56.us = getelementptr inbounds float, ptr %v, i64 %indvars.iv164
-  store float %mul53.us, ptr %arrayidx56.us, align 4
-  %arrayidx58.us = getelementptr inbounds i8, ptr %3, i64 %idxprom44
-  %46 = load i8, ptr %arrayidx58.us, align 1
-  %conv59.us = sext i8 %46 to i32
-  %mul60.us = mul nsw i32 %conv59.us, 127
-  %add61.us = add nsw i32 %mul60.us, %add27.us
-  %conv62.us = sitofp i32 %add61.us to float
-  %arrayidx65.us = getelementptr inbounds float, ptr %scales, i64 %2
-  %47 = load float, ptr %arrayidx65.us, align 4
-  %mul66.us = fmul float %47, %conv62.us
-  %arrayidx69.us = getelementptr inbounds float, ptr %v, i64 %2
-  store float %mul66.us, ptr %arrayidx69.us, align 4
-  %arrayidx71.us = getelementptr inbounds i8, ptr %5, i64 %idxprom44
-  %48 = load i8, ptr %arrayidx71.us, align 1
-  %conv72.us = sext i8 %48 to i32
-  %mul73.us = mul nsw i32 %conv72.us, 127
-  %add74.us = add nsw i32 %mul73.us, %add35.us
-  %conv75.us = sitofp i32 %add74.us to float
-  %arrayidx78.us = getelementptr inbounds float, ptr %scales, i64 %4
-  %49 = load float, ptr %arrayidx78.us, align 4
-  %mul79.us = fmul float %49, %conv75.us
-  %arrayidx82.us = getelementptr inbounds float, ptr %v, i64 %4
-  store float %mul79.us, ptr %arrayidx82.us, align 4
-  %arrayidx84.us = getelementptr inbounds i8, ptr %7, i64 %idxprom44
-  %50 = load i8, ptr %arrayidx84.us, align 1
-  %conv85.us = sext i8 %50 to i32
-  %mul86.us = mul nsw i32 %conv85.us, 127
-  %add87.us = add nsw i32 %mul86.us, %add43.us
-  %conv88.us = sitofp i32 %add87.us to float
-  %arrayidx91.us = getelementptr inbounds float, ptr %scales, i64 %6
-  %51 = load float, ptr %arrayidx91.us, align 4
-  %mul92.us = fmul float %51, %conv88.us
-  %arrayidx95.us = getelementptr inbounds float, ptr %v, i64 %6
-  store float %mul92.us, ptr %arrayidx95.us, align 4
-  %indvars.iv.next165 = add nuw nsw i64 %indvars.iv164, 4
-  %cmp.us = icmp ult i64 %indvars.iv.next165, %0
+  %gep.idxprom.a0 = getelementptr inbounds i8, ptr %ptr.a, i64 %ext.num_in
+  %load.prom.a0 = load i8, ptr %gep.idxprom.a0, align 1
+  %ext.prom.a0 = sext i8 %load.prom.a0 to i32
+  %mul.prom.a0 = mul nsw i32 %ext.prom.a0, 127
+  %add.prom.a0 = add nsw i32 %mul.prom.a0, %add.a0
+  %fpext.prom.a0 = sitofp i32 %add.prom.a0 to float
+  %gep.idxprom.b = getelementptr inbounds float, ptr %b, i64 %iv2
+  %load.b = load float, ptr %gep.idxprom.b, align 4
+  %fmul.b = fmul float %load.b, %fpext.prom.a0
+  %gep.d = getelementptr inbounds float, ptr %d, i64 %iv2
+  store float %fmul.b, ptr %gep.d, align 4
+  %gep.idxprom.a1 = getelementptr inbounds i8, ptr %ptr.a1, i64 %ext.num_in
+  %load.prom.a1 = load i8, ptr %gep.idxprom.a1, align 1
+  %ext.prom.a1 = sext i8 %load.prom.a1 to i32
+  %mul.prom.a1 = mul nsw i32 %ext.prom.a1, 127
+  %add.prom.a1 = add nsw i32 %mul.prom.a1, %add.a1
+  %fpext.prom.a1 = sitofp i32 %add.prom.a1 to float
+  %gep.b1 = getelementptr inbounds float, ptr %b, i64 %offset.1
+  %load.b1 = load float, ptr %gep.b1, align 4
+  %fmul.b1 = fmul float %load.b1, %fpext.prom.a1
+  %gep.d1 = getelementptr inbounds float, ptr %d, i64 %offset.1
+  store float %fmul.b1, ptr %gep.d1, align 4
+  %gep.idxprom.a2 = getelementptr inbounds i8, ptr %ptr.a2, i64 %ext.num_in
+  %load.prom.a2 = load i8, ptr %gep.idxprom.a2, align 1
+  %ext.prom.a2 = sext i8 %load.prom.a2 to i32
+  %mul.prom.a2 = mul nsw i32 %ext.prom.a2, 127
+  %add.prom.a2 = add nsw i32 %mul.prom.a2, %add.a2
+  %fpext.prom.a2 = sitofp i32 %add.prom.a2 to float
+  %gep.b2 = getelementptr inbounds float, ptr %b, i64 %offset.2
+  %load.b2 = load float, ptr %gep.b2, align 4
+  %fmul.b2 = fmul float %load.b2, %fpext.prom.a2
+  %gep.d2 = getelementptr inbounds float, ptr %d, i64 %offset.2
+  store float %fmul.b2, ptr %gep.d2, align 4
+  %gep.idxprom.a3 = getelementptr inbounds i8, ptr %ptr.a3, i64 %ext.num_in
+  %load.prom.a3 = load i8, ptr %gep.idxprom.a3, align 1
+  %ext.prom.a3 = sext i8 %load.prom.a3 to i32
+  %mul.prom.a3 = mul nsw i32 %ext.prom.a3, 127
+  %add.prom.a3 = add nsw i32 %mul.prom.a3, %add.a3
+  %fpext.prom.a3 = sitofp i32 %add.prom.a3 to float
+  %gep.b3 = getelementptr inbounds float, ptr %b, i64 %offset.3
+  %load.b3 = load float, ptr %gep.b3, align 4
+  %mul.b3 = fmul float %load.b3, %fpext.prom.a3
+  %gep.d3 = getelementptr inbounds float, ptr %d, i64 %offset.3
+  store float %mul.b3, ptr %gep.d3, align 4
+  %iv.next2 = add nuw nsw i64 %iv2, 4
+  %cmp.us = icmp ult i64 %iv.next2, %max_iv
   br i1 %cmp.us, label %for.body.us, label %for.end98
 
 for.end98:                                        ; preds = %for.end98.loopexit171, %for.end98.loopexit, %entry
@@ -1164,18 +1164,18 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
   ret i32 %total.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %total.09 = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %conv = sext i8 %0 to i32
-  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
-  %1 = load i8, ptr %arrayidx2, align 1
-  %conv3 = sext i8 %1 to i32
-  %mul = mul nsw i32 %conv3, %conv
-  %add = add nsw i32 %mul, %total.09
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ]
+  %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = sext i8 %load.a to i32
+  %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = sext i8 %load.b to i32
+  %mul = mul nsw i32 %ext.b, %ext.a
+  %add = add nsw i32 %mul, %accum
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
 }
 
@@ -1236,18 +1236,18 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
   ret i32 %total.0.lcssa
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %total.09 = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
-  %0 = load i8, ptr %arrayidx, align 1
-  %conv = sext i8 %0 to i32
-  %arrayidx2 = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
-  %1 = load i8, ptr %arrayidx2, align 1
-  %conv3 = sext i8 %1 to i32
-  %mul = mul nsw i32 %conv3, %conv
-  %add = add nsw i32 %mul, %total.09
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ]
+  %gep.a = getelementptr inbounds nuw i8, ptr %b, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = sext i8 %load.a to i32
+  %gep.a2 = getelementptr inbounds nuw i8, ptr %a, i64 %iv
+  %load.b = load i8, ptr %gep.a2, align 1
+  %ext.b = sext i8 %load.b to i32
+  %mul = mul nsw i32 %ext.b, %ext.a
+  %add = add nsw i32 %mul, %accum
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !7
 }
 

>From 569c8747db9008d9906991637a24273156efddec Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 30 Oct 2024 14:26:09 +0000
Subject: [PATCH 15/56] Simplify unroll test

---
 .../AArch64/partial-reduce-dot-product.ll     | 322 +++++-------------
 1 file changed, 81 insertions(+), 241 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 102e0699ab831d..8684425b5a8401 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -742,9 +742,9 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
 }
 
-define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b, ptr %c, ptr %d) #0 {
-; CHECK-LABEL: define void @dotp_unrolled(
-; CHECK-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+define i32 @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @dotp_unrolled(
+; CHECK-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP154:%.*]] = icmp sgt i32 [[NUM_OUT]], 3
 ; CHECK-NEXT:    br i1 [[CMP154]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END98:%.*]]
@@ -754,33 +754,19 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b, ptr %c, pt
 ; CHECK-NEXT:    [[CMP111LOAD_B:%.*]] = icmp sgt i32 [[NUM_IN]], 0
 ; CHECK-NEXT:    [[IDXPROMLOAD_PROM_A0:%.*]] = sext i32 [[NUM_IN]] to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
-; CHECK-NEXT:    br i1 [[CMP111LOAD_B]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_END98]]
-; CHECK:       for.body.us.preheader:
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
-; CHECK-NEXT:    br label [[ITER_CHECK:%.*]]
+; CHECK-NEXT:    br i1 [[CMP111LOAD_B]], label [[ITER_CHECK:%.*]], label [[FOR_END98]]
 ; CHECK:       iter.check:
-; CHECK-NEXT:    [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[IV_NEXT2:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
-; CHECK-NEXT:    [[GEP_A0:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[INDVARS_IV164]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[GEP_A0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[INDVARS_IV164]], 1
-; CHECK-NEXT:    [[GEP_A1:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[GEP_A1]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[INDVARS_IV164]], 2
-; CHECK-NEXT:    [[GEP_A2:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[GEP_A2]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[INDVARS_IV164]], 3
-; CHECK-NEXT:    [[GEP_A3:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[GEP_A3]], align 8
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[WIDE_TRIP_COUNT]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP2]], i64 16, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -789,31 +775,34 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b, ptr %c, pt
 ; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[TMP10]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP10]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[TMP10]], 3
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP13]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP17]])
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[WIDE_VEC5:%.*]] = load <128 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = shufflevector <128 x i8> [[WIDE_VEC5]], <128 x i8> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP16]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP21]])
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <128 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = shufflevector <128 x i8> [[WIDE_VEC8]], <128 x i8> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
 ; CHECK-NEXT:    [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP16]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP25]])
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-NEXT:    [[WIDE_VEC11:%.*]] = load <128 x i8>, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = shufflevector <128 x i8> [[WIDE_VEC11]], <128 x i8> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
 ; CHECK-NEXT:    [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP16]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP29]])
@@ -825,11 +814,10 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b, ptr %c, pt
 ; CHECK-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE9]])
 ; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
 ; CHECK-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-NEXT:    br label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 8
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP31]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -838,7 +826,9 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b, ptr %c, pt
 ; CHECK-NEXT:    [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP34]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF15:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
-; CHECK-NEXT:    [[N_VEC16:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF15]]
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[N_MOD_VF15]], 0
+; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 8, i64 [[N_MOD_VF15]]
+; CHECK-NEXT:    [[N_VEC16:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP27]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX12]], i32 0
 ; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX13]], i32 0
@@ -851,31 +841,34 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b, ptr %c, pt
 ; CHECK-NEXT:    [[VEC_PHI20:%.*]] = phi <2 x i32> [ [[TMP35]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI21:%.*]] = phi <2 x i32> [ [[TMP36]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE24:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP39:%.*]] = add i64 [[INDEX17]], 0
-; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP39]]
-; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[TMP40]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i8>, ptr [[TMP47]], align 1
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP39]]
+; CHECK-NEXT:    [[TMP44:%.*]] = or disjoint i64 [[TMP39]], 1
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = or disjoint i64 [[TMP39]], 2
+; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP46]]
+; CHECK-NEXT:    [[TMP54:%.*]] = or disjoint i64 [[TMP39]], 3
+; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP54]]
+; CHECK-NEXT:    [[WIDE_VEC24:%.*]] = load <64 x i8>, ptr [[TMP40]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = shufflevector <64 x i8> [[WIDE_VEC24]], <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
 ; CHECK-NEXT:    [[TMP41:%.*]] = sext <8 x i8> [[WIDE_LOAD22]] to <8 x i32>
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP39]]
+; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP39]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[TMP49]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x i8>, ptr [[TMP50]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = sext <8 x i8> [[WIDE_LOAD23]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP43:%.*]] = mul nsw <8 x i32> [[TMP42]], [[TMP41]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE24]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI21]], <8 x i32> [[TMP43]])
-; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP39]]
-; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = load <8 x i8>, ptr [[TMP55]], align 1
+; CHECK-NEXT:    [[WIDE_VEC28:%.*]] = load <64 x i8>, ptr [[TMP45]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = shufflevector <64 x i8> [[WIDE_VEC28]], <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <8 x i8> [[WIDE_LOAD25]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP48:%.*]] = mul nsw <8 x i32> [[TMP53]], [[TMP42]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE26]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI20]], <8 x i32> [[TMP48]])
-; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP39]]
-; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[TMP59]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD27:%.*]] = load <8 x i8>, ptr [[TMP60]], align 1
+; CHECK-NEXT:    [[WIDE_VEC31:%.*]] = load <64 x i8>, ptr [[TMP47]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD27:%.*]] = shufflevector <64 x i8> [[WIDE_VEC31]], <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
 ; CHECK-NEXT:    [[TMP51:%.*]] = sext <8 x i8> [[WIDE_LOAD27]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP52:%.*]] = mul nsw <8 x i32> [[TMP51]], [[TMP42]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE28]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI19]], <8 x i32> [[TMP52]])
-; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP39]]
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i8, ptr [[TMP64]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <8 x i8>, ptr [[TMP65]], align 1
+; CHECK-NEXT:    [[WIDE_VEC34:%.*]] = load <64 x i8>, ptr [[TMP55]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = shufflevector <64 x i8> [[WIDE_VEC34]], <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
 ; CHECK-NEXT:    [[TMP57:%.*]] = sext <8 x i8> [[WIDE_LOAD29]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP56:%.*]] = mul nsw <8 x i32> [[TMP57]], [[TMP42]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE30]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI18]], <8 x i32> [[TMP56]])
@@ -887,103 +880,7 @@ define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b, ptr %c, pt
 ; CHECK-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE28]])
 ; CHECK-NEXT:    [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE26]])
 ; CHECK-NEXT:    [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE24]])
-; CHECK-NEXT:    [[CMP_N28:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC16]]
-; CHECK-NEXT:    br i1 [[CMP_N28]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX29:%.*]] = phi i32 [ [[TMP70]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP31]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX30:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP32]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX31:%.*]] = phi i32 [ [[TMP72]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP33]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX32:%.*]] = phi i32 [ [[TMP73]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP34]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY12_US:%.*]]
-; CHECK:       for.body12.us:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY12_US]] ]
-; CHECK-NEXT:    [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX29]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY12_US]] ]
-; CHECK-NEXT:    [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX30]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY12_US]] ]
-; CHECK-NEXT:    [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX31]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY12_US]] ]
-; CHECK-NEXT:    [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX32]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY12_US]] ]
-; CHECK-NEXT:    [[GEP_IDX_A0:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD_A0:%.*]] = load i8, ptr [[GEP_IDX_A0]], align 1
-; CHECK-NEXT:    [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32
-; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD_C:%.*]] = load i8, ptr [[GEP_C]], align 1
-; CHECK-NEXT:    [[EXT_C:%.*]] = sext i8 [[LOAD_C]] to i32
-; CHECK-NEXT:    [[MUL_A0:%.*]] = mul nsw i32 [[EXT_C]], [[EXT_A0]]
-; CHECK-NEXT:    [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]]
-; CHECK-NEXT:    [[GEP_IDX_A1:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD_A1:%.*]] = load i8, ptr [[GEP_IDX_A1]], align 1
-; CHECK-NEXT:    [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32
-; CHECK-NEXT:    [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_C]]
-; CHECK-NEXT:    [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]]
-; CHECK-NEXT:    [[GEP_IDX_A2:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD_A2:%.*]] = load i8, ptr [[GEP_IDX_A2]], align 1
-; CHECK-NEXT:    [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32
-; CHECK-NEXT:    [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_C]]
-; CHECK-NEXT:    [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]]
-; CHECK-NEXT:    [[GEP_IDX_A3:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[IV]]
-; CHECK-NEXT:    [[LOAD_A3:%.*]] = load i8, ptr [[GEP_IDX_A3]], align 1
-; CHECK-NEXT:    [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32
-; CHECK-NEXT:    [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_C]]
-; CHECK-NEXT:    [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[FOR_BODY12_US]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       for.cond10.for.cond.cleanup_crit_edge.us:
-; CHECK-NEXT:    [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY12_US]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ], [ [[TMP73]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY12_US]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ [[TMP72]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY12_US]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ], [ [[TMP71]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY12_US]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ], [ [[TMP70]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[GEP_IDXPROM_A0:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[IDXPROMLOAD_PROM_A0]]
-; CHECK-NEXT:    [[LOAD_PROM_A0:%.*]] = load i8, ptr [[GEP_IDXPROM_A0]], align 1
-; CHECK-NEXT:    [[EXT_PROM_A0:%.*]] = sext i8 [[LOAD_PROM_A0]] to i32
-; CHECK-NEXT:    [[MUL_PROM_A0:%.*]] = mul nsw i32 [[EXT_PROM_A0]], 127
-; CHECK-NEXT:    [[ADD_PROM_A0:%.*]] = add nsw i32 [[MUL_PROM_A0]], [[ADD_A0_LCSSA]]
-; CHECK-NEXT:    [[FPEXT_PROM_A0:%.*]] = sitofp i32 [[ADD_PROM_A0]] to float
-; CHECK-NEXT:    [[ARRAYIDX65_US:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV164]]
-; CHECK-NEXT:    [[TMP82:%.*]] = load float, ptr [[ARRAYIDX65_US]], align 4
-; CHECK-NEXT:    [[FMUL_B:%.*]] = fmul float [[TMP82]], [[FPEXT_PROM_A0]]
-; CHECK-NEXT:    [[GEP_D:%.*]] = getelementptr inbounds float, ptr [[D]], i64 [[INDVARS_IV164]]
-; CHECK-NEXT:    store float [[FMUL_B]], ptr [[GEP_D]], align 4
-; CHECK-NEXT:    [[GEP_IDXPROM_A1:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[IDXPROMLOAD_PROM_A0]]
-; CHECK-NEXT:    [[LOAD_PROM_A1:%.*]] = load i8, ptr [[GEP_IDXPROM_A1]], align 1
-; CHECK-NEXT:    [[EXT_PROM_A1:%.*]] = sext i8 [[LOAD_PROM_A1]] to i32
-; CHECK-NEXT:    [[MUL_PROM_A1:%.*]] = mul nsw i32 [[EXT_PROM_A1]], 127
-; CHECK-NEXT:    [[ADD_PROM_A1:%.*]] = add nsw i32 [[MUL_PROM_A1]], [[ADD_A1_LCSSA]]
-; CHECK-NEXT:    [[FPEXT_PROM_A1:%.*]] = sitofp i32 [[ADD_PROM_A1]] to float
-; CHECK-NEXT:    [[GEP_B1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[LOAD_B1:%.*]] = load float, ptr [[GEP_B1]], align 4
-; CHECK-NEXT:    [[FMUL_B1:%.*]] = fmul float [[LOAD_B1]], [[FPEXT_PROM_A1]]
-; CHECK-NEXT:    [[GEP_D1:%.*]] = getelementptr inbounds float, ptr [[D]], i64 [[TMP2]]
-; CHECK-NEXT:    store float [[FMUL_B1]], ptr [[GEP_D1]], align 4
-; CHECK-NEXT:    [[GEP_IDXPROM_A2:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[IDXPROMLOAD_PROM_A0]]
-; CHECK-NEXT:    [[LOAD_PROM_A2:%.*]] = load i8, ptr [[GEP_IDXPROM_A2]], align 1
-; CHECK-NEXT:    [[EXT_PROM_A2:%.*]] = sext i8 [[LOAD_PROM_A2]] to i32
-; CHECK-NEXT:    [[MUL_PROM_A2:%.*]] = mul nsw i32 [[EXT_PROM_A2]], 127
-; CHECK-NEXT:    [[ADD_PROM_A2:%.*]] = add nsw i32 [[MUL_PROM_A2]], [[ADD_A2_LCSSA]]
-; CHECK-NEXT:    [[FPEXT_PROM_A2:%.*]] = sitofp i32 [[ADD_PROM_A2]] to float
-; CHECK-NEXT:    [[GEP_B2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
-; CHECK-NEXT:    [[LOAD_B2:%.*]] = load float, ptr [[GEP_B2]], align 4
-; CHECK-NEXT:    [[FMUL_B2:%.*]] = fmul float [[LOAD_B2]], [[FPEXT_PROM_A2]]
-; CHECK-NEXT:    [[GEP_D2:%.*]] = getelementptr inbounds float, ptr [[D]], i64 [[TMP4]]
-; CHECK-NEXT:    store float [[FMUL_B2]], ptr [[GEP_D2]], align 4
-; CHECK-NEXT:    [[GEP_IDXPROM_A3:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[IDXPROMLOAD_PROM_A0]]
-; CHECK-NEXT:    [[LOAD_PROM_A3:%.*]] = load i8, ptr [[GEP_IDXPROM_A3]], align 1
-; CHECK-NEXT:    [[EXT_PROM_A3:%.*]] = sext i8 [[LOAD_PROM_A3]] to i32
-; CHECK-NEXT:    [[MUL_PROM_A3:%.*]] = mul nsw i32 [[EXT_PROM_A3]], 127
-; CHECK-NEXT:    [[ADD_PROM_A3:%.*]] = add nsw i32 [[MUL_PROM_A3]], [[ADD_A3_LCSSA]]
-; CHECK-NEXT:    [[FPEXT_PROM_A3:%.*]] = sitofp i32 [[ADD_PROM_A3]] to float
-; CHECK-NEXT:    [[GEP_B3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
-; CHECK-NEXT:    [[LOAD_B3:%.*]] = load float, ptr [[GEP_B3]], align 4
-; CHECK-NEXT:    [[MUL_B3:%.*]] = fmul float [[LOAD_B3]], [[FPEXT_PROM_A3]]
-; CHECK-NEXT:    [[GEP_D3:%.*]] = getelementptr inbounds float, ptr [[D]], i64 [[TMP6]]
-; CHECK-NEXT:    store float [[MUL_B3]], ptr [[GEP_D3]], align 4
-; CHECK-NEXT:    [[IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV164]], 4
-; CHECK-NEXT:    [[CMP_US:%.*]] = icmp ult i64 [[IV_NEXT2]], [[TMP0]]
-; CHECK-NEXT:    br i1 [[CMP_US]], label [[ITER_CHECK]], label [[FOR_END98_LOOPEXIT:%.*]]
-; CHECK:       for.end98.loopexit:
-; CHECK-NEXT:    br label [[FOR_END98]]
-; CHECK:       for.end98:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br label [[VEC_EPILOG_SCALAR_PH]]
 ;
 entry:
   %cmp154 = icmp sgt i32 %num_out, 3
@@ -995,114 +892,57 @@ for.body.lr.ph:                                   ; preds = %entry
   %cmp.num_in = icmp sgt i32 %num_in, 0
   %ext.num_in = sext i32 %num_in to i64
   %max_iv = zext nneg i32 %mul to i64
-  br i1 %cmp.num_in, label %for.body.us.preheader, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %for.body.lr.ph
-  br label %for.end98
+  br i1 %cmp.num_in, label %for.body.us.preheader, label %for.end98
 
 for.body.us.preheader:                            ; preds = %for.body.lr.ph
   %wide.trip.count = zext nneg i32 %num_in to i64
-  br label %for.body.us
+  br label %for.body12.us
 
-for.body.us:                                      ; preds = %for.body.us.preheader, %for.cond10.for.cond.cleanup_crit_edge.us
-  %iv2 = phi i64 [ 0, %for.body.us.preheader ], [ %iv.next2, %for.cond10.for.cond.cleanup_crit_edge.us ]
-  %gep.a0 = getelementptr inbounds ptr, ptr %a, i64 %iv2
-  %ptr.a = load ptr, ptr %gep.a0, align 8
-  %offset.1 = or disjoint i64 %iv2, 1
+for.body12.us:                                    ; preds = %for.body.us, %for.body12.us
+  %iv = phi i64 [ 0, %for.body.us.preheader ], [ %iv.next, %for.body12.us ]
+  %accum3 = phi i32 [ 0, %for.body.us.preheader ], [ %add.a3, %for.body12.us ]
+  %accum2 = phi i32 [ 0, %for.body.us.preheader ], [ %add.a2, %for.body12.us ]
+  %accum1 = phi i32 [ 0, %for.body.us.preheader ], [ %add.a1, %for.body12.us ]
+  %accum0 = phi i32 [ 0, %for.body.us.preheader ], [ %add.a0, %for.body12.us ]
+  %gep.a0 = getelementptr inbounds ptr, ptr %a, i64 %iv
+  %offset.1 = or disjoint i64 %iv, 1
   %gep.a1 = getelementptr inbounds ptr, ptr %a, i64 %offset.1
-  %ptr.a1 = load ptr, ptr %gep.a1, align 8
-  %offset.2 = or disjoint i64 %iv2, 2
+  %offset.2 = or disjoint i64 %iv, 2
   %gep.a2 = getelementptr inbounds ptr, ptr %a, i64 %offset.2
-  %ptr.a2 = load ptr, ptr %gep.a2, align 8
-  %offset.3 = or disjoint i64 %iv2, 3
+  %offset.3 = or disjoint i64 %iv, 3
   %gep.a3 = getelementptr inbounds ptr, ptr %a, i64 %offset.3
-  %ptr.a3 = load ptr, ptr %gep.a3, align 8
-  br label %for.body12.us
-
-for.body12.us:                                    ; preds = %for.body.us, %for.body12.us
-  %iv = phi i64 [ 0, %for.body.us ], [ %iv.next, %for.body12.us ]
-  %accum3 = phi i32 [ 0, %for.body.us ], [ %add.a3, %for.body12.us ]
-  %accum2 = phi i32 [ 0, %for.body.us ], [ %add.a2, %for.body12.us ]
-  %accum1 = phi i32 [ 0, %for.body.us ], [ %add.a1, %for.body12.us ]
-  %accum0 = phi i32 [ 0, %for.body.us ], [ %add.a0, %for.body12.us ]
-  %gep.idx.a0 = getelementptr inbounds i8, ptr %ptr.a, i64 %iv
-  %load.a0 = load i8, ptr %gep.idx.a0, align 1
+  %load.a0 = load i8, ptr %gep.a0, align 1
   %ext.a0 = sext i8 %load.a0 to i32
-  %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv
-  %load.c = load i8, ptr %gep.c, align 1
-  %ext.c = sext i8 %load.c to i32
-  %mul.a0 = mul nsw i32 %ext.c, %ext.a0
+  %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = sext i8 %load.b to i32
+  %mul.a0 = mul nsw i32 %ext.b, %ext.a0
   %add.a0 = add nsw i32 %mul.a0, %accum0
-  %gep.idx.a1 = getelementptr inbounds i8, ptr %ptr.a1, i64 %iv
-  %load.a1 = load i8, ptr %gep.idx.a1, align 1
+  %load.a1 = load i8, ptr %gep.a1, align 1
   %ext.a1 = sext i8 %load.a1 to i32
-  %mul.a1 = mul nsw i32 %ext.a1, %ext.c
+  %mul.a1 = mul nsw i32 %ext.a1, %ext.b
   %add.a1 = add nsw i32 %mul.a1, %accum1
-  %gep.idx.a2 = getelementptr inbounds i8, ptr %ptr.a2, i64 %iv
-  %load.a2 = load i8, ptr %gep.idx.a2, align 1
+  %load.a2 = load i8, ptr %gep.a2, align 1
   %ext.a2 = sext i8 %load.a2 to i32
-  %mul.a2 = mul nsw i32 %ext.a2, %ext.c
+  %mul.a2 = mul nsw i32 %ext.a2, %ext.b
   %add.a2 = add nsw i32 %mul.a2, %accum2
-  %gep.idx.a3 = getelementptr inbounds i8, ptr %ptr.a3, i64 %iv
-  %load.a3 = load i8, ptr %gep.idx.a3, align 1
+  %load.a3 = load i8, ptr %gep.a3, align 1
   %ext.a3 = sext i8 %load.a3 to i32
-  %mul.a3 = mul nsw i32 %ext.a3, %ext.c
+  %mul.a3 = mul nsw i32 %ext.a3, %ext.b
   %add.a3 = add nsw i32 %mul.a3, %accum3
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond10.for.cond.cleanup_crit_edge.us, label %for.body12.us
+  br i1 %exitcond.not, label %for.end98, label %for.body12.us
 
-for.cond10.for.cond.cleanup_crit_edge.us:         ; preds = %for.body12.us
-  %gep.idxprom.a0 = getelementptr inbounds i8, ptr %ptr.a, i64 %ext.num_in
-  %load.prom.a0 = load i8, ptr %gep.idxprom.a0, align 1
-  %ext.prom.a0 = sext i8 %load.prom.a0 to i32
-  %mul.prom.a0 = mul nsw i32 %ext.prom.a0, 127
-  %add.prom.a0 = add nsw i32 %mul.prom.a0, %add.a0
-  %fpext.prom.a0 = sitofp i32 %add.prom.a0 to float
-  %gep.idxprom.b = getelementptr inbounds float, ptr %b, i64 %iv2
-  %load.b = load float, ptr %gep.idxprom.b, align 4
-  %fmul.b = fmul float %load.b, %fpext.prom.a0
-  %gep.d = getelementptr inbounds float, ptr %d, i64 %iv2
-  store float %fmul.b, ptr %gep.d, align 4
-  %gep.idxprom.a1 = getelementptr inbounds i8, ptr %ptr.a1, i64 %ext.num_in
-  %load.prom.a1 = load i8, ptr %gep.idxprom.a1, align 1
-  %ext.prom.a1 = sext i8 %load.prom.a1 to i32
-  %mul.prom.a1 = mul nsw i32 %ext.prom.a1, 127
-  %add.prom.a1 = add nsw i32 %mul.prom.a1, %add.a1
-  %fpext.prom.a1 = sitofp i32 %add.prom.a1 to float
-  %gep.b1 = getelementptr inbounds float, ptr %b, i64 %offset.1
-  %load.b1 = load float, ptr %gep.b1, align 4
-  %fmul.b1 = fmul float %load.b1, %fpext.prom.a1
-  %gep.d1 = getelementptr inbounds float, ptr %d, i64 %offset.1
-  store float %fmul.b1, ptr %gep.d1, align 4
-  %gep.idxprom.a2 = getelementptr inbounds i8, ptr %ptr.a2, i64 %ext.num_in
-  %load.prom.a2 = load i8, ptr %gep.idxprom.a2, align 1
-  %ext.prom.a2 = sext i8 %load.prom.a2 to i32
-  %mul.prom.a2 = mul nsw i32 %ext.prom.a2, 127
-  %add.prom.a2 = add nsw i32 %mul.prom.a2, %add.a2
-  %fpext.prom.a2 = sitofp i32 %add.prom.a2 to float
-  %gep.b2 = getelementptr inbounds float, ptr %b, i64 %offset.2
-  %load.b2 = load float, ptr %gep.b2, align 4
-  %fmul.b2 = fmul float %load.b2, %fpext.prom.a2
-  %gep.d2 = getelementptr inbounds float, ptr %d, i64 %offset.2
-  store float %fmul.b2, ptr %gep.d2, align 4
-  %gep.idxprom.a3 = getelementptr inbounds i8, ptr %ptr.a3, i64 %ext.num_in
-  %load.prom.a3 = load i8, ptr %gep.idxprom.a3, align 1
-  %ext.prom.a3 = sext i8 %load.prom.a3 to i32
-  %mul.prom.a3 = mul nsw i32 %ext.prom.a3, 127
-  %add.prom.a3 = add nsw i32 %mul.prom.a3, %add.a3
-  %fpext.prom.a3 = sitofp i32 %add.prom.a3 to float
-  %gep.b3 = getelementptr inbounds float, ptr %b, i64 %offset.3
-  %load.b3 = load float, ptr %gep.b3, align 4
-  %mul.b3 = fmul float %load.b3, %fpext.prom.a3
-  %gep.d3 = getelementptr inbounds float, ptr %d, i64 %offset.3
-  store float %mul.b3, ptr %gep.d3, align 4
-  %iv.next2 = add nuw nsw i64 %iv2, 4
-  %cmp.us = icmp ult i64 %iv.next2, %max_iv
-  br i1 %cmp.us, label %for.body.us, label %for.end98
-
-for.end98:                                        ; preds = %for.end98.loopexit171, %for.end98.loopexit, %entry
-  ret void
+for.end98:                                        ; preds = %for.body12.us, %entry
+  %result.a0 = phi i32 [ 0, %for.body.lr.ph ], [ 0, %entry ], [ %add.a0, %for.body12.us ]
+  %result.a1 = phi i32 [ 0, %for.body.lr.ph ], [ 0, %entry ], [ %add.a1, %for.body12.us ]
+  %result.a2 = phi i32 [ 0, %for.body.lr.ph ], [ 0, %entry ], [ %add.a2, %for.body12.us ]
+  %result.a3 = phi i32 [ 0, %for.body.lr.ph ], [ 0, %entry ], [ %add.a3, %for.body12.us ]
+  %result0 = add nsw i32 %result.a0, %result.a1
+  %result1 = add nsw i32 %result.a2, %result.a3
+  %result = add nsw i32 %result0, %result1
+  ret i32 %result
 }
 
 define i32 @not_dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {

>From 9c9d962bbe5c51716ae93ed8f5b720647fb36f0b Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 30 Oct 2024 15:09:39 +0000
Subject: [PATCH 16/56] Add test with an extend with a non partial reduction
 user

---
 .../AArch64/partial-reduce-dot-product.ll     | 91 +++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 8684425b5a8401..681657fa8bc8f2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1091,6 +1091,97 @@ for.body:                                         ; preds = %for.body.preheader,
   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !7
 }
 
+define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT:  iter.check:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVE1:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1:       vector.ph:
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10]] = add <16 x i32> [[TMP9]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT:  iter.check:
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVED:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP8]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul <16 x i32> [[TMP11]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = mul <16 x i32> [[TMP12]], [[TMP7]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP15]] = add <16 x i32> [[TMP13]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP16]] = add <16 x i32> [[TMP14]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %result = add i32 %add, %ext.b
+  ret i32 %result
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
 !7 = distinct !{!7, !8, !9, !10}
 !8 = !{!"llvm.loop.mustprogress"}
 !9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}

>From 164dd37d36ad20d068786d8bfb479318a33e45eb Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 30 Oct 2024 15:12:27 +0000
Subject: [PATCH 17/56] Clean up printing test

---
 .../LoopVectorize/AArch64/vplan-printing.ll   | 47 +++++++++----------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index d6e61e8734f9cf..79e7abb367be7a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -6,7 +6,7 @@ target triple = "aarch64-none-unknown-elf"
 
 ; Tests for printing VPlans that are enabled under AArch64
 
-define void @print_partial_reduction(ptr %a, ptr %b) {
+define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-LABEL: Checking a loop in 'print_partial_reduction'
 ; CHECK:      VPlan 'Initial VPlan for VF={2,4},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
@@ -23,12 +23,12 @@ define void @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT:   vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT:   CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<[[STEPS]]>
 ; CHECK-NEXT:   vp<%4> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT:   WIDEN ir<%1> = load vp<%4>
-; CHECK-NEXT:   WIDEN-CAST ir<%conv> = zext ir<%1> to i32
+; CHECK-NEXT:   WIDEN ir<%0> = load vp<%4>
+; CHECK-NEXT:   WIDEN-CAST ir<%conv> = zext ir<%0> to i32
 ; CHECK-NEXT:   CLONE ir<%arrayidx2> = getelementptr ir<%b>, vp<[[STEPS]]>
 ; CHECK-NEXT:   vp<%5> = vector-pointer ir<%arrayidx2>
-; CHECK-NEXT:   WIDEN ir<%2> = load vp<%5>
-; CHECK-NEXT:   WIDEN-CAST ir<%conv3> = zext ir<%2> to i32
+; CHECK-NEXT:   WIDEN ir<%1> = load vp<%5>
+; CHECK-NEXT:   WIDEN-CAST ir<%conv3> = zext ir<%1> to i32
 ; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%conv3>, ir<%conv>
 ; CHECK-NEXT:   WIDEN ir<%add> = add ir<%mul>, ir<[[ACC]]>
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
@@ -42,11 +42,10 @@ define void @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT:   EMIT vp<%9> = extract-from-end vp<%8>, ir<1>
 ; CHECK-NEXT:   EMIT vp<%10> = icmp eq ir<0>, vp<%1>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<%10>
-; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT: ir-bb<exit>:
 ; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%9>)
-; CHECK-NEXT:   IR   %0 = lshr i32 %add.lcssa, 0
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
@@ -67,12 +66,12 @@ define void @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT:   vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT:   CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<[[STEPS]]>
 ; CHECK-NEXT:   vp<%4> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT:   WIDEN ir<%1> = load vp<%4>
-; CHECK-NEXT:   WIDEN-CAST ir<%conv> = zext ir<%1> to i32
+; CHECK-NEXT:   WIDEN ir<%0> = load vp<%4>
+; CHECK-NEXT:   WIDEN-CAST ir<%conv> = zext ir<%0> to i32
 ; CHECK-NEXT:   CLONE ir<%arrayidx2> = getelementptr ir<%b>, vp<[[STEPS]]>
 ; CHECK-NEXT:   vp<%5> = vector-pointer ir<%arrayidx2>
-; CHECK-NEXT:   WIDEN ir<%2> = load vp<%5>
-; CHECK-NEXT:   WIDEN-CAST ir<%conv3> = zext ir<%2> to i32
+; CHECK-NEXT:   WIDEN ir<%1> = load vp<%5>
+; CHECK-NEXT:   WIDEN-CAST ir<%conv3> = zext ir<%1> to i32
 ; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%conv3>, ir<%conv>
 ; CHECK-NEXT:   PARTIAL-REDUCE vp<%6> = add ir<%mul>, ir<[[ACC]]>
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
@@ -86,11 +85,10 @@ define void @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT:   EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
 ; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<%1>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
-; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT: ir-bb<exit>:
 ; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]>)
-; CHECK-NEXT:   IR   %0 = lshr i32 %add.lcssa, 0
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
@@ -100,22 +98,21 @@ define void @print_partial_reduction(ptr %a, ptr %b) {
 entry:
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %0 = lshr i32 %add, 0
-  ret void
-
 for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %acc.010 = phi i32 [ 0, %entry ], [ %add, %for.body ]
-  %arrayidx = getelementptr i8, ptr %a, i64 %indvars.iv
+  %arrayidx = getelementptr i8, ptr %a, i64 %iv
   %1 = load i8, ptr %arrayidx, align 1
   %conv = zext i8 %1 to i32
-  %arrayidx2 = getelementptr i8, ptr %b, i64 %indvars.iv
+  %arrayidx2 = getelementptr i8, ptr %b, i64 %iv
   %2 = load i8, ptr %arrayidx2, align 1
   %conv3 = zext i8 %2 to i32
   %mul = mul i32 %conv3, %conv
   %add = add i32 %mul, %acc.010
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, 0
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret i32 %add
 }

>From 6578b3398503304592a5746fcdb0ddc7d6b69842 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 30 Oct 2024 17:25:03 +0000
Subject: [PATCH 18/56] format

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 +++--
 llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 2 --
 llvm/lib/Transforms/Vectorize/VPlan.h           | 6 +++---
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fced747fd90308..e4d1e164d9adda 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8838,7 +8838,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
 
   SmallVector<VPValue *, 2> OrderedOperands = {BinOp, Phi};
   return new VPPartialReductionRecipe(
-      Reduction->getOpcode(), make_range(OrderedOperands.begin(), OrderedOperands.end()));
+      Reduction->getOpcode(),
+      make_range(OrderedOperands.begin(), OrderedOperands.end()));
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9193,7 +9194,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   // before creating the recipes
   for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
     if (std::optional<PartialReductionChain> Chain =
-        getScaledReduction(Phi, RdxDesc, &TTI, Range, CM))
+            getScaledReduction(Phi, RdxDesc, &TTI, Range, CM))
       RecipeBuilder.addScaledReductionExitInstr(*Chain);
   RecipeBuilder.removeInvalidScaledReductionExitInstrs();
 
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index e25979badf79e8..71722f5f2e9009 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -42,7 +42,6 @@ struct PartialReductionChain {
   unsigned ScaleFactor;
 };
 
-
 /// Helper class to create VPRecipies from IR instructions.
 class VPRecipeBuilder {
   /// The VPlan new recipes are added to.
@@ -196,7 +195,6 @@ class VPRecipeBuilder {
       ScaledReductionExitInstrs.erase(Instr);
   }
 
-
   /// Create and return a widened recipe for \p I if one can be created within
   /// the given VF \p Range.
   VPRecipeBase *tryToCreateWidenRecipe(Instruction *Instr,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 290546adc59359..3a9abcbd5e9127 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2392,7 +2392,8 @@ class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
 
 public:
   template <typename IterT>
-  VPPartialReductionRecipe(unsigned ReductionOpcode, iterator_range<IterT> Operands)
+  VPPartialReductionRecipe(unsigned ReductionOpcode,
+                           iterator_range<IterT> Operands)
       : VPRecipeWithIRFlags(VPDef::VPPartialReductionSC, Operands),
         Opcode(ReductionOpcode) {
     assert(isa<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe()) &&
@@ -2400,8 +2401,7 @@ class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
   }
   ~VPPartialReductionRecipe() override = default;
   VPPartialReductionRecipe *clone() override {
-    return new VPPartialReductionRecipe(Opcode,
-                                        operands());
+    return new VPPartialReductionRecipe(Opcode, operands());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)

>From c365a97bc6e1589a15d5780b4a11255d1d44dc44 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Thu, 31 Oct 2024 14:25:51 +0000
Subject: [PATCH 19/56] Remove single-use variable

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e4d1e164d9adda..efb99b6ecf2f82 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8832,8 +8832,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
 
   VPValue *BinOp = Operands[0];
   VPValue *Phi = Operands[1];
-  VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
-  if (isa<VPReductionPHIRecipe>(BinOpRecipe))
+  if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe()))
     std::swap(BinOp, Phi);
 
   SmallVector<VPValue *, 2> OrderedOperands = {BinOp, Phi};

>From 71cb0c4baab7662c6313dab4497be95aa43bd2a8 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Thu, 31 Oct 2024 14:28:55 +0000
Subject: [PATCH 20/56] Move removeInvalidScaledReductionExitInstrs()

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 36 +++++++++++++++++++
 .../Transforms/Vectorize/VPRecipeBuilder.h    | 36 +------------------
 2 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index efb99b6ecf2f82..4a1896bedcb2dc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8824,6 +8824,42 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   return tryToWiden(Instr, Operands, VPBB);
 }
 
+void VPRecipeBuilder::removeInvalidScaledReductionExitInstrs() {
+  // A partial reduction is invalid if any of its extends are used by
+  // something that isn't another partial reduction. This is because the
+  // extends are intended to be lowered along with the reduction itself.
+
+  // Build up a set of partial reduction bin ops for efficient use checking
+  SmallSet<Instruction *, 4> PartialReductionBinOps;
+  for (auto It : ScaledReductionExitInstrs) {
+    if (It.second.BinOp)
+      PartialReductionBinOps.insert(It.second.BinOp);
+  }
+
+  auto ExtendIsOnlyUsedByPartialReductions =
+      [PartialReductionBinOps](Instruction *Extend) {
+        for (auto *Use : Extend->users()) {
+          Instruction *UseInstr = dyn_cast<Instruction>(Use);
+          if (!PartialReductionBinOps.contains(UseInstr))
+            return false;
+        }
+        return true;
+      };
+
+  // Check if each use of a chain's two extends is a partial reduction
+  // and remove those that have non-partial reduction users
+  SmallSet<Instruction *, 4> PartialReductionsToRemove;
+  for (auto It : ScaledReductionExitInstrs) {
+    PartialReductionChain Chain = It.second;
+    if (!ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) ||
+        !ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
+      PartialReductionsToRemove.insert(Chain.Reduction);
+  }
+
+  for (auto *Instr : PartialReductionsToRemove)
+    ScaledReductionExitInstrs.erase(Instr);
+}
+
 VPRecipeBase *
 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
                                              ArrayRef<VPValue *> Operands) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 71722f5f2e9009..d21ff8e71cdb92 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -159,41 +159,7 @@ class VPRecipeBuilder {
                : std::make_optional(It->second);
   }
 
-  void removeInvalidScaledReductionExitInstrs() {
-    // A partial reduction is invalid if any of its extends are used by
-    // something that isn't another partial reduction. This is because the
-    // extends are intended to be lowered along with the reduction itself.
-
-    // Build up a set of partial reduction bin ops for efficient use checking
-    SmallSet<Instruction *, 4> PartialReductionBinOps;
-    for (auto It : ScaledReductionExitInstrs) {
-      if (It.second.BinOp)
-        PartialReductionBinOps.insert(It.second.BinOp);
-    }
-
-    auto ExtendIsOnlyUsedByPartialReductions =
-        [PartialReductionBinOps](Instruction *Extend) {
-          for (auto *Use : Extend->users()) {
-            Instruction *UseInstr = dyn_cast<Instruction>(Use);
-            if (!PartialReductionBinOps.contains(UseInstr))
-              return false;
-          }
-          return true;
-        };
-
-    // Check if each use of a chain's two extends is a partial reduction
-    // and remove those that have non-partial reduction users
-    SmallSet<Instruction *, 4> PartialReductionsToRemove;
-    for (auto It : ScaledReductionExitInstrs) {
-      PartialReductionChain Chain = It.second;
-      if (!ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) ||
-          !ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
-        PartialReductionsToRemove.insert(Chain.Reduction);
-    }
-
-    for (auto *Instr : PartialReductionsToRemove)
-      ScaledReductionExitInstrs.erase(Instr);
-  }
+  void removeInvalidScaledReductionExitInstrs();
 
   /// Create and return a widened recipe for \p I if one can be created within
   /// the given VF \p Range.

>From d686f422d6ffcc8a77b6c4774e42680b5c050642 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Thu, 31 Oct 2024 14:31:33 +0000
Subject: [PATCH 21/56] Infer scalar type using operands

---
 llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 1d8d949a8b9306..a886a5bea5e037 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -235,11 +235,6 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
   llvm_unreachable("Unhandled opcode");
 }
 
-Type *
-VPTypeAnalysis::inferScalarTypeForRecipe(const VPPartialReductionRecipe *R) {
-  return R->getUnderlyingInstr()->getType();
-}
-
 Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
   if (Type *CachedTy = CachedTypes.lookup(V))
     return CachedTy;
@@ -268,10 +263,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
               [](const auto *R) { return R->getScalarType(); })
           .Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
                 VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
-                VPReverseVectorPointerRecipe, VPWidenCanonicalIVRecipe>(
-              [this](const VPRecipeBase *R) {
-                return inferScalarType(R->getOperand(0));
-              })
+                VPReverseVectorPointerRecipe, VPWidenCanonicalIVRecipe,
+                VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
+            return inferScalarType(R->getOperand(0));
+          })
           .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPWidenEVLRecipe,
                 VPReplicateRecipe, VPWidenCallRecipe, VPWidenMemoryRecipe,
                 VPWidenSelectRecipe, VPPartialReductionRecipe>(

>From db18f45b89deae8507f529b37b3028db89700bfa Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Thu, 31 Oct 2024 14:35:01 +0000
Subject: [PATCH 22/56] Remove added forward declarations and include from
 VPlanTransforms.h

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 54af9a7c84585a..11e094db6294f6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -15,12 +15,10 @@
 
 #include "VPlan.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
 
 namespace llvm {
 
 class InductionDescriptor;
-class RecurrenceDescriptor;
 class Instruction;
 class PHINode;
 class ScalarEvolution;
@@ -28,7 +26,6 @@ class PredicatedScalarEvolution;
 class TargetLibraryInfo;
 class VPBuilder;
 class VPRecipeBuilder;
-class TargetTransformInfo;
 
 struct VPlanTransforms {
   /// Replaces the VPInstructions in \p Plan with corresponding

>From 87846b02606fc94c122b5710bfbaddc2d3018dc4 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Thu, 31 Oct 2024 14:43:58 +0000
Subject: [PATCH 23/56] Rename and move exit blocks

---
 .../AArch64/partial-reduce-dot-product.ll     | 111 +++++++++---------
 1 file changed, 53 insertions(+), 58 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 681657fa8bc8f2..3549c8111c7af9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -157,10 +157,6 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 entry:
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %result = lshr i32 %add, 0
-  ret i32 %result
-
 for.body:                                         ; preds = %for.body, %entry
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
@@ -174,11 +170,14 @@ for.body:                                         ; preds = %for.body, %entry
   %add = add i32 %mul, %accum
   %iv.next = add i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 0
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit:                        ; preds = %for.body
+  ret i32 %add
 }
 
-define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
-; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_different_types(
+define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types(
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  iter.check:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP72:%.*]] = call i64 @llvm.vscale.i64()
@@ -321,13 +320,9 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX8:%.*]] = phi i32 [ [[TMP94]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK-INTERLEAVE1:       for.cond.cleanup.loopexit:
-; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ], [ [[TMP94]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP95:%.*]] = lshr i32 [[ADD_LCSSA]], 0
-; CHECK-INTERLEAVE1-NEXT:    ret void
 ; CHECK-INTERLEAVE1:       for.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX8]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX8]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
 ; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
@@ -339,8 +334,11 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-INTERLEAVE1:       for.exit:
+; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ], [ [[TMP94]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT:    ret i32 [[ADD_LCSSA]]
 ;
-; CHECK-INTERLEAVED-LABEL: define void @not_dotp_different_types(
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  iter.check:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
@@ -507,10 +505,6 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
 entry:
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %0 = lshr i32 %add, 0
-  ret void
-
 for.body:                                         ; preds = %for.body, %entry
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
@@ -524,11 +518,14 @@ for.body:                                         ; preds = %for.body, %entry
   %add = add i32 %mul, %accum
   %iv.next = add i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 0
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit:                        ; preds = %for.body
+  ret i32 %add
 }
 
-define void @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
-; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_not_loop_carried(
+define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried(
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -566,7 +563,7 @@ define void @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ;
-; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -618,10 +615,6 @@ define void @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 entry:
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %0 = lshr i32 %add, 0
-  ret void
-
 for.body:                                         ; preds = %for.body, %entry
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ]
@@ -635,11 +628,14 @@ for.body:                                         ; preds = %for.body, %entry
   %add = add i32 %mul, %accum
   %iv.next = add i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 0
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit:                        ; preds = %for.body
+  ret i32 %add
 }
 
-define void @not_dotp_not_phi(ptr %a, ptr %b) #0 {
-; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_not_phi(
+define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi(
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -676,7 +672,7 @@ define void @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ;
-; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_phi(
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -722,10 +718,6 @@ define void @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 entry:
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %0 = lshr i32 %add, 0
-  ret void
-
 for.body:                                         ; preds = %for.body, %entry
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
@@ -739,7 +731,10 @@ for.body:                                         ; preds = %for.body, %entry
   %add = add i32 %mul, %ext.b
   %iv.next = add i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 0
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit:                        ; preds = %for.body
+  ret i32 %add
 }
 
 define i32 @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b) #0 {
@@ -989,20 +984,12 @@ define i32 @not_dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
 entry:
   %rem = srem i32 %N, 16
   %cmp8 = icmp sgt i32 %rem, 0
-  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+  br i1 %cmp8, label %for.body.preheader, label %for.cleanup
 
 for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext nneg i32 %rem to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %add.lcssa = phi i32 [ %add, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %total.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
-  ret i32 %total.0.lcssa
-
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
   %accum = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ]
@@ -1016,7 +1003,15 @@ for.body:                                         ; preds = %for.body.preheader,
   %add = add nsw i32 %mul, %accum
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit:                        ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.cleanup
+
+for.cleanup:                                 ; preds = %for.exit, %entry
+  %total = phi i32 [ 0, %entry ], [ %add.lcssa, %for.exit ]
+  ret i32 %total
 }
 
 define i32 @not_dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
@@ -1067,14 +1062,6 @@ for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %add.lcssa = phi i32 [ %add, %for.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %total.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
-  ret i32 %total.0.lcssa
-
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
   %accum = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ]
@@ -1088,7 +1075,15 @@ for.body:                                         ; preds = %for.body.preheader,
   %add = add nsw i32 %mul, %accum
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !7
+  br i1 %exitcond.not, label %for.exit, label %for.body, !llvm.loop !7
+
+for.exit:                        ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.exit, %entry
+  %total = phi i32 [ 0, %entry ], [ %add.lcssa, %for.exit ]
+  ret i32 %total
 }
 
 define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
@@ -1162,10 +1157,6 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 entry:
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %result = add i32 %add, %ext.b
-  ret i32 %result
-
 for.body:                                         ; preds = %for.body, %entry
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
@@ -1179,7 +1170,11 @@ for.body:                                         ; preds = %for.body, %entry
   %add = add i32 %mul, %accum
   %iv.next = add i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 0
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit:                        ; preds = %for.body
+  %result = add i32 %add, %ext.b
+  ret i32 %result
 }
 
 !7 = distinct !{!7, !8, !9, !10}

>From 9245acac248e95d0e51663a2f1b2955dfe3ec8e3 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Thu, 31 Oct 2024 14:51:17 +0000
Subject: [PATCH 24/56] Improve unrolled and predicated tests

---
 .../AArch64/partial-reduce-dot-product.ll     | 401 +++++++++---------
 1 file changed, 210 insertions(+), 191 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 3549c8111c7af9..4c839fccf70498 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -737,31 +737,18 @@ for.exit:                        ; preds = %for.body
   ret i32 %add
 }
 
-define i32 @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b) #0 {
+define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: define i32 @dotp_unrolled(
-; CHECK-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP154:%.*]] = icmp sgt i32 [[NUM_OUT]], 3
-; CHECK-NEXT:    br i1 [[CMP154]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END98:%.*]]
-; CHECK:       for.body.lr.ph:
-; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[NUM_OUT]], 4
-; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[DIV]], 2
-; CHECK-NEXT:    [[CMP111LOAD_B:%.*]] = icmp sgt i32 [[NUM_IN]], 0
-; CHECK-NEXT:    [[IDXPROMLOAD_PROM_A0:%.*]] = sext i32 [[NUM_IN]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
-; CHECK-NEXT:    br i1 [[CMP111LOAD_B]], label [[ITER_CHECK:%.*]], label [[FOR_END98]]
-; CHECK:       iter.check:
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[NUM_IN]], 16
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP2]], i64 16, i64 [[N_MOD_VF]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -770,36 +757,48 @@ define i32 @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[TMP10]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP241:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP10]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[TMP10]], 3
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP8]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP242:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP160:%.*]] = or disjoint i64 [[TMP10]], 3
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP160]]
+; CHECK-NEXT:    [[TMP243:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP160]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP13]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP17]])
-; CHECK-NEXT:    [[WIDE_VEC5:%.*]] = load <128 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = shufflevector <128 x i8> [[WIDE_VEC5]], <128 x i8> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP16]]
+; CHECK-NEXT:    [[TMP244:%.*]] = getelementptr inbounds i8, ptr [[TMP241]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP244]], align 1
+; CHECK-NEXT:    [[TMP245:%.*]] = sext <16 x i8> [[WIDE_LOAD7]] to <16 x i32>
+; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP245]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP21]])
-; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <128 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = shufflevector <128 x i8> [[WIDE_VEC8]], <128 x i8> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
 ; CHECK-NEXT:    [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
-; CHECK-NEXT:    [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP16]]
+; CHECK-NEXT:    [[TMP246:%.*]] = getelementptr inbounds i8, ptr [[TMP242]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP246]], align 1
+; CHECK-NEXT:    [[TMP247:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP247]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP25]])
-; CHECK-NEXT:    [[WIDE_VEC11:%.*]] = load <128 x i8>, ptr [[TMP9]], align 1
-; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = shufflevector <128 x i8> [[WIDE_VEC11]], <128 x i8> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
-; CHECK-NEXT:    [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP16]]
+; CHECK-NEXT:    [[TMP248:%.*]] = getelementptr inbounds i8, ptr [[TMP243]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP248]], align 1
+; CHECK-NEXT:    [[TMP249:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP249]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP29]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -809,63 +808,74 @@ define i32 @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE9]])
 ; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
 ; CHECK-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-NEXT:    br label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[NUM_IN]], [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP31]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX12:%.*]] = phi i32 [ [[TMP32]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP33]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP34]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[N_MOD_VF15:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[N_MOD_VF15]], 0
-; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 8, i64 [[N_MOD_VF15]]
-; CHECK-NEXT:    [[N_VEC16:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP27]]
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[N_MOD_VF15:%.*]] = urem i64 [[NUM_IN]], 8
+; CHECK-NEXT:    [[N_VEC16:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF15]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX12]], i32 0
 ; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX13]], i32 0
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX14]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX17:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX17:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL1]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI18:%.*]] = phi <2 x i32> [ [[TMP37]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE30:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI19:%.*]] = phi <2 x i32> [ [[TMP38]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI20:%.*]] = phi <2 x i32> [ [[TMP35]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI21:%.*]] = phi <2 x i32> [ [[TMP36]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE24:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP39:%.*]] = add i64 [[INDEX17]], 0
-; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP39]]
+; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP39]]
+; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP39]]
 ; CHECK-NEXT:    [[TMP44:%.*]] = or disjoint i64 [[TMP39]], 1
-; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP44]]
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP44]]
+; CHECK-NEXT:    [[TMP250:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP44]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = or disjoint i64 [[TMP39]], 2
-; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP46]]
-; CHECK-NEXT:    [[TMP54:%.*]] = or disjoint i64 [[TMP39]], 3
-; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP54]]
-; CHECK-NEXT:    [[WIDE_VEC24:%.*]] = load <64 x i8>, ptr [[TMP40]], align 1
-; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = shufflevector <64 x i8> [[WIDE_VEC24]], <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
+; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP46]]
+; CHECK-NEXT:    [[TMP251:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP46]]
+; CHECK-NEXT:    [[TMP180:%.*]] = or disjoint i64 [[TMP39]], 3
+; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP180]]
+; CHECK-NEXT:    [[TMP252:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP180]]
+; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP47]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i8>, ptr [[TMP59]], align 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = sext <8 x i8> [[WIDE_LOAD22]] to <8 x i32>
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP39]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[TMP49]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x i8>, ptr [[TMP50]], align 1
 ; CHECK-NEXT:    [[TMP42:%.*]] = sext <8 x i8> [[WIDE_LOAD23]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP43:%.*]] = mul nsw <8 x i32> [[TMP42]], [[TMP41]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE24]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI21]], <8 x i32> [[TMP43]])
-; CHECK-NEXT:    [[WIDE_VEC28:%.*]] = load <64 x i8>, ptr [[TMP45]], align 1
-; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = shufflevector <64 x i8> [[WIDE_VEC28]], <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
+; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[TMP45]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = load <8 x i8>, ptr [[TMP58]], align 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <8 x i8> [[WIDE_LOAD25]] to <8 x i32>
-; CHECK-NEXT:    [[TMP48:%.*]] = mul nsw <8 x i32> [[TMP53]], [[TMP42]]
+; CHECK-NEXT:    [[TMP253:%.*]] = getelementptr inbounds i8, ptr [[TMP250]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <8 x i8>, ptr [[TMP253]], align 1
+; CHECK-NEXT:    [[TMP256:%.*]] = sext <8 x i8> [[WIDE_LOAD32]] to <8 x i32>
+; CHECK-NEXT:    [[TMP48:%.*]] = mul nsw <8 x i32> [[TMP53]], [[TMP256]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE26]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI20]], <8 x i32> [[TMP48]])
-; CHECK-NEXT:    [[WIDE_VEC31:%.*]] = load <64 x i8>, ptr [[TMP47]], align 1
-; CHECK-NEXT:    [[WIDE_LOAD27:%.*]] = shufflevector <64 x i8> [[WIDE_VEC31]], <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
+; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD27:%.*]] = load <8 x i8>, ptr [[TMP63]], align 1
 ; CHECK-NEXT:    [[TMP51:%.*]] = sext <8 x i8> [[WIDE_LOAD27]] to <8 x i32>
-; CHECK-NEXT:    [[TMP52:%.*]] = mul nsw <8 x i32> [[TMP51]], [[TMP42]]
+; CHECK-NEXT:    [[TMP257:%.*]] = getelementptr inbounds i8, ptr [[TMP251]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD34:%.*]] = load <8 x i8>, ptr [[TMP257]], align 1
+; CHECK-NEXT:    [[TMP258:%.*]] = sext <8 x i8> [[WIDE_LOAD34]] to <8 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = mul nsw <8 x i32> [[TMP51]], [[TMP258]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE28]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI19]], <8 x i32> [[TMP52]])
-; CHECK-NEXT:    [[WIDE_VEC34:%.*]] = load <64 x i8>, ptr [[TMP55]], align 1
-; CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = shufflevector <64 x i8> [[WIDE_VEC34]], <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[TMP55]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <8 x i8>, ptr [[TMP68]], align 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = sext <8 x i8> [[WIDE_LOAD29]] to <8 x i32>
-; CHECK-NEXT:    [[TMP56:%.*]] = mul nsw <8 x i32> [[TMP57]], [[TMP42]]
+; CHECK-NEXT:    [[TMP254:%.*]] = getelementptr inbounds i8, ptr [[TMP252]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD36:%.*]] = load <8 x i8>, ptr [[TMP254]], align 1
+; CHECK-NEXT:    [[TMP255:%.*]] = sext <8 x i8> [[WIDE_LOAD36]] to <8 x i32>
+; CHECK-NEXT:    [[TMP56:%.*]] = mul nsw <8 x i32> [[TMP57]], [[TMP255]]
 ; CHECK-NEXT:    [[PARTIAL_REDUCE30]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI18]], <8 x i32> [[TMP56]])
 ; CHECK-NEXT:    [[INDEX_NEXT27]] = add nuw i64 [[INDEX17]], 8
 ; CHECK-NEXT:    [[TMP69:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC16]]
@@ -875,124 +885,149 @@ define i32 @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE28]])
 ; CHECK-NEXT:    [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE26]])
 ; CHECK-NEXT:    [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE24]])
-; CHECK-NEXT:    br label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N38:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC16]]
+; CHECK-NEXT:    br i1 [[CMP_N38]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ;
 entry:
-  %cmp154 = icmp sgt i32 %num_out, 3
-  br i1 %cmp154, label %for.body.lr.ph, label %for.end98
-
-for.body.lr.ph:                                   ; preds = %entry
-  %div = sdiv i32 %num_out, 4
-  %mul = shl nsw i32 %div, 2
-  %cmp.num_in = icmp sgt i32 %num_in, 0
-  %ext.num_in = sext i32 %num_in to i64
-  %max_iv = zext nneg i32 %mul to i64
-  br i1 %cmp.num_in, label %for.body.us.preheader, label %for.end98
-
-for.body.us.preheader:                            ; preds = %for.body.lr.ph
-  %wide.trip.count = zext nneg i32 %num_in to i64
-  br label %for.body12.us
+  br label %for.body
 
-for.body12.us:                                    ; preds = %for.body.us, %for.body12.us
-  %iv = phi i64 [ 0, %for.body.us.preheader ], [ %iv.next, %for.body12.us ]
-  %accum3 = phi i32 [ 0, %for.body.us.preheader ], [ %add.a3, %for.body12.us ]
-  %accum2 = phi i32 [ 0, %for.body.us.preheader ], [ %add.a2, %for.body12.us ]
-  %accum1 = phi i32 [ 0, %for.body.us.preheader ], [ %add.a1, %for.body12.us ]
-  %accum0 = phi i32 [ 0, %for.body.us.preheader ], [ %add.a0, %for.body12.us ]
-  %gep.a0 = getelementptr inbounds ptr, ptr %a, i64 %iv
+for.body:                                    ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ]
+  %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ]
+  %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ]
+  %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ]
+  %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv
+  %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv
   %offset.1 = or disjoint i64 %iv, 1
-  %gep.a1 = getelementptr inbounds ptr, ptr %a, i64 %offset.1
+  %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1
+  %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1
   %offset.2 = or disjoint i64 %iv, 2
-  %gep.a2 = getelementptr inbounds ptr, ptr %a, i64 %offset.2
+  %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2
+  %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2
   %offset.3 = or disjoint i64 %iv, 3
-  %gep.a3 = getelementptr inbounds ptr, ptr %a, i64 %offset.3
+  %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3
+  %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3
   %load.a0 = load i8, ptr %gep.a0, align 1
   %ext.a0 = sext i8 %load.a0 to i32
-  %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
-  %load.b = load i8, ptr %gep.b, align 1
-  %ext.b = sext i8 %load.b to i32
-  %mul.a0 = mul nsw i32 %ext.b, %ext.a0
+  %load.b0 = load i8, ptr %gep.b0, align 1
+  %ext.b0 = sext i8 %load.b0 to i32
+  %mul.a0 = mul nsw i32 %ext.b0, %ext.a0
   %add.a0 = add nsw i32 %mul.a0, %accum0
   %load.a1 = load i8, ptr %gep.a1, align 1
   %ext.a1 = sext i8 %load.a1 to i32
-  %mul.a1 = mul nsw i32 %ext.a1, %ext.b
+  %load.b1 = load i8, ptr %gep.b1, align 1
+  %ext.b1 = sext i8 %load.b1 to i32
+  %mul.a1 = mul nsw i32 %ext.a1, %ext.b1
   %add.a1 = add nsw i32 %mul.a1, %accum1
   %load.a2 = load i8, ptr %gep.a2, align 1
   %ext.a2 = sext i8 %load.a2 to i32
-  %mul.a2 = mul nsw i32 %ext.a2, %ext.b
+  %load.b2 = load i8, ptr %gep.b2, align 1
+  %ext.b2 = sext i8 %load.b2 to i32
+  %mul.a2 = mul nsw i32 %ext.a2, %ext.b2
   %add.a2 = add nsw i32 %mul.a2, %accum2
   %load.a3 = load i8, ptr %gep.a3, align 1
   %ext.a3 = sext i8 %load.a3 to i32
-  %mul.a3 = mul nsw i32 %ext.a3, %ext.b
+  %load.b3 = load i8, ptr %gep.b3, align 1
+  %ext.b3 = sext i8 %load.b3 to i32
+  %mul.a3 = mul nsw i32 %ext.a3, %ext.b3
   %add.a3 = add nsw i32 %mul.a3, %accum3
   %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.end98, label %for.body12.us
+  %exitcond.not = icmp eq i64 %iv.next, %num_in
+  br i1 %exitcond.not, label %exit, label %for.body
 
-for.end98:                                        ; preds = %for.body12.us, %entry
-  %result.a0 = phi i32 [ 0, %for.body.lr.ph ], [ 0, %entry ], [ %add.a0, %for.body12.us ]
-  %result.a1 = phi i32 [ 0, %for.body.lr.ph ], [ 0, %entry ], [ %add.a1, %for.body12.us ]
-  %result.a2 = phi i32 [ 0, %for.body.lr.ph ], [ 0, %entry ], [ %add.a2, %for.body12.us ]
-  %result.a3 = phi i32 [ 0, %for.body.lr.ph ], [ 0, %entry ], [ %add.a3, %for.body12.us ]
-  %result0 = add nsw i32 %result.a0, %result.a1
-  %result1 = add nsw i32 %result.a2, %result.a3
+exit:                                        ; preds = %for.body
+  %result0 = add nsw i32 %add.a0, %add.a1
+  %result1 = add nsw i32 %add.a2, %add.a3
   %result = add nsw i32 %result0, %result1
   ret i32 %result
 }
 
-define i32 @not_dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @not_dotp_predicated(
-; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[N]], 16
-; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[REM]], 0
-; CHECK-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REM]] to i64
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 15
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP10]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i32> [[TMP17]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP15]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP8]], <16 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP18:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP18]], i32 0
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated(
+; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT:  iter.check:
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVE1:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1:       vector.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-INTERLEAVE1:       middle.block:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated(
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT:  iter.check:
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVED:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 32
+; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
+; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE6]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP12]])
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-INTERLEAVED:       middle.block:
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ;
 entry:
-  %rem = srem i32 %N, 16
-  %cmp8 = icmp sgt i32 %rem, 0
-  br i1 %cmp8, label %for.body.preheader, label %for.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext nneg i32 %rem to i64
   br label %for.body
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %accum = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ]
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
   %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
   %load.a = load i8, ptr %gep.a, align 1
   %ext.a = sext i8 %load.a to i32
@@ -1002,32 +1037,26 @@ for.body:                                         ; preds = %for.body.preheader,
   %mul = mul nsw i32 %ext.b, %ext.a
   %add = add nsw i32 %mul, %accum
   %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.exit, label %for.body
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %for.body
 
-for.exit:                        ; preds = %for.body
-  %add.lcssa = phi i32 [ %add, %for.body ]
-  br label %for.cleanup
-
-for.cleanup:                                 ; preds = %for.exit, %entry
-  %total = phi i32 [ 0, %entry ], [ %add.lcssa, %for.exit ]
-  ret i32 %total
+exit:                        ; preds = %for.body
+  ret i32 %add
 }
 
-define i32 @not_dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
+define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: define i32 @not_dotp_predicated_pragma(
-; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP8_NOT:%.*]] = icmp eq i32 [[N]], 0
-; CHECK-NEXT:    br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 15
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], 15
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[N]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[N]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1046,25 +1075,20 @@ define i32 @not_dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i32> [[TMP17]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP15]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP8]], <16 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP2]])
 ; CHECK-NEXT:    [[TMP18:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP18]], i32 0
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
-  %cmp8.not = icmp eq i32 %N, 0
-  br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
-  %accum = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ]
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
   %gep.a = getelementptr inbounds nuw i8, ptr %b, i64 %iv
   %load.a = load i8, ptr %gep.a, align 1
   %ext.a = sext i8 %load.a to i32
@@ -1074,16 +1098,11 @@ for.body:                                         ; preds = %for.body.preheader,
   %mul = mul nsw i32 %ext.b, %ext.a
   %add = add nsw i32 %mul, %accum
   %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.exit, label %for.body, !llvm.loop !7
-
-for.exit:                        ; preds = %for.body
-  %add.lcssa = phi i32 [ %add, %for.body ]
-  br label %for.cond.cleanup
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7
 
-for.cond.cleanup:                                 ; preds = %for.exit, %entry
-  %total = phi i32 [ 0, %entry ], [ %add.lcssa, %for.exit ]
-  ret i32 %total
+exit:                        ; preds = %for.body
+  ret i32 %add
 }
 
 define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
@@ -1114,7 +1133,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP10]] = add <16 x i32> [[TMP9]], [[VEC_PHI]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1152,7 +1171,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP16]] = add <16 x i32> [[TMP14]], [[VEC_PHI1]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ;
 entry:
   br label %for.body

>From 8db7d81839b93cbefb19961ceb47febf1b8531c6 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Thu, 31 Oct 2024 16:07:33 +0000
Subject: [PATCH 25/56] Improve printing test

---
 .../LoopVectorize/AArch64/vplan-printing.ll   | 54 +++++++++----------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 79e7abb367be7a..fe29c3ebf5eda7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -21,15 +21,15 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<%add>
 ; CHECK-NEXT:   vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:   CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT:   vp<%4> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT:   WIDEN ir<%0> = load vp<%4>
-; CHECK-NEXT:   WIDEN-CAST ir<%conv> = zext ir<%0> to i32
-; CHECK-NEXT:   CLONE ir<%arrayidx2> = getelementptr ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT:   vp<%5> = vector-pointer ir<%arrayidx2>
-; CHECK-NEXT:   WIDEN ir<%1> = load vp<%5>
-; CHECK-NEXT:   WIDEN-CAST ir<%conv3> = zext ir<%1> to i32
-; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%conv3>, ir<%conv>
+; CHECK-NEXT:   CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
+; CHECK-NEXT:   vp<%4> = vector-pointer ir<%gep.a>
+; CHECK-NEXT:   WIDEN ir<%load.a> = load vp<%4>
+; CHECK-NEXT:   WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
+; CHECK-NEXT:   CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]>
+; CHECK-NEXT:   vp<%5> = vector-pointer ir<%gep.b>
+; CHECK-NEXT:   WIDEN ir<%load.b> = load vp<%5>
+; CHECK-NEXT:   WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
+; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
 ; CHECK-NEXT:   WIDEN ir<%add> = add ir<%mul>, ir<[[ACC]]>
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
@@ -64,15 +64,15 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, vp<%6> (VF scaled by 1/4)
 ; CHECK-NEXT:   vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:   CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT:   vp<%4> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT:   WIDEN ir<%0> = load vp<%4>
-; CHECK-NEXT:   WIDEN-CAST ir<%conv> = zext ir<%0> to i32
-; CHECK-NEXT:   CLONE ir<%arrayidx2> = getelementptr ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT:   vp<%5> = vector-pointer ir<%arrayidx2>
-; CHECK-NEXT:   WIDEN ir<%1> = load vp<%5>
-; CHECK-NEXT:   WIDEN-CAST ir<%conv3> = zext ir<%1> to i32
-; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%conv3>, ir<%conv>
+; CHECK-NEXT:   CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
+; CHECK-NEXT:   vp<%4> = vector-pointer ir<%gep.a>
+; CHECK-NEXT:   WIDEN ir<%load.a> = load vp<%4>
+; CHECK-NEXT:   WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
+; CHECK-NEXT:   CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]>
+; CHECK-NEXT:   vp<%5> = vector-pointer ir<%gep.b>
+; CHECK-NEXT:   WIDEN ir<%load.b> = load vp<%5>
+; CHECK-NEXT:   WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
+; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
 ; CHECK-NEXT:   PARTIAL-REDUCE vp<%6> = add ir<%mul>, ir<[[ACC]]>
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
@@ -100,15 +100,15 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-  %acc.010 = phi i32 [ 0, %entry ], [ %add, %for.body ]
-  %arrayidx = getelementptr i8, ptr %a, i64 %iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %conv = zext i8 %1 to i32
-  %arrayidx2 = getelementptr i8, ptr %b, i64 %iv
-  %2 = load i8, ptr %arrayidx2, align 1
-  %conv3 = zext i8 %2 to i32
-  %mul = mul i32 %conv3, %conv
-  %add = add i32 %mul, %acc.010
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
   %iv.next = add i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 0
   br i1 %exitcond.not, label %exit, label %for.body

>From 94d10080130a1d26daef5ffed08897a3a0780ee5 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 1 Nov 2024 10:20:04 +0000
Subject: [PATCH 26/56] Fix windows build failure

---
 llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index a886a5bea5e037..1ba8148010371f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -269,7 +269,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
           })
           .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPWidenEVLRecipe,
                 VPReplicateRecipe, VPWidenCallRecipe, VPWidenMemoryRecipe,
-                VPWidenSelectRecipe, VPPartialReductionRecipe>(
+                VPWidenSelectRecipe>(
               [this](const auto *R) { return inferScalarTypeForRecipe(R); })
           .Case<VPWidenIntrinsicRecipe>([](const VPWidenIntrinsicRecipe *R) {
             return R->getResultType();

>From aced10b670cc86ad7ed8a0d9a3e35d99a033d8ac Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 1 Nov 2024 16:01:27 +0000
Subject: [PATCH 27/56] Clean up invalid partial reduction removal

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 22 ++++++++-----------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4a1896bedcb2dc..324e8150c6c092 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8830,27 +8830,23 @@ void VPRecipeBuilder::removeInvalidScaledReductionExitInstrs() {
   // extends are intended to be lowered along with the reduction itself.
 
   // Build up a set of partial reduction bin ops for efficient use checking
-  SmallSet<Instruction *, 4> PartialReductionBinOps;
-  for (auto It : ScaledReductionExitInstrs) {
-    if (It.second.BinOp)
-      PartialReductionBinOps.insert(It.second.BinOp);
+  SmallSet<User *, 4> PartialReductionBinOps;
+  for (const auto &[_, Chain] : ScaledReductionExitInstrs) {
+    if (Chain.BinOp)
+      PartialReductionBinOps.insert(Chain.BinOp);
   }
 
   auto ExtendIsOnlyUsedByPartialReductions =
-      [PartialReductionBinOps](Instruction *Extend) {
-        for (auto *Use : Extend->users()) {
-          Instruction *UseInstr = dyn_cast<Instruction>(Use);
-          if (!PartialReductionBinOps.contains(UseInstr))
-            return false;
-        }
-        return true;
+      [&PartialReductionBinOps](Instruction *Extend) {
+        return all_of(Extend->users(), [&](const User *U) {
+          return PartialReductionBinOps.contains(U);
+        });
       };
 
   // Check if each use of a chain's two extends is a partial reduction
   // and remove those that have non-partial reduction users
   SmallSet<Instruction *, 4> PartialReductionsToRemove;
-  for (auto It : ScaledReductionExitInstrs) {
-    PartialReductionChain Chain = It.second;
+  for (const auto &[_, Chain] : ScaledReductionExitInstrs) {
     if (!ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) ||
         !ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
       PartialReductionsToRemove.insert(Chain.Reduction);

>From b55c110f90e2015f16a04a186a7e906350016527 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 1 Nov 2024 16:02:47 +0000
Subject: [PATCH 28/56] Remove unndeeded stuff from VPlanAnalysis.cpp

---
 llvm/lib/Transforms/Vectorize/VPlanAnalysis.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index a34d9629eff9dd..cc21870bee2e3b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -27,7 +27,6 @@ struct VPWidenSelectRecipe;
 class VPReplicateRecipe;
 class VPRecipeBase;
 class VPlan;
-class VPPartialReductionRecipe;
 class Type;
 
 /// An analysis for type-inference for VPValues.
@@ -54,7 +53,6 @@ class VPTypeAnalysis {
   Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R);
   Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R);
   Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R);
-  Type *inferScalarTypeForRecipe(const VPPartialReductionRecipe *R);
 
 public:
   VPTypeAnalysis(Type *CanonicalIVTy)

>From 56ee0bf47f6e037939e77e7ce6b402a534a92c99 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 1 Nov 2024 16:24:00 +0000
Subject: [PATCH 29/56] Improve naming in codegen test

---
 .../CodeGen/AArch64/partial-reduce-sdot.ll    | 112 +++++++-----------
 1 file changed, 41 insertions(+), 71 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
index 3c82a4868f659d..7034bc6a9c6357 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
@@ -1,90 +1,59 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"
 
-define void @dotp(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define void @dotp(
+define i32 @dotp(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @dotp(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  entry:
+; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY1:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP25]], align 1
-; CHECK-NEXT:    [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP15:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-NEXT:    [[TMP14]] = add <vscale x 4 x i32> [[TMP15]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = lshr i32 [[ADD_LCSSA]], 0
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ACC_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP18]] to i32
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
-; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP22]] to i32
-; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[CONV3]], [[CONV]]
-; CHECK-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACC_010]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 0
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = mul <16 x i32> [[TMP8]], [[TMP22]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP23]])
+; CHECK-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
 ;
 entry:
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  %0 = lshr i32 %add, 0
-  ret void
-
 for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %acc.010 = phi i32 [ 0, %entry ], [ %add, %for.body ]
-  %arrayidx = getelementptr i8, ptr %a, i64 %indvars.iv
-  %1 = load i8, ptr %arrayidx, align 1
-  %conv = zext i8 %1 to i32
-  %arrayidx2 = getelementptr i8, ptr %b, i64 %indvars.iv
-  %2 = load i8, ptr %arrayidx2, align 1
-  %conv3 = zext i8 %2 to i32
-  %mul = mul i32 %conv3, %conv
-  %add = add i32 %mul, %acc.010
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, 0
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                        ; preds = %for.body
+  ret i32 %add
 
 ; uselistorder directives
   uselistorder i32 %add, { 1, 0 }
@@ -95,5 +64,6 @@ attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a"
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
 ;.

>From fe8df54c9b1d202914be73433bd2641a6558b270 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Thu, 7 Nov 2024 10:28:15 +0000
Subject: [PATCH 30/56] Add full stop to comments and document
 tryToCreatePartialReduction

---
 llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index d21ff8e71cdb92..a98ec3a8b9abca 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -26,19 +26,19 @@ class TargetTransformInfo;
 
 /// A chain of instructions that form a partial reduction.
 /// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))),
-/// accumulator)
+/// accumulator).
 struct PartialReductionChain {
   /// The top-level binary operation that forms the reduction to a scalar
-  /// after the loop body
+  /// after the loop body.
   Instruction *Reduction;
-  /// The extension of each of the inner binary operation's operands
+  /// The extension of each of the inner binary operation's operands.
   Instruction *ExtendA;
   Instruction *ExtendB;
 
   Instruction *BinOp;
 
   /// The scaling factor between the size of the reduction type and the
-  /// (possibly extended) inputs
+  /// (possibly extended) inputs.
   unsigned ScaleFactor;
 };
 
@@ -167,6 +167,8 @@ class VPRecipeBuilder {
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPBasicBlock *VPBB);
 
+  // Create and return a partial reduction recipe for a reduction instruction
+  // along with binary operation and reduction phi operands.
   VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
                                             ArrayRef<VPValue *> Operands);
 

>From 06ebf27ce909e78bb4af17f975fc56a70814be64 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 7 Nov 2024 17:19:21 +0000
Subject: [PATCH 31/56] Only add the chains that are valid

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 25 +++++++++----------
 .../Transforms/Vectorize/VPRecipeBuilder.h    |  6 +----
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 324e8150c6c092..6f05f518620255 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8824,16 +8824,17 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   return tryToWiden(Instr, Operands, VPBB);
 }
 
-void VPRecipeBuilder::removeInvalidScaledReductionExitInstrs() {
+void VPRecipeBuilder::addScaledReductionExitInstrs(
+    SmallVector<PartialReductionChain> Chains) {
   // A partial reduction is invalid if any of its extends are used by
   // something that isn't another partial reduction. This is because the
   // extends are intended to be lowered along with the reduction itself.
 
   // Build up a set of partial reduction bin ops for efficient use checking
   SmallSet<User *, 4> PartialReductionBinOps;
-  for (const auto &[_, Chain] : ScaledReductionExitInstrs) {
-    if (Chain.BinOp)
-      PartialReductionBinOps.insert(Chain.BinOp);
+  for (auto It : Chains) {
+    if (It.BinOp)
+      PartialReductionBinOps.insert(It.BinOp);
   }
 
   auto ExtendIsOnlyUsedByPartialReductions =
@@ -8844,16 +8845,13 @@ void VPRecipeBuilder::removeInvalidScaledReductionExitInstrs() {
       };
 
   // Check if each use of a chain's two extends is a partial reduction
-  // and remove those that have non-partial reduction users
-  SmallSet<Instruction *, 4> PartialReductionsToRemove;
-  for (const auto &[_, Chain] : ScaledReductionExitInstrs) {
+  // and only add those those that don't have non-partial reduction users
+  for (auto It : Chains) {
+    PartialReductionChain Chain = It;
     if (!ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) ||
         !ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
-      PartialReductionsToRemove.insert(Chain.Reduction);
+      ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Chain));
   }
-
-  for (auto *Instr : PartialReductionsToRemove)
-    ScaledReductionExitInstrs.erase(Instr);
 }
 
 VPRecipeBase *
@@ -9223,11 +9221,12 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
 
   // Cache the partial reductions up front so we can remove the invalid ones
   // before creating the recipes
+  SmallVector<PartialReductionChain, 1> PartialReductionChains;
   for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
     if (std::optional<PartialReductionChain> Chain =
             getScaledReduction(Phi, RdxDesc, &TTI, Range, CM))
-      RecipeBuilder.addScaledReductionExitInstr(*Chain);
-  RecipeBuilder.removeInvalidScaledReductionExitInstrs();
+      PartialReductionChains.push_back(*Chain);
+  RecipeBuilder.addScaledReductionExitInstrs(PartialReductionChains);
 
   auto *MiddleVPBB = Plan->getMiddleBlock();
   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index a98ec3a8b9abca..530401a7520670 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -147,10 +147,6 @@ class VPRecipeBuilder {
       : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal),
         CM(CM), PSE(PSE), Builder(Builder) {}
 
-  void addScaledReductionExitInstr(PartialReductionChain Chain) {
-    ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Chain));
-  }
-
   std::optional<PartialReductionChain>
   getScaledReductionForInstr(const Instruction *ExitInst) {
     auto It = ScaledReductionExitInstrs.find(ExitInst);
@@ -159,7 +155,7 @@ class VPRecipeBuilder {
                : std::make_optional(It->second);
   }
 
-  void removeInvalidScaledReductionExitInstrs();
+  void addScaledReductionExitInstrs(SmallVector<PartialReductionChain> Chains);
 
   /// Create and return a widened recipe for \p I if one can be created within
   /// the given VF \p Range.

>From 46e97585fdf6fe2d731e62a56fc865e11b969c7b Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 8 Nov 2024 14:26:01 +0000
Subject: [PATCH 32/56] Add computeCost

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   4 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   4 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  24 ++--
 .../AArch64/partial-reduce-dot-product.ll     | 116 +++++++++---------
 .../LoopVectorize/AArch64/vplan-printing.ll   |  44 ++++++-
 5 files changed, 119 insertions(+), 73 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6f05f518620255..105ff0ef62d554 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8848,8 +8848,8 @@ void VPRecipeBuilder::addScaledReductionExitInstrs(
   // and only add those those that don't have non-partial reduction users
   for (auto It : Chains) {
     PartialReductionChain Chain = It;
-    if (!ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) ||
-        !ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
+    if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
+        ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
       ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Chain));
   }
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 3a9abcbd5e9127..43736dc69e528a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2409,6 +2409,10 @@ class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
   /// Generate the reduction in the loop
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPPartialReductionRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8d51a36d518ac0..cacee66bde7abe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -291,16 +291,20 @@ InstructionCost VPRecipeBase::computeCost(ElementCount VF,
   llvm_unreachable("subclasses should implement computeCost");
 }
 
-InstructionCost VPSingleDefRecipe::computeCost(ElementCount VF,
-                                               VPCostContext &Ctx) const {
-  Instruction *UI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
-  if (isa<VPReplicateRecipe>(this)) {
-    assert(UI && "VPReplicateRecipe must have an underlying instruction");
-    // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
-    // transform, avoid computing their cost multiple times for now.
-    Ctx.SkipCostComputation.insert(UI);
-  }
-  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
+InstructionCost
+VPPartialReductionRecipe::computeCost(ElementCount VF,
+                                      VPCostContext &Ctx) const {
+  auto *BinOp = cast<BinaryOperator>(getOperand(0)->getUnderlyingValue());
+  auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe());
+  auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
+  auto *ExtA = cast<Instruction>(BinOp->getOperand(0));
+  auto *ExtB = cast<Instruction>(BinOp->getOperand(1));
+  Value *A = ExtA->getOperand(0);
+  return Ctx.TTI.getPartialReductionCost(
+      Opcode, A->getType(), Phi->getType(), VF,
+      TargetTransformInfo::getPartialReductionExtendKind(ExtA),
+      TargetTransformInfo::getPartialReductionExtendKind(ExtB),
+      std::make_optional(BinOp->getOpcode()));
 }
 
 void VPPartialReductionRecipe::execute(VPTransformState &State) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 4c839fccf70498..8fa93b53323ed6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -530,8 +530,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
@@ -568,8 +567,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
@@ -640,8 +638,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
@@ -677,8 +674,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
@@ -696,14 +692,12 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]]
@@ -741,7 +735,9 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: define i32 @dotp_unrolled(
 ; CHECK-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  iter.check:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 8
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[NUM_IN]], 16
@@ -812,7 +808,9 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[NUM_IN]], [[N_VEC]]
-; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP51]], 4
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP40]]
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP31]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -820,19 +818,23 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP33]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP34]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[N_MOD_VF15:%.*]] = urem i64 [[NUM_IN]], 8
+; CHECK-NEXT:    [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 4
+; CHECK-NEXT:    [[N_MOD_VF15:%.*]] = urem i64 [[NUM_IN]], [[TMP42]]
 ; CHECK-NEXT:    [[N_VEC16:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF15]]
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX12]], i32 0
-; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX13]], i32 0
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX14]], i32 0
+; CHECK-NEXT:    [[TMP43:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP52:%.*]] = mul i64 [[TMP43]], 4
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX12]], i32 0
+; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX13]], i32 0
+; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX14]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX17:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL1]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI18:%.*]] = phi <2 x i32> [ [[TMP37]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE30:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI19:%.*]] = phi <2 x i32> [ [[TMP38]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI20:%.*]] = phi <2 x i32> [ [[TMP35]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI21:%.*]] = phi <2 x i32> [ [[TMP36]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE24:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI21:%.*]] = phi <vscale x 4 x i32> [ [[TMP53]], [[VEC_EPILOG_PH]] ], [ [[TMP84:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI22:%.*]] = phi <vscale x 4 x i32> [ [[TMP56]], [[VEC_EPILOG_PH]] ], [ [[TMP78:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI23:%.*]] = phi <vscale x 4 x i32> [ [[TMP57]], [[VEC_EPILOG_PH]] ], [ [[TMP72:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI24:%.*]] = phi <vscale x 4 x i32> [ [[TMP48]], [[VEC_EPILOG_PH]] ], [ [[TMP66:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP39:%.*]] = add i64 [[INDEX17]], 0
 ; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP39]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP39]]
@@ -846,45 +848,45 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP180]]
 ; CHECK-NEXT:    [[TMP252:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP180]]
 ; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP47]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = load <8 x i8>, ptr [[TMP59]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = sext <8 x i8> [[WIDE_LOAD22]] to <8 x i32>
+; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = load <vscale x 4 x i8>, ptr [[TMP59]], align 1
+; CHECK-NEXT:    [[TMP62:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD25]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[TMP49]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x i8>, ptr [[TMP50]], align 1
-; CHECK-NEXT:    [[TMP42:%.*]] = sext <8 x i8> [[WIDE_LOAD23]] to <8 x i32>
-; CHECK-NEXT:    [[TMP43:%.*]] = mul nsw <8 x i32> [[TMP42]], [[TMP41]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE24]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI21]], <8 x i32> [[TMP43]])
+; CHECK-NEXT:    [[WIDE_LOAD26:%.*]] = load <vscale x 4 x i8>, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP64:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD26]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP65:%.*]] = mul nsw <vscale x 4 x i32> [[TMP64]], [[TMP62]]
+; CHECK-NEXT:    [[TMP66]] = add <vscale x 4 x i32> [[TMP65]], [[VEC_PHI24]]
 ; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[TMP45]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = load <8 x i8>, ptr [[TMP58]], align 1
-; CHECK-NEXT:    [[TMP53:%.*]] = sext <8 x i8> [[WIDE_LOAD25]] to <8 x i32>
+; CHECK-NEXT:    [[WIDE_LOAD27:%.*]] = load <vscale x 4 x i8>, ptr [[TMP58]], align 1
+; CHECK-NEXT:    [[TMP73:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD27]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    [[TMP253:%.*]] = getelementptr inbounds i8, ptr [[TMP250]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <8 x i8>, ptr [[TMP253]], align 1
-; CHECK-NEXT:    [[TMP256:%.*]] = sext <8 x i8> [[WIDE_LOAD32]] to <8 x i32>
-; CHECK-NEXT:    [[TMP48:%.*]] = mul nsw <8 x i32> [[TMP53]], [[TMP256]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE26]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI20]], <8 x i32> [[TMP48]])
+; CHECK-NEXT:    [[WIDE_LOAD28:%.*]] = load <vscale x 4 x i8>, ptr [[TMP253]], align 1
+; CHECK-NEXT:    [[TMP70:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD28]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP71:%.*]] = mul nsw <vscale x 4 x i32> [[TMP73]], [[TMP70]]
+; CHECK-NEXT:    [[TMP72]] = add <vscale x 4 x i32> [[TMP71]], [[VEC_PHI23]]
 ; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD27:%.*]] = load <8 x i8>, ptr [[TMP63]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = sext <8 x i8> [[WIDE_LOAD27]] to <8 x i32>
+; CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <vscale x 4 x i8>, ptr [[TMP63]], align 1
+; CHECK-NEXT:    [[TMP74:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD29]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    [[TMP257:%.*]] = getelementptr inbounds i8, ptr [[TMP251]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD34:%.*]] = load <8 x i8>, ptr [[TMP257]], align 1
-; CHECK-NEXT:    [[TMP258:%.*]] = sext <8 x i8> [[WIDE_LOAD34]] to <8 x i32>
-; CHECK-NEXT:    [[TMP52:%.*]] = mul nsw <8 x i32> [[TMP51]], [[TMP258]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE28]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI19]], <8 x i32> [[TMP52]])
+; CHECK-NEXT:    [[WIDE_LOAD30:%.*]] = load <vscale x 4 x i8>, ptr [[TMP257]], align 1
+; CHECK-NEXT:    [[TMP76:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD30]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP77:%.*]] = mul nsw <vscale x 4 x i32> [[TMP74]], [[TMP76]]
+; CHECK-NEXT:    [[TMP78]] = add <vscale x 4 x i32> [[TMP77]], [[VEC_PHI22]]
 ; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[TMP55]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <8 x i8>, ptr [[TMP68]], align 1
-; CHECK-NEXT:    [[TMP57:%.*]] = sext <8 x i8> [[WIDE_LOAD29]] to <8 x i32>
+; CHECK-NEXT:    [[WIDE_LOAD31:%.*]] = load <vscale x 4 x i8>, ptr [[TMP68]], align 1
+; CHECK-NEXT:    [[TMP80:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD31]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    [[TMP254:%.*]] = getelementptr inbounds i8, ptr [[TMP252]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD36:%.*]] = load <8 x i8>, ptr [[TMP254]], align 1
-; CHECK-NEXT:    [[TMP255:%.*]] = sext <8 x i8> [[WIDE_LOAD36]] to <8 x i32>
-; CHECK-NEXT:    [[TMP56:%.*]] = mul nsw <8 x i32> [[TMP57]], [[TMP255]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE30]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI18]], <8 x i32> [[TMP56]])
-; CHECK-NEXT:    [[INDEX_NEXT27]] = add nuw i64 [[INDEX17]], 8
+; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <vscale x 4 x i8>, ptr [[TMP254]], align 1
+; CHECK-NEXT:    [[TMP82:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD32]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP83:%.*]] = mul nsw <vscale x 4 x i32> [[TMP80]], [[TMP82]]
+; CHECK-NEXT:    [[TMP84]] = add <vscale x 4 x i32> [[TMP83]], [[VEC_PHI21]]
+; CHECK-NEXT:    [[INDEX_NEXT27]] = add nuw i64 [[INDEX17]], [[TMP52]]
 ; CHECK-NEXT:    [[TMP69:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC16]]
 ; CHECK-NEXT:    br i1 [[TMP69]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP70:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE30]])
-; CHECK-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE28]])
-; CHECK-NEXT:    [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE26]])
-; CHECK-NEXT:    [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE24]])
+; CHECK-NEXT:    [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP84]])
+; CHECK-NEXT:    [[TMP87:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP78]])
+; CHECK-NEXT:    [[TMP88:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP72]])
+; CHECK-NEXT:    [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP66]])
 ; CHECK-NEXT:    [[CMP_N38:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC16]]
 ; CHECK-NEXT:    br i1 [[CMP_N38]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ;
@@ -947,7 +949,9 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated(
 ; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  iter.check:
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP11]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK-INTERLEAVE1:       vector.main.loop.iter.check:
 ; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16
@@ -981,7 +985,9 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated(
 ; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  iter.check:
-; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP16]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK-INTERLEAVED:       vector.main.loop.iter.check:
 ; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 32
@@ -1076,7 +1082,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[TMP15]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP8]], <16 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP2]])
-; CHECK-NEXT:    [[TMP18:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP18:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP18]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
@@ -1089,10 +1095,10 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
-  %gep.a = getelementptr inbounds nuw i8, ptr %b, i64 %iv
+  %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv
   %load.a = load i8, ptr %gep.a, align 1
   %ext.a = sext i8 %load.a to i32
-  %gep.a2 = getelementptr inbounds nuw i8, ptr %a, i64 %iv
+  %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv
   %load.b = load i8, ptr %gep.a2, align 1
   %ext.b = sext i8 %load.b to i32
   %mul = mul nsw i32 %ext.b, %ext.a
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index fe29c3ebf5eda7..e62d34f59834a7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -38,17 +38,33 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<%8> = compute-reduction-result ir<[[ACC]]>, ir<%add>
-; CHECK-NEXT:   EMIT vp<%9> = extract-from-end vp<%8>, ir<1>
-; CHECK-NEXT:   EMIT vp<%10> = icmp eq ir<0>, vp<%1>
-; CHECK-NEXT:   EMIT branch-on-cond vp<%10>
+; CHECK-NEXT:   EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<%add>
+; CHECK-NEXT:   EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<%1>
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%9>)
+; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]>)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<%bc.merge.rdx> = resume-phi vp<[[RED_RESULT]]>, ir<0>
+; CHECK-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body>:
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:   IR   %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx>)
+; CHECK-NEXT:   IR   %gep.a = getelementptr i8, ptr %a, i64 %iv
+; CHECK-NEXT:   IR   %load.a = load i8, ptr %gep.a, align 1
+; CHECK-NEXT:   IR   %ext.a = zext i8 %load.a to i32
+; CHECK-NEXT:   IR   %gep.b = getelementptr i8, ptr %b, i64 %iv
+; CHECK-NEXT:   IR   %load.b = load i8, ptr %gep.b, align 1
+; CHECK-NEXT:   IR   %ext.b = zext i8 %load.b to i32
+; CHECK-NEXT:   IR   %mul = mul i32 %ext.b, %ext.a
+; CHECK-NEXT:   IR   %add = add i32 %mul, %accum
+; CHECK-NEXT:   IR   %iv.next = add i64 %iv, 1
+; CHECK-NEXT:   IR   %exitcond.not = icmp eq i64 %iv.next, 0
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 ; CHECK:      VPlan 'Initial VPlan for VF={8,16},UF>=1' {
@@ -88,10 +104,26 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]>)
+; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%9>)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<%bc.merge.rdx> = resume-phi vp<[[RED_RESULT]]>, ir<0>
+; CHECK-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body>:
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:   IR   %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx>)
+; CHECK-NEXT:   IR   %gep.a = getelementptr i8, ptr %a, i64 %iv
+; CHECK-NEXT:   IR   %load.a = load i8, ptr %gep.a, align 1
+; CHECK-NEXT:   IR   %ext.a = zext i8 %load.a to i32
+; CHECK-NEXT:   IR   %gep.b = getelementptr i8, ptr %b, i64 %iv
+; CHECK-NEXT:   IR   %load.b = load i8, ptr %gep.b, align 1
+; CHECK-NEXT:   IR   %ext.b = zext i8 %load.b to i32
+; CHECK-NEXT:   IR   %mul = mul i32 %ext.b, %ext.a
+; CHECK-NEXT:   IR   %add = add i32 %mul, %accum
+; CHECK-NEXT:   IR   %iv.next = add i64 %iv, 1
+; CHECK-NEXT:   IR   %exitcond.not = icmp eq i64 %iv.next, 0
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 ;

>From 86745585e1a9d58f25afca6282d8a04567b2c0f8 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 11 Nov 2024 10:21:05 +0000
Subject: [PATCH 33/56] Move functions to VPRecipeBuilder

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 83 +++++++++----------
 .../Transforms/Vectorize/VPRecipeBuilder.h    | 10 ++-
 2 files changed, 46 insertions(+), 47 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 105ff0ef62d554..07bf329d05c436 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8663,16 +8663,48 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
   return Recipe;
 }
 
+void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
+  // Find all possible partial reductions
+  SmallVector<PartialReductionChain, 1> PartialReductionChains;
+  for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
+    if (std::optional<PartialReductionChain> Chain =
+            getScaledReduction(Phi, RdxDesc, Range))
+      PartialReductionChains.push_back(*Chain);
+
+  // A partial reduction is invalid if any of its extends are used by
+  // something that isn't another partial reduction. This is because the
+  // extends are intended to be lowered along with the reduction itself.
+
+  // Build up a set of partial reduction bin ops for efficient use checking
+  SmallSet<User *, 4> PartialReductionBinOps;
+  for (auto It : PartialReductionChains) {
+    if (It.BinOp)
+      PartialReductionBinOps.insert(It.BinOp);
+  }
+
+  auto ExtendIsOnlyUsedByPartialReductions =
+      [&PartialReductionBinOps](Instruction *Extend) {
+        return all_of(Extend->users(), [&](const User *U) {
+          return PartialReductionBinOps.contains(U);
+        });
+      };
+
+  // Check if each use of a chain's two extends is a partial reduction
+  // and only add those those that don't have non-partial reduction users
+  for (auto It : PartialReductionChains) {
+    PartialReductionChain Chain = It;
+    if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
+        ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
+      ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Chain));
+  }
+}
+
 /// Examines reduction operations to see if the target can use a cheaper
 /// operation with a wider per-iteration input VF and narrower PHI VF.
 /// Returns a struct containing the ratio between the two VFs and other cached
 /// information, or null if no scalable reduction was found.
-static std::optional<PartialReductionChain>
-getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx,
-                   const TargetTransformInfo *TTI, VFRange &Range,
-                   LoopVectorizationCostModel &CM) {
-  // FIXME: Should we move this to VPRecipeBuilder and cache the values needed
-  //        for the TTI query?
+std::optional<PartialReductionChain> VPRecipeBuilder::getScaledReduction(
+    PHINode *PHI, const RecurrenceDescriptor &Rdx, VFRange &Range) {
   // TODO: Allow scaling reductions when predicating. The select at
   // the end of the loop chooses between the phi value and most recent
   // reduction result, both of which have different VFs to the active lane
@@ -8824,36 +8856,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   return tryToWiden(Instr, Operands, VPBB);
 }
 
-void VPRecipeBuilder::addScaledReductionExitInstrs(
-    SmallVector<PartialReductionChain> Chains) {
-  // A partial reduction is invalid if any of its extends are used by
-  // something that isn't another partial reduction. This is because the
-  // extends are intended to be lowered along with the reduction itself.
-
-  // Build up a set of partial reduction bin ops for efficient use checking
-  SmallSet<User *, 4> PartialReductionBinOps;
-  for (auto It : Chains) {
-    if (It.BinOp)
-      PartialReductionBinOps.insert(It.BinOp);
-  }
-
-  auto ExtendIsOnlyUsedByPartialReductions =
-      [&PartialReductionBinOps](Instruction *Extend) {
-        return all_of(Extend->users(), [&](const User *U) {
-          return PartialReductionBinOps.contains(U);
-        });
-      };
-
-  // Check if each use of a chain's two extends is a partial reduction
-  // and only add those those that don't have non-partial reduction users
-  for (auto It : Chains) {
-    PartialReductionChain Chain = It;
-    if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
-        ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
-      ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Chain));
-  }
-}
-
 VPRecipeBase *
 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
                                              ArrayRef<VPValue *> Operands) {
@@ -9219,14 +9221,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         return Legal->blockNeedsPredication(BB) || NeedsBlends;
       });
 
-  // Cache the partial reductions up front so we can remove the invalid ones
-  // before creating the recipes
-  SmallVector<PartialReductionChain, 1> PartialReductionChains;
-  for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
-    if (std::optional<PartialReductionChain> Chain =
-            getScaledReduction(Phi, RdxDesc, &TTI, Range, CM))
-      PartialReductionChains.push_back(*Chain);
-  RecipeBuilder.addScaledReductionExitInstrs(PartialReductionChains);
+  RecipeBuilder.collectScaledReductions(Range);
 
   auto *MiddleVPBB = Plan->getMiddleBlock();
   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 530401a7520670..b6eb02cccaf32a 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -138,6 +138,10 @@ class VPRecipeBuilder {
   VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
                                          ArrayRef<VPValue *> Operands);
 
+  std::optional<PartialReductionChain>
+  getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx,
+                     VFRange &Range);
+
 public:
   VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI,
                   const TargetTransformInfo *TTI,
@@ -155,7 +159,7 @@ class VPRecipeBuilder {
                : std::make_optional(It->second);
   }
 
-  void addScaledReductionExitInstrs(SmallVector<PartialReductionChain> Chains);
+  void collectScaledReductions(VFRange &Range);
 
   /// Create and return a widened recipe for \p I if one can be created within
   /// the given VF \p Range.
@@ -163,8 +167,8 @@ class VPRecipeBuilder {
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPBasicBlock *VPBB);
 
-  // Create and return a partial reduction recipe for a reduction instruction
-  // along with binary operation and reduction phi operands.
+  /// Create and return a partial reduction recipe for a reduction instruction
+  /// along with binary operation and reduction phi operands.
   VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
                                             ArrayRef<VPValue *> Operands);
 

>From 460711f26d2c517727bcdecbee2f16d5bdfa5d35 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 11 Nov 2024 10:21:25 +0000
Subject: [PATCH 34/56] Remove unneeded test

---
 .../CodeGen/AArch64/partial-reduce-sdot.ll    | 69 -------------------
 1 file changed, 69 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll

diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
deleted file mode 100644
index 7034bc6a9c6357..00000000000000
--- a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S < %s | FileCheck %s
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-none-unknown-elf"
-
-define i32 @dotp(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @dotp(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  iter.check:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY1:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY1]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
-; CHECK-NEXT:    [[TMP22:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-NEXT:    [[TMP23:%.*]] = mul <16 x i32> [[TMP8]], [[TMP22]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP23]])
-; CHECK-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
-;
-entry:
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %entry
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
-  %gep.a = getelementptr i8, ptr %a, i64 %iv
-  %load.a = load i8, ptr %gep.a, align 1
-  %ext.a = zext i8 %load.a to i32
-  %gep.b = getelementptr i8, ptr %b, i64 %iv
-  %load.b = load i8, ptr %gep.b, align 1
-  %ext.b = zext i8 %load.b to i32
-  %mul = mul i32 %ext.b, %ext.a
-  %add = add i32 %mul, %accum
-  %iv.next = add i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, 0
-  br i1 %exitcond.not, label %exit, label %for.body
-
-exit:                        ; preds = %for.body
-  ret i32 %add
-
-; uselistorder directives
-  uselistorder i32 %add, { 1, 0 }
-}
-
-attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" }
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
-;.

>From 961eda30a1974754163993005889607e00708200 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 11 Nov 2024 10:27:28 +0000
Subject: [PATCH 35/56] Add more patterns to print test

---
 .../LoopVectorize/AArch64/vplan-printing.ll      | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index e62d34f59834a7..aba158a8b4c733 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -22,12 +22,12 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<%add>
 ; CHECK-NEXT:   vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT:   CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT:   vp<%4> = vector-pointer ir<%gep.a>
-; CHECK-NEXT:   WIDEN ir<%load.a> = load vp<%4>
+; CHECK-NEXT:   vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
+; CHECK-NEXT:   WIDEN ir<%load.a> = load vp<[[PTR_A]]>
 ; CHECK-NEXT:   WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
 ; CHECK-NEXT:   CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT:   vp<%5> = vector-pointer ir<%gep.b>
-; CHECK-NEXT:   WIDEN ir<%load.b> = load vp<%5>
+; CHECK-NEXT:   vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
+; CHECK-NEXT:   WIDEN ir<%load.b> = load vp<[[PTR_B:%.+]]>
 ; CHECK-NEXT:   WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
 ; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
 ; CHECK-NEXT:   WIDEN ir<%add> = add ir<%mul>, ir<[[ACC]]>
@@ -81,12 +81,12 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, vp<%6> (VF scaled by 1/4)
 ; CHECK-NEXT:   vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT:   CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT:   vp<%4> = vector-pointer ir<%gep.a>
-; CHECK-NEXT:   WIDEN ir<%load.a> = load vp<%4>
+; CHECK-NEXT:   vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
+; CHECK-NEXT:   WIDEN ir<%load.a> = load vp<[[PTR_A]]>
 ; CHECK-NEXT:   WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
 ; CHECK-NEXT:   CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT:   vp<%5> = vector-pointer ir<%gep.b>
-; CHECK-NEXT:   WIDEN ir<%load.b> = load vp<%5>
+; CHECK-NEXT:   vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
+; CHECK-NEXT:   WIDEN ir<%load.b> = load vp<[[PTR_B]]>
 ; CHECK-NEXT:   WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
 ; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
 ; CHECK-NEXT:   PARTIAL-REDUCE vp<%6> = add ir<%mul>, ir<[[ACC]]>

>From 84f903b2f994829a79f9bd3ad6b9674b10d827e1 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 11 Nov 2024 14:39:48 +0000
Subject: [PATCH 36/56] Remove attributes

---
 .../LoopVectorize/AArch64/partial-reduce-dot-product.ll         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 8fa93b53323ed6..5eb8d4d94af499 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1206,4 +1206,4 @@ for.exit:                        ; preds = %for.body
 !8 = !{!"llvm.loop.mustprogress"}
 !9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
 !10 = !{!"llvm.loop.vectorize.enable", i1 true}
-attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "target-features"="+sve" }
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }

>From 1d99d87e13b06f57da34f8c5be5e1e103abb879d Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 11 Nov 2024 15:02:04 +0000
Subject: [PATCH 37/56] Disable epilogue vectorisation

---
 .../AArch64/partial-reduce-dot-product.ll     | 486 +++++-------------
 1 file changed, 139 insertions(+), 347 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 5eb8d4d94af499..0dca36faf09652 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVE1
-; RUN: opt -passes=loop-vectorize -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVED
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVE1
+; RUN: opt -passes=loop-vectorize -force-target-instruction-cost=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVED
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"
@@ -8,12 +8,7 @@ target triple = "aarch64-none-unknown-elf"
 define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-LABEL: define i32 @dotp(
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-INTERLEAVE1-NEXT:  iter.check:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVE1:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY1:%.*]]
@@ -33,56 +28,35 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP24]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE1]])
-; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-INTERLEAVE1:       vec.epilog.iter.check:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-INTERLEAVE1:       vec.epilog.ph:
+; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVE1:       scalar.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP28]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVE1:       vec.epilog.vector.body:
+; CHECK-INTERLEAVE1:       for.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = mul <vscale x 4 x i32> [[TMP25]], [[TMP22]]
-; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = add <vscale x 4 x i32> [[TMP26]], [[VEC_PHI3]]
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK-INTERLEAVE1:       vec.epilog.middle.block:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVE1:       for.exit:
+; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT:    ret i32 [[ADD_LCSSA]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-INTERLEAVED-NEXT:  iter.check:
-; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVED:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY1:%.*]]
@@ -111,48 +85,32 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP29]])
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP32]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX1:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX1]])
-; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-INTERLEAVED:       vec.epilog.iter.check:
-; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-INTERLEAVED-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP18]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-INTERLEAVED:       vec.epilog.ph:
+; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVED:       scalar.ph:
+; CHECK-INTERLEAVED-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP19]], 4
-; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
-; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-INTERLEAVED-NEXT:    [[TMP33:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVED:       vec.epilog.vector.body:
+; CHECK-INTERLEAVED:       for.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI7:%.*]] = phi <vscale x 4 x i32> [ [[TMP33]], [[VEC_EPILOG_PH]] ], [ [[BIN_RDX:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP15]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = mul <vscale x 4 x i32> [[TMP30]], [[TMP27]]
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX]] = add <vscale x 4 x i32> [[TMP31]], [[VEC_PHI7]]
-; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK-INTERLEAVED:       vec.epilog.middle.block:
-; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVED-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVED-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVED:       for.exit:
+; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-INTERLEAVED-NEXT:    ret i32 [[ADD_LCSSA]]
 ;
 entry:
   br label %for.body
@@ -179,12 +137,7 @@ for.exit:                        ; preds = %for.body
 define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types(
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVE1-NEXT:  iter.check:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP72:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP73:%.*]] = mul i64 [[TMP72]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP73]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-INTERLEAVE1:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -264,65 +217,17 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP70]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
-; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-INTERLEAVE1:       vec.epilog.iter.check:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP74:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP74]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP75]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
-; CHECK-INTERLEAVE1:       vec.epilog.ph:
+; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1:       scalar.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP76:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP77:%.*]] = mul i64 [[TMP76]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP77]]
-; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP78:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP79:%.*]] = mul i64 [[TMP78]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
-; CHECK-INTERLEAVE1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-INTERLEAVE1-NEXT:    [[TMP80:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP81:%.*]] = add <vscale x 4 x i64> [[TMP80]], zeroinitializer
-; CHECK-INTERLEAVE1-NEXT:    [[TMP82:%.*]] = mul <vscale x 4 x i64> [[TMP81]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-INTERLEAVE1-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT]], [[TMP82]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP83:%.*]] = mul i64 1, [[TMP79]]
-; CHECK-INTERLEAVE1-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP83]], i64 0
-; CHECK-INTERLEAVE1-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-INTERLEAVE1-NEXT:    [[TMP84:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVE1:       vec.epilog.vector.body:
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI5:%.*]] = phi <vscale x 4 x i32> [ [[TMP84]], [[SCALAR_PH]] ], [ [[TMP92:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP85:%.*]] = add i64 [[INDEX2]], 0
-; CHECK-INTERLEAVE1-NEXT:    [[TMP86:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP85]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP87:%.*]] = getelementptr i8, ptr [[TMP86]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP87]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP88:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD6]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP89:%.*]] = getelementptr i8, ptr [[B]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP89]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
-; CHECK-INTERLEAVE1-NEXT:    [[TMP90:%.*]] = zext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP91:%.*]] = mul <vscale x 4 x i32> [[TMP90]], [[TMP88]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP92]] = add <vscale x 4 x i32> [[TMP91]], [[VEC_PHI5]]
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT7]] = add nuw i64 [[INDEX2]], [[TMP79]]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT4]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP93:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP93]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-INTERLEAVE1:       vec.epilog.middle.block:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP94:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP92]])
-; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK-INTERLEAVE1:       vec.epilog.scalar.ph:
-; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX8:%.*]] = phi i32 [ [[TMP94]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       for.body:
-; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX8]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
 ; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
@@ -333,19 +238,14 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
 ; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK-INTERLEAVE1:       for.exit:
-; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ], [ [[TMP94]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-INTERLEAVE1-NEXT:    ret i32 [[ADD_LCSSA]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVED-NEXT:  iter.check:
-; CHECK-INTERLEAVED-NEXT:    [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP38:%.*]] = mul i64 [[TMP37]], 4
-; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP38]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-INTERLEAVED:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -496,11 +396,11 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
   br label %for.body
@@ -560,7 +460,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = add <vscale x 16 x i32> [[TMP16]], [[TMP17]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -608,7 +508,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = add <vscale x 16 x i32> [[TMP25]], [[TMP26]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -667,7 +567,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP17]] = add <vscale x 16 x i32> [[TMP16]], [[TMP15]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -707,7 +607,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP23]] = add <vscale x 16 x i32> [[TMP22]], [[TMP21]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -734,161 +634,73 @@ for.exit:                        ; preds = %for.body
 define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: define i32 @dotp_unrolled(
 ; CHECK-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  iter.check:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP2]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[NUM_IN]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[TMP10]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP241:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP10]], 2
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP242:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP160:%.*]] = or disjoint i64 [[TMP10]], 3
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP160]]
-; CHECK-NEXT:    [[TMP243:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP160]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP13]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP17]])
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
-; CHECK-NEXT:    [[TMP244:%.*]] = getelementptr inbounds i8, ptr [[TMP241]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP244]], align 1
-; CHECK-NEXT:    [[TMP245:%.*]] = sext <16 x i8> [[WIDE_LOAD7]] to <16 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP245]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP21]])
+; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
-; CHECK-NEXT:    [[TMP246:%.*]] = getelementptr inbounds i8, ptr [[TMP242]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP246]], align 1
-; CHECK-NEXT:    [[TMP247:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
-; CHECK-NEXT:    [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP247]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP25]])
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
-; CHECK-NEXT:    [[TMP248:%.*]] = getelementptr inbounds i8, ptr [[TMP243]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP248]], align 1
-; CHECK-NEXT:    [[TMP249:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
-; CHECK-NEXT:    [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP249]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP29]])
+; CHECK-NEXT:    [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-NEXT:    [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-NEXT:    [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE11]])
-; CHECK-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE9]])
-; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
-; CHECK-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
+; CHECK-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
+; CHECK-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
+; CHECK-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[NUM_IN]], [[N_VEC]]
-; CHECK-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP51]], 4
-; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP40]]
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP31]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX12:%.*]] = phi i32 [ [[TMP32]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP33]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP34]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 4
-; CHECK-NEXT:    [[N_MOD_VF15:%.*]] = urem i64 [[NUM_IN]], [[TMP42]]
-; CHECK-NEXT:    [[N_VEC16:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF15]]
-; CHECK-NEXT:    [[TMP43:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP52:%.*]] = mul i64 [[TMP43]], 4
-; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX12]], i32 0
-; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX13]], i32 0
-; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX14]], i32 0
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX17:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL1]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI21:%.*]] = phi <vscale x 4 x i32> [ [[TMP53]], [[VEC_EPILOG_PH]] ], [ [[TMP84:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI22:%.*]] = phi <vscale x 4 x i32> [ [[TMP56]], [[VEC_EPILOG_PH]] ], [ [[TMP78:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI23:%.*]] = phi <vscale x 4 x i32> [ [[TMP57]], [[VEC_EPILOG_PH]] ], [ [[TMP72:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI24:%.*]] = phi <vscale x 4 x i32> [ [[TMP48]], [[VEC_EPILOG_PH]] ], [ [[TMP66:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP39:%.*]] = add i64 [[INDEX17]], 0
-; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP39]]
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP39]]
-; CHECK-NEXT:    [[TMP44:%.*]] = or disjoint i64 [[TMP39]], 1
-; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP44]]
-; CHECK-NEXT:    [[TMP250:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP44]]
-; CHECK-NEXT:    [[TMP46:%.*]] = or disjoint i64 [[TMP39]], 2
-; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP46]]
-; CHECK-NEXT:    [[TMP251:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP46]]
-; CHECK-NEXT:    [[TMP180:%.*]] = or disjoint i64 [[TMP39]], 3
-; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP180]]
-; CHECK-NEXT:    [[TMP252:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP180]]
-; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP47]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = load <vscale x 4 x i8>, ptr [[TMP59]], align 1
-; CHECK-NEXT:    [[TMP62:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD25]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[TMP49]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD26:%.*]] = load <vscale x 4 x i8>, ptr [[TMP50]], align 1
-; CHECK-NEXT:    [[TMP64:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD26]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP65:%.*]] = mul nsw <vscale x 4 x i32> [[TMP64]], [[TMP62]]
-; CHECK-NEXT:    [[TMP66]] = add <vscale x 4 x i32> [[TMP65]], [[VEC_PHI24]]
-; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[TMP45]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD27:%.*]] = load <vscale x 4 x i8>, ptr [[TMP58]], align 1
-; CHECK-NEXT:    [[TMP73:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD27]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP253:%.*]] = getelementptr inbounds i8, ptr [[TMP250]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD28:%.*]] = load <vscale x 4 x i8>, ptr [[TMP253]], align 1
-; CHECK-NEXT:    [[TMP70:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD28]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP71:%.*]] = mul nsw <vscale x 4 x i32> [[TMP73]], [[TMP70]]
-; CHECK-NEXT:    [[TMP72]] = add <vscale x 4 x i32> [[TMP71]], [[VEC_PHI23]]
-; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <vscale x 4 x i8>, ptr [[TMP63]], align 1
-; CHECK-NEXT:    [[TMP74:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD29]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP257:%.*]] = getelementptr inbounds i8, ptr [[TMP251]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD30:%.*]] = load <vscale x 4 x i8>, ptr [[TMP257]], align 1
-; CHECK-NEXT:    [[TMP76:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD30]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP77:%.*]] = mul nsw <vscale x 4 x i32> [[TMP74]], [[TMP76]]
-; CHECK-NEXT:    [[TMP78]] = add <vscale x 4 x i32> [[TMP77]], [[VEC_PHI22]]
-; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[TMP55]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD31:%.*]] = load <vscale x 4 x i8>, ptr [[TMP68]], align 1
-; CHECK-NEXT:    [[TMP80:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD31]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP254:%.*]] = getelementptr inbounds i8, ptr [[TMP252]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <vscale x 4 x i8>, ptr [[TMP254]], align 1
-; CHECK-NEXT:    [[TMP82:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD32]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP83:%.*]] = mul nsw <vscale x 4 x i32> [[TMP80]], [[TMP82]]
-; CHECK-NEXT:    [[TMP84]] = add <vscale x 4 x i32> [[TMP83]], [[VEC_PHI21]]
-; CHECK-NEXT:    [[INDEX_NEXT27]] = add nuw i64 [[INDEX17]], [[TMP52]]
-; CHECK-NEXT:    [[TMP69:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC16]]
-; CHECK-NEXT:    br i1 [[TMP69]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP84]])
-; CHECK-NEXT:    [[TMP87:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP78]])
-; CHECK-NEXT:    [[TMP88:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP72]])
-; CHECK-NEXT:    [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP66]])
-; CHECK-NEXT:    [[CMP_N38:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC16]]
-; CHECK-NEXT:    br i1 [[CMP_N38]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
   br label %for.body
@@ -948,12 +760,7 @@ exit:                                        ; preds = %for.body
 define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated(
 ; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVE1-NEXT:  iter.check:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP11]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-INTERLEAVE1:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
@@ -976,20 +783,15 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
 ; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated(
 ; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVED-NEXT:  iter.check:
-; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP16]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-INTERLEAVED:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 32
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
@@ -1021,12 +823,12 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE6]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP12]])
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
 ;
 entry:
   br label %for.body
@@ -1067,27 +869,27 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP10]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i32> [[TMP17]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP15]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP8]], <16 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT:    [[TMP9:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add <16 x i32> [[TMP10]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP12]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP11]], <16 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP2]])
-; CHECK-NEXT:    [[TMP18:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP18]], i32 0
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    [[TMP13:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP13]], i32 0
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP12]])
+; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ;
 entry:
   br label %for.body
@@ -1114,12 +916,7 @@ exit:                        ; preds = %for.body
 define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user(
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVE1-NEXT:  iter.check:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-INTERLEAVE1:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1139,16 +936,11 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP10]] = add <16 x i32> [[TMP9]], [[VEC_PHI]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVED-NEXT:  iter.check:
-; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-INTERLEAVED:       vector.main.loop.iter.check:
+; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1177,7 +969,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP16]] = add <16 x i32> [[TMP14]], [[VEC_PHI1]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ;
 entry:
   br label %for.body

>From 38c066bd3c09574805fd653342b0bfe532b43438 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 11 Nov 2024 15:18:59 +0000
Subject: [PATCH 38/56] Add epilogue vectorisation test

---
 .../partial-reduce-dot-product-epilogue.ll    | 120 ++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
new file mode 100644
index 00000000000000..7b598d1edc72a3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @dotp(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @dotp(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP11]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP15]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT6:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[TMP18]], [[SCALAR_PH]] ], [ [[TMP27:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX2]], 0
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP26:%.*]] = mul <vscale x 4 x i32> [[TMP25]], [[TMP22]]
+; CHECK-NEXT:    [[TMP27]] = add <vscale x 4 x i32> [[TMP26]], [[VEC_PHI3]]
+; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[INDEX2]], [[TMP17]]
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       vec.epilog.middle.block:
+; CHECK-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP27]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       vec.epilog.scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX7:%.*]] = phi i32 [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       for.exit:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY1]] ], [ [[TMP9]], [[MIDDLE_BLOCK1]] ], [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[ADD_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit:                        ; preds = %for.body
+  ret i32 %add
+}
+
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }

>From c4d0b4efaa72d0bd835a1be3f7214e3b467131fa Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 11 Nov 2024 15:48:56 +0000
Subject: [PATCH 39/56] Add max bandwidth run line

---
 .../AArch64/partial-reduce-dot-product.ll     | 670 +++++++++++++++---
 1 file changed, 562 insertions(+), 108 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 0dca36faf09652..fdf1ce82505970 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVE1
-; RUN: opt -passes=loop-vectorize -force-target-instruction-cost=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVED
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
+; RUN: opt -passes=loop-vectorize -force-target-instruction-cost=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"
@@ -112,6 +113,38 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-INTERLEAVED-NEXT:    ret i32 [[ADD_LCSSA]]
 ;
+; CHECK-MAXBW-LABEL: define i32 @dotp(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-MAXBW-NEXT:  entry:
+; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW:       vector.ph:
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW:       vector.body:
+; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]]
+; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]])
+; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+;
 entry:
   br label %for.body
 
@@ -402,6 +435,90 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT:  entry:
+; CHECK-MAXBW-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW:       vector.ph:
+; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW:       vector.body:
+; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-MAXBW-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-MAXBW-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-MAXBW-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-MAXBW-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-MAXBW-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-MAXBW-NEXT:    [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-MAXBW-NEXT:    [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT:    [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-MAXBW-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-MAXBW-NEXT:    [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-MAXBW-NEXT:    [[TMP101:%.*]] = load i16, ptr [[TMP37]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP102:%.*]] = load i16, ptr [[TMP38]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP103:%.*]] = load i16, ptr [[TMP39]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP104:%.*]] = load i16, ptr [[TMP40]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP105:%.*]] = load i16, ptr [[TMP41]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP106:%.*]] = load i16, ptr [[TMP42]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP107:%.*]] = load i16, ptr [[TMP43]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP108:%.*]] = load i16, ptr [[TMP44]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP109:%.*]] = load i16, ptr [[TMP45]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP110:%.*]] = load i16, ptr [[TMP46]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP111:%.*]] = load i16, ptr [[TMP47]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP112:%.*]] = load i16, ptr [[TMP48]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP113:%.*]] = load i16, ptr [[TMP49]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP114:%.*]] = load i16, ptr [[TMP50]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP115:%.*]] = load i16, ptr [[TMP51]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP116:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
+; CHECK-MAXBW-NEXT:    [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
+; CHECK-MAXBW-NEXT:    [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
+; CHECK-MAXBW-NEXT:    [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3
+; CHECK-MAXBW-NEXT:    [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4
+; CHECK-MAXBW-NEXT:    [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5
+; CHECK-MAXBW-NEXT:    [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6
+; CHECK-MAXBW-NEXT:    [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7
+; CHECK-MAXBW-NEXT:    [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8
+; CHECK-MAXBW-NEXT:    [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9
+; CHECK-MAXBW-NEXT:    [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10
+; CHECK-MAXBW-NEXT:    [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11
+; CHECK-MAXBW-NEXT:    [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12
+; CHECK-MAXBW-NEXT:    [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13
+; CHECK-MAXBW-NEXT:    [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14
+; CHECK-MAXBW-NEXT:    [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15
+; CHECK-MAXBW-NEXT:    [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]]
+; CHECK-MAXBW-NEXT:    [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
+; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT:    [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-MAXBW-NEXT:    br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+;
 entry:
   br label %for.body
 
@@ -510,6 +627,43 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT:  entry:
+; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW:       vector.ph:
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW:       vector.body:
+; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
+; CHECK-MAXBW-NEXT:    [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP25]], i32 -1)
+; CHECK-MAXBW-NEXT:    [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
+; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
 entry:
   br label %for.body
 
@@ -609,6 +763,42 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT:  entry:
+; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW:       vector.ph:
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW:       vector.body:
+; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP14]]
+; CHECK-MAXBW-NEXT:    [[TMP21]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
+; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
 entry:
   br label %for.body
 
@@ -632,75 +822,200 @@ for.exit:                        ; preds = %for.body
 }
 
 define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @dotp_unrolled(
-; CHECK-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
-; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
-; CHECK-NEXT:    [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
-; CHECK-NEXT:    [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
-; CHECK-NEXT:    [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
-; CHECK-NEXT:    [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
-; CHECK-NEXT:    [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
-; CHECK-NEXT:    [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
-; CHECK-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
-; CHECK-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled(
+; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT:  entry:
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1:       vector.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
+; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT:  entry:
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled(
+; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT:  entry:
+; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP1]]
+; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW:       vector.ph:
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]]
+; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW:       vector.body:
+; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI4:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI5:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI6:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI7:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[TMP6]], 1
+; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = or disjoint i64 [[TMP6]], 2
+; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = or disjoint i64 [[TMP6]], 3
+; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
+; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP23:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 8 x i8>, ptr [[TMP24]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP29:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD9]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP31:%.*]] = mul nsw <vscale x 8 x i32> [[TMP29]], [[TMP23]]
+; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE11]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI7]], <vscale x 8 x i32> [[TMP31]])
+; CHECK-MAXBW-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD12:%.*]] = load <vscale x 8 x i8>, ptr [[TMP32]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP37:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD12]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD14:%.*]] = load <vscale x 8 x i8>, ptr [[TMP38]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP43:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD14]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP45:%.*]] = mul nsw <vscale x 8 x i32> [[TMP37]], [[TMP43]]
+; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI6]], <vscale x 8 x i32> [[TMP45]])
+; CHECK-MAXBW-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD18:%.*]] = load <vscale x 8 x i8>, ptr [[TMP46]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP51:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD18]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD20:%.*]] = load <vscale x 8 x i8>, ptr [[TMP52]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP57:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD20]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP59:%.*]] = mul nsw <vscale x 8 x i32> [[TMP51]], [[TMP57]]
+; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE17]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI5]], <vscale x 8 x i32> [[TMP59]])
+; CHECK-MAXBW-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD24:%.*]] = load <vscale x 8 x i8>, ptr [[TMP60]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP65:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD24]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD26:%.*]] = load <vscale x 8 x i8>, ptr [[TMP66]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP71:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD26]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP73:%.*]] = mul nsw <vscale x 8 x i32> [[TMP65]], [[TMP71]]
+; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE16]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI4]], <vscale x 8 x i32> [[TMP73]])
+; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT:    [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -830,6 +1145,39 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
 ;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated(
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT:  entry:
+; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW:       vector.ph:
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW:       vector.body:
+; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = mul nsw <vscale x 8 x i32> [[TMP20]], [[TMP13]]
+; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]])
+; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+;
 entry:
   br label %for.body
 
@@ -853,43 +1201,117 @@ exit:                        ; preds = %for.body
 }
 
 define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @not_dotp_predicated_pragma(
-; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], 15
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[N]], 16
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[N]], 16
-; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[N]])
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = add <16 x i32> [[TMP10]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP12]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP11]], <16 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP2]])
-; CHECK-NEXT:    [[TMP13:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP13]], i32 0
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP12]])
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT:  entry:
+; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1:       vector.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = sub i64 [[N]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[N]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-INTERLEAVE1-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[N]])
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = add <16 x i32> [[TMP10]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP11]], <16 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT:  entry:
+; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = sub i64 [[N]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[N]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-INTERLEAVED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[N]])
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = add <16 x i32> [[TMP10]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP12]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP11]], <16 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP13]], i32 0
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT:  entry:
+; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW:       vector.ph:
+; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-MAXBW-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
+; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-MAXBW-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW:       vector.body:
+; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
+; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT:    [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-MAXBW-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-MAXBW-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
+; CHECK-MAXBW-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -971,6 +1393,38 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT:  entry:
+; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW:       vector.ph:
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW:       vector.body:
+; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]]
+; CHECK-MAXBW-NEXT:    [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
+; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
 entry:
   br label %for.body
 

>From 00036eb30028b97138258350e86a8b3666913471 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 18 Nov 2024 10:01:40 +0000
Subject: [PATCH 40/56] Document getPartialReductionCost

---
 llvm/include/llvm/Analysis/TargetTransformInfo.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 3c43110b4ebd85..7ade2a0f033f72 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2085,6 +2085,13 @@ class TargetTransformInfo::Concept {
   /// \return if target want to issue a prefetch in address space \p AS.
   virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;
 
+  /// \return The cost of a partial reduction, which is a reduction from a
+  /// vector to another vector with fewer elements of larger size. They are
+  /// represented by the llvm.experimental.partial.reduce.add intrinsic, which
+  /// takes an accumulator and a binary operation operand that itself is fed by
+  /// two extends. An example of an operation that uses a partial reduction is a
+  /// dot product, which reduces a vector to another of 4 times larger but fewer
+  /// elements.
   virtual InstructionCost
   getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
                           ElementCount VF, PartialReductionExtendKind OpAExtend,

>From 5f7b308c1eb0362e2fb4fa49224ba4ef1b23f2b1 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 18 Nov 2024 10:25:14 +0000
Subject: [PATCH 41/56] Full stops

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++---
 llvm/lib/Transforms/Vectorize/VPlan.h           | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 07bf329d05c436..e417966e5a5b4a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8664,7 +8664,7 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
 }
 
 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
-  // Find all possible partial reductions
+  // Find all possible partial reductions.
   SmallVector<PartialReductionChain, 1> PartialReductionChains;
   for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
     if (std::optional<PartialReductionChain> Chain =
@@ -8675,7 +8675,7 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
   // something that isn't another partial reduction. This is because the
   // extends are intended to be lowered along with the reduction itself.
 
-  // Build up a set of partial reduction bin ops for efficient use checking
+  // Build up a set of partial reduction bin ops for efficient use checking.
   SmallSet<User *, 4> PartialReductionBinOps;
   for (auto It : PartialReductionChains) {
     if (It.BinOp)
@@ -8690,7 +8690,7 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
       };
 
   // Check if each use of a chain's two extends is a partial reduction
-  // and only add those those that don't have non-partial reduction users
+  // and only add those those that don't have non-partial reduction users.
   for (auto It : PartialReductionChains) {
     PartialReductionChain Chain = It;
     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 43736dc69e528a..1ae45771fe11f8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2627,7 +2627,7 @@ class VPReductionRecipe : public VPSingleDefRecipe {
     return R && classof(R);
   }
 
-  /// Generate the reduction in the loop
+  /// Generate the reduction in the loop.
   void execute(VPTransformState &State) override;
 
   /// Return the cost of VPReductionRecipe.

>From 81b45a9c63fdd63c548b8dde7591c4f96e899116 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 18 Nov 2024 13:56:51 +0000
Subject: [PATCH 42/56] It -> Chain

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e417966e5a5b4a..ab30521b4cd3b0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8691,8 +8691,7 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
 
   // Check if each use of a chain's two extends is a partial reduction
   // and only add those those that don't have non-partial reduction users.
-  for (auto It : PartialReductionChains) {
-    PartialReductionChain Chain = It;
+  for (auto Chain : PartialReductionChains) {
     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
         ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
       ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Chain));

>From 70364d55f60c1ccf1e8471f3b4f7d802cf4cd46a Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 18 Nov 2024 14:11:02 +0000
Subject: [PATCH 43/56] Use pattern matching for extends

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ab30521b4cd3b0..f9dc126024a9cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8724,18 +8724,15 @@ std::optional<PartialReductionChain> VPRecipeBuilder::getScaledReduction(
   if (!BinOp || !BinOp->hasOneUse())
     return std::nullopt;
 
-  auto IsSextOrZext = [](Instruction *I) {
-    return I && (I->getOpcode() == Instruction::ZExt ||
-                 I->getOpcode() == Instruction::SExt);
-  };
-
-  auto *ExtA = dyn_cast<Instruction>(BinOp->getOperand(0));
-  auto *ExtB = dyn_cast<Instruction>(BinOp->getOperand(1));
-  if (!IsSextOrZext(ExtA) || !IsSextOrZext(ExtB))
+  using namespace llvm::PatternMatch;
+  Value *A, *B;
+  if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
+      !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
     return std::nullopt;
 
-  Value *A = ExtA->getOperand(0);
-  Value *B = ExtB->getOperand(0);
+  Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
+  Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
+
   // Check that the extends extend from the same type
   if (A->getType() != B->getType())
     return std::nullopt;
@@ -8754,7 +8751,7 @@ std::optional<PartialReductionChain> VPRecipeBuilder::getScaledReduction(
   Chain.ExtendA = ExtA;
   Chain.ExtendB = ExtB;
   Chain.ScaleFactor = TargetScaleFactor;
-  Chain.BinOp = dyn_cast<Instruction>(Op);
+  Chain.BinOp = BinOp;
 
   if (LoopVectorizationPlanner::getDecisionAndClampRange(
           [&](ElementCount VF) {

>From df90fa02d03a471363f42712a138f929790a1784 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 18 Nov 2024 14:16:41 +0000
Subject: [PATCH 44/56] Add PartialReductionChain constructor

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++------
 llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 5 +++++
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f9dc126024a9cd..315283fbfc21a3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8746,12 +8746,8 @@ std::optional<PartialReductionChain> VPRecipeBuilder::getScaledReduction(
   TTI::PartialReductionExtendKind OpBExtend =
       TargetTransformInfo::getPartialReductionExtendKind(ExtB);
 
-  PartialReductionChain Chain;
-  Chain.Reduction = Rdx.getLoopExitInstr();
-  Chain.ExtendA = ExtA;
-  Chain.ExtendB = ExtB;
-  Chain.ScaleFactor = TargetScaleFactor;
-  Chain.BinOp = BinOp;
+  PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp,
+                              TargetScaleFactor);
 
   if (LoopVectorizationPlanner::getDecisionAndClampRange(
           [&](ElementCount VF) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b6eb02cccaf32a..3b1748af0f132d 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -28,6 +28,11 @@ class TargetTransformInfo;
 /// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))),
 /// accumulator).
 struct PartialReductionChain {
+  PartialReductionChain(Instruction *Reduction, Instruction *ExtendA,
+                        Instruction *ExtendB, Instruction *BinOp,
+                        unsigned ScaleFactor)
+      : Reduction(Reduction), ExtendA(ExtendA), ExtendB(ExtendB), BinOp(BinOp),
+        ScaleFactor(ScaleFactor) {}
   /// The top-level binary operation that forms the reduction to a scalar
   /// after the loop body.
   Instruction *Reduction;

>From dbb764af2cce0a6b0eaf9ddb354e1571d530ffe1 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 18 Nov 2024 14:31:40 +0000
Subject: [PATCH 45/56] Document BinOp

---
 llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 3b1748af0f132d..8c385a8a8b2440 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -40,6 +40,7 @@ struct PartialReductionChain {
   Instruction *ExtendA;
   Instruction *ExtendB;
 
+  /// The binary operation using the extends that is then reduced
   Instruction *BinOp;
 
   /// The scaling factor between the size of the reduction type and the

>From 05b50ccb10c4150513efbfe33d311b0eea3bd7b1 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 18 Nov 2024 14:55:12 +0000
Subject: [PATCH 46/56] Move ScaleFactor to map

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 39 ++++++++++---------
 .../Transforms/Vectorize/VPRecipeBuilder.h    | 19 ++++-----
 llvm/lib/Transforms/Vectorize/VPlan.h         |  3 --
 3 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 315283fbfc21a3..533a93d9acb10d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8665,11 +8665,12 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
 
 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
   // Find all possible partial reductions.
-  SmallVector<PartialReductionChain, 1> PartialReductionChains;
+  SmallVector<std::pair<PartialReductionChain, unsigned>, 1>
+      PartialReductionChains;
   for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
-    if (std::optional<PartialReductionChain> Chain =
+    if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
             getScaledReduction(Phi, RdxDesc, Range))
-      PartialReductionChains.push_back(*Chain);
+      PartialReductionChains.push_back(*Pair);
 
   // A partial reduction is invalid if any of its extends are used by
   // something that isn't another partial reduction. This is because the
@@ -8678,8 +8679,8 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
   // Build up a set of partial reduction bin ops for efficient use checking.
   SmallSet<User *, 4> PartialReductionBinOps;
   for (auto It : PartialReductionChains) {
-    if (It.BinOp)
-      PartialReductionBinOps.insert(It.BinOp);
+    if (It.first.BinOp)
+      PartialReductionBinOps.insert(It.first.BinOp);
   }
 
   auto ExtendIsOnlyUsedByPartialReductions =
@@ -8691,10 +8692,11 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
 
   // Check if each use of a chain's two extends is a partial reduction
   // and only add those those that don't have non-partial reduction users.
-  for (auto Chain : PartialReductionChains) {
+  for (auto Pair : PartialReductionChains) {
+    PartialReductionChain Chain = Pair.first;
     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
         ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
-      ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Chain));
+      ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair));
   }
 }
 
@@ -8702,8 +8704,10 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
 /// operation with a wider per-iteration input VF and narrower PHI VF.
 /// Returns a struct containing the ratio between the two VFs and other cached
 /// information, or null if no scalable reduction was found.
-std::optional<PartialReductionChain> VPRecipeBuilder::getScaledReduction(
-    PHINode *PHI, const RecurrenceDescriptor &Rdx, VFRange &Range) {
+std::optional<std::pair<PartialReductionChain, unsigned>>
+VPRecipeBuilder::getScaledReduction(PHINode *PHI,
+                                    const RecurrenceDescriptor &Rdx,
+                                    VFRange &Range) {
   // TODO: Allow scaling reductions when predicating. The select at
   // the end of the loop chooses between the phi value and most recent
   // reduction result, both of which have different VFs to the active lane
@@ -8737,17 +8741,16 @@ std::optional<PartialReductionChain> VPRecipeBuilder::getScaledReduction(
   if (A->getType() != B->getType())
     return std::nullopt;
 
-  unsigned TargetScaleFactor =
-      PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
-          A->getType()->getPrimitiveSizeInBits());
-
   TTI::PartialReductionExtendKind OpAExtend =
       TargetTransformInfo::getPartialReductionExtendKind(ExtA);
   TTI::PartialReductionExtendKind OpBExtend =
       TargetTransformInfo::getPartialReductionExtendKind(ExtB);
 
-  PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp,
-                              TargetScaleFactor);
+  PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp);
+
+  unsigned TargetScaleFactor =
+      PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
+          A->getType()->getPrimitiveSizeInBits());
 
   if (LoopVectorizationPlanner::getDecisionAndClampRange(
           [&](ElementCount VF) {
@@ -8757,7 +8760,7 @@ std::optional<PartialReductionChain> VPRecipeBuilder::getScaledReduction(
             return Cost.isValid();
           },
           Range))
-    return Chain;
+    return std::make_pair(Chain, TargetScaleFactor);
 
   return std::nullopt;
 }
@@ -8788,9 +8791,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
 
       // If the PHI is used by a partial reduction, set the scale factor
-      std::optional<PartialReductionChain> Chain =
+      std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
           getScaledReductionForInstr(RdxDesc.getLoopExitInstr());
-      unsigned ScaleFactor = Chain ? Chain->ScaleFactor : 1;
+      unsigned ScaleFactor = Pair ? Pair->second : 1;
       PhiRecipe = new VPReductionPHIRecipe(
           Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
           CM.useOrderedReductions(RdxDesc), ScaleFactor);
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 8c385a8a8b2440..2903224ae78909 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -29,10 +29,9 @@ class TargetTransformInfo;
 /// accumulator).
 struct PartialReductionChain {
   PartialReductionChain(Instruction *Reduction, Instruction *ExtendA,
-                        Instruction *ExtendB, Instruction *BinOp,
-                        unsigned ScaleFactor)
-      : Reduction(Reduction), ExtendA(ExtendA), ExtendB(ExtendB), BinOp(BinOp),
-        ScaleFactor(ScaleFactor) {}
+                        Instruction *ExtendB, Instruction *BinOp)
+      : Reduction(Reduction), ExtendA(ExtendA), ExtendB(ExtendB), BinOp(BinOp) {
+  }
   /// The top-level binary operation that forms the reduction to a scalar
   /// after the loop body.
   Instruction *Reduction;
@@ -42,10 +41,6 @@ struct PartialReductionChain {
 
   /// The binary operation using the extends that is then reduced
   Instruction *BinOp;
-
-  /// The scaling factor between the size of the reduction type and the
-  /// (possibly extended) inputs.
-  unsigned ScaleFactor;
 };
 
 /// Helper class to create VPRecipies from IR instructions.
@@ -92,8 +87,8 @@ class VPRecipeBuilder {
   SmallVector<VPHeaderPHIRecipe *, 4> PhisToFix;
 
   /// The set of reduction exit instructions that will be scaled to
-  /// a smaller VF via partial reductions.
-  DenseMap<const Instruction *, PartialReductionChain>
+  /// a smaller VF via partial reductions. paired with the scaling factor.
+  DenseMap<const Instruction *, std::pair<PartialReductionChain, unsigned>>
       ScaledReductionExitInstrs;
 
   /// Check if \p I can be widened at the start of \p Range and possibly
@@ -144,7 +139,7 @@ class VPRecipeBuilder {
   VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
                                          ArrayRef<VPValue *> Operands);
 
-  std::optional<PartialReductionChain>
+  std::optional<std::pair<PartialReductionChain, unsigned>>
   getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx,
                      VFRange &Range);
 
@@ -157,7 +152,7 @@ class VPRecipeBuilder {
       : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal),
         CM(CM), PSE(PSE), Builder(Builder) {}
 
-  std::optional<PartialReductionChain>
+  std::optional<std::pair<PartialReductionChain, unsigned>>
   getScaledReductionForInstr(const Instruction *ExitInst) {
     auto It = ScaledReductionExitInstrs.find(ExitInst);
     return It == ScaledReductionExitInstrs.end()
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1ae45771fe11f8..9d980f4382bb1a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2328,9 +2328,6 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
   /// The phi is part of an ordered reduction. Requires IsInLoop to be true.
   bool IsOrdered;
 
-  /// The scaling difference between the size of the output of the entire
-  /// reduction and the size of the input.
-
   /// When expanding the reduction PHI, the plan's VF element count is divided
   /// by this factor to form the reduction phi's VF.
   unsigned VFScaleFactor = 1;

>From 04858f7c2584adec271013ca952cb4e17e1b4ada Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 18 Nov 2024 14:56:23 +0000
Subject: [PATCH 47/56] Document getPartialReduction

---
 llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 2903224ae78909..1c84469ed9aea9 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -139,6 +139,10 @@ class VPRecipeBuilder {
   VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
                                          ArrayRef<VPValue *> Operands);
 
+  /// Examines reduction operations to see if the target can use a cheaper
+  /// operation with a wider per-iteration input VF and narrower PHI VF.
+  /// Returns a struct containing the ratio between the two VFs and other cached
+  /// information, or null if no scalable reduction was found.
   std::optional<std::pair<PartialReductionChain, unsigned>>
   getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx,
                      VFRange &Range);

>From fca5172c292f439c2e6dc780c1c25a88689b3a39 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 18 Nov 2024 15:01:30 +0000
Subject: [PATCH 48/56] Document collectScaledReductions

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 ++
 llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 533a93d9acb10d..f84b4fed4327a9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8663,6 +8663,8 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
   return Recipe;
 }
 
+/// Find all possible partial reductions in the loop and track all of those that
+/// are valid so recipes can be formed later.
 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
   // Find all possible partial reductions.
   SmallVector<std::pair<PartialReductionChain, unsigned>, 1>
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 1c84469ed9aea9..2ecce94714d1bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -164,6 +164,8 @@ class VPRecipeBuilder {
                : std::make_optional(It->second);
   }
 
+  /// Find all possible partial reductions in the loop and track all of those
+  /// that are valid so recipes can be formed later.
   void collectScaledReductions(VFRange &Range);
 
   /// Create and return a widened recipe for \p I if one can be created within

>From 93fc7af49b78710f373cdc86196aec3ceca8985b Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 18 Nov 2024 15:39:36 +0000
Subject: [PATCH 49/56] Set flags

---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index cacee66bde7abe..5e140bbacd97b9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -322,6 +322,7 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
   CallInst *V = Builder.CreateIntrinsic(
       RetTy, Intrinsic::experimental_vector_partial_reduce_add,
       {PhiVal, BinOpVal}, nullptr, Twine("partial.reduce"));
+  setFlags(V);
 
   State.set(this, V);
   State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));

>From c5ae828491c6a9fd1a31d93dd3a3b377854972ae Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 25 Nov 2024 10:20:28 +0000
Subject: [PATCH 50/56] Review feedback

---
 .../llvm/Analysis/TargetTransformInfo.h       |  3 +-
 .../AArch64/AArch64TargetTransformInfo.h      |  1 -
 .../Transforms/Vectorize/LoopVectorize.cpp    | 17 +++-----
 .../Transforms/Vectorize/VPRecipeBuilder.h    | 11 ++---
 llvm/lib/Transforms/Vectorize/VPlan.h         | 11 ++---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 43 ++++++++++++-------
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  1 -
 .../LoopVectorize/AArch64/vplan-printing.ll   |  8 ++--
 8 files changed, 50 insertions(+), 45 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 7ade2a0f033f72..f3205fec70fea4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2090,8 +2090,7 @@ class TargetTransformInfo::Concept {
   /// represented by the llvm.experimental.partial.reduce.add intrinsic, which
   /// takes an accumulator and a binary operation operand that itself is fed by
   /// two extends. An example of an operation that uses a partial reduction is a
-  /// dot product, which reduces a vector to another of 4 times larger but fewer
-  /// elements.
+  /// dot product, which reduces a vector to another of 4 times fewer elements.
   virtual InstructionCost
   getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
                           ElementCount VF, PartialReductionExtendKind OpAExtend,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 25517f1ccf42ce..1acd367fcd2942 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -364,7 +364,6 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     if (VF.isFixed() && !ST->isNeonAvailable() && !ST->hasDotProd())
       return Invalid;
 
-    // FIXME: There should be a nicer way of doing this?
     if (InputEVT == MVT::i8) {
       switch (VF.getKnownMinValue()) {
       default:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f84b4fed4327a9..f74ea7ee6e8fec 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8680,9 +8680,8 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
 
   // Build up a set of partial reduction bin ops for efficient use checking.
   SmallSet<User *, 4> PartialReductionBinOps;
-  for (auto It : PartialReductionChains) {
-    if (It.first.BinOp)
-      PartialReductionBinOps.insert(It.first.BinOp);
+  for (const auto &[PartialRdx, _] : PartialReductionChains) {
+    PartialReductionBinOps.insert(PartialRdx.BinOp);
   }
 
   auto ExtendIsOnlyUsedByPartialReductions =
@@ -8693,7 +8692,7 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
       };
 
   // Check if each use of a chain's two extends is a partial reduction
-  // and only add those those that don't have non-partial reduction users.
+  // and only add those that don't have non-partial reduction users.
   for (auto Pair : PartialReductionChains) {
     PartialReductionChain Chain = Pair.first;
     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
@@ -8702,10 +8701,6 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
   }
 }
 
-/// Examines reduction operations to see if the target can use a cheaper
-/// operation with a wider per-iteration input VF and narrower PHI VF.
-/// Returns a struct containing the ratio between the two VFs and other cached
-/// information, or null if no scalable reduction was found.
 std::optional<std::pair<PartialReductionChain, unsigned>>
 VPRecipeBuilder::getScaledReduction(PHINode *PHI,
                                     const RecurrenceDescriptor &Rdx,
@@ -8725,7 +8720,6 @@ VPRecipeBuilder::getScaledReduction(PHINode *PHI,
   if (Op == PHI)
     Op = Update->getOperand(1);
 
-  // Match dot product pattern
   auto *BinOp = dyn_cast<BinaryOperator>(Op);
   if (!BinOp || !BinOp->hasOneUse())
     return std::nullopt;
@@ -8739,7 +8733,7 @@ VPRecipeBuilder::getScaledReduction(PHINode *PHI,
   Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
   Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
 
-  // Check that the extends extend from the same type
+  // Check that the extends extend from the same type.
   if (A->getType() != B->getType())
     return std::nullopt;
 
@@ -8866,8 +8860,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
 
   SmallVector<VPValue *, 2> OrderedOperands = {BinOp, Phi};
   return new VPPartialReductionRecipe(
-      Reduction->getOpcode(),
-      make_range(OrderedOperands.begin(), OrderedOperands.end()));
+      *Reduction, make_range(OrderedOperands.begin(), OrderedOperands.end()));
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 2ecce94714d1bb..b2141697f0ca5a 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -39,7 +39,7 @@ struct PartialReductionChain {
   Instruction *ExtendA;
   Instruction *ExtendB;
 
-  /// The binary operation using the extends that is then reduced
+  /// The binary operation using the extends that is then reduced.
   Instruction *BinOp;
 };
 
@@ -54,7 +54,7 @@ class VPRecipeBuilder {
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
 
-  // Target Transform Info
+  // Target Transform Info.
   const TargetTransformInfo *TTI;
 
   /// The legality analysis.
@@ -87,7 +87,7 @@ class VPRecipeBuilder {
   SmallVector<VPHeaderPHIRecipe *, 4> PhisToFix;
 
   /// The set of reduction exit instructions that will be scaled to
-  /// a smaller VF via partial reductions. paired with the scaling factor.
+  /// a smaller VF via partial reductions, paired with the scaling factor.
   DenseMap<const Instruction *, std::pair<PartialReductionChain, unsigned>>
       ScaledReductionExitInstrs;
 
@@ -141,8 +141,9 @@ class VPRecipeBuilder {
 
   /// Examines reduction operations to see if the target can use a cheaper
   /// operation with a wider per-iteration input VF and narrower PHI VF.
-  /// Returns a struct containing the ratio between the two VFs and other cached
-  /// information, or null if no scalable reduction was found.
+  /// Returns null if no scaled reduction was found, otherwise a pair with a
+  /// struct containing reduction information and the scaling factor between the
+  /// number of elements in the input and output.
   std::optional<std::pair<PartialReductionChain, unsigned>>
   getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx,
                      VFRange &Range);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 9d980f4382bb1a..e2466e9916bed3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2385,20 +2385,21 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
 /// next accumulator. After the loop body, the accumulator is reduced to a
 /// scalar value.
 class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
-  unsigned Opcode;
+  Instruction &ReductionInst;
 
 public:
   template <typename IterT>
-  VPPartialReductionRecipe(unsigned ReductionOpcode,
+  VPPartialReductionRecipe(Instruction &ReductionInst,
                            iterator_range<IterT> Operands)
-      : VPRecipeWithIRFlags(VPDef::VPPartialReductionSC, Operands),
-        Opcode(ReductionOpcode) {
+      : VPRecipeWithIRFlags(VPDef::VPPartialReductionSC, Operands,
+                            ReductionInst),
+        ReductionInst(ReductionInst) {
     assert(isa<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe()) &&
            "Unexpected operand order for partial reduction recipe");
   }
   ~VPPartialReductionRecipe() override = default;
   VPPartialReductionRecipe *clone() override {
-    return new VPPartialReductionRecipe(Opcode, operands());
+    return new VPPartialReductionRecipe(ReductionInst, operands());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5e140bbacd97b9..35b80c07393fed 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -294,24 +294,39 @@ InstructionCost VPRecipeBase::computeCost(ElementCount VF,
 InstructionCost
 VPPartialReductionRecipe::computeCost(ElementCount VF,
                                       VPCostContext &Ctx) const {
-  auto *BinOp = cast<BinaryOperator>(getOperand(0)->getUnderlyingValue());
-  auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe());
-  auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
-  auto *ExtA = cast<Instruction>(BinOp->getOperand(0));
-  auto *ExtB = cast<Instruction>(BinOp->getOperand(1));
-  Value *A = ExtA->getOperand(0);
-  return Ctx.TTI.getPartialReductionCost(
-      Opcode, A->getType(), Phi->getType(), VF,
-      TargetTransformInfo::getPartialReductionExtendKind(ExtA),
-      TargetTransformInfo::getPartialReductionExtendKind(ExtB),
-      std::make_optional(BinOp->getOpcode()));
+  std::optional<unsigned> Opcode = std::nullopt;
+  VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe();
+  if (auto *WidenR = dyn_cast<VPWidenRecipe>(BinOpR))
+    Opcode = std::make_optional(WidenR->getOpcode());
+
+  VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe();
+  VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe();
+
+  auto GetExtendKind = [](VPRecipeBase *R) {
+    auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
+    if (!WidenCastR)
+      return TargetTransformInfo::PR_None;
+    if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
+      return TargetTransformInfo::PR_ZeroExtend;
+    if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
+      return TargetTransformInfo::PR_SignExtend;
+    return TargetTransformInfo::PR_None;
+  };
+
+  auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
+  auto *ExtTy = Ctx.Types.inferScalarType(ExtAR->getOperand(0));
+
+  return Ctx.TTI.getPartialReductionCost(ReductionInst.getOpcode(), ExtTy,
+                                         PhiType, VF, GetExtendKind(ExtAR),
+                                         GetExtendKind(ExtBR), Opcode);
 }
 
 void VPPartialReductionRecipe::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());
   auto &Builder = State.Builder;
 
-  assert(Opcode == Instruction::Add && "Unhandled partial reduction opcode");
+  assert(ReductionInst.getOpcode() == Instruction::Add &&
+         "Unhandled partial reduction opcode");
 
   Value *BinOpVal = State.get(getOperand(0));
   Value *PhiVal = State.get(getOperand(1));
@@ -322,7 +337,6 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
   CallInst *V = Builder.CreateIntrinsic(
       RetTy, Intrinsic::experimental_vector_partial_reduce_add,
       {PhiVal, BinOpVal}, nullptr, Twine("partial.reduce"));
-  setFlags(V);
 
   State.set(this, V);
   State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
@@ -333,8 +347,7 @@ void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
   O << Indent << "PARTIAL-REDUCE ";
   printAsOperand(O, SlotTracker);
-  O << " = " << Instruction::getOpcodeName(Opcode);
-  printFlags(O);
+  O << " = " << Instruction::getOpcodeName(ReductionInst.getOpcode()) << " ";
   printOperands(O, SlotTracker);
 }
 #endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9ea5e1b5ed097c..ea8845eaa75d4d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -24,7 +24,6 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Analysis/IVDescriptors.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/PatternMatch.h"
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index aba158a8b4c733..aa21519fe72833 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -78,7 +78,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: <x1> vector loop: {
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, vp<%6> (VF scaled by 1/4)
+; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4)
 ; CHECK-NEXT:   vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT:   CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
 ; CHECK-NEXT:   vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
@@ -89,7 +89,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT:   WIDEN ir<%load.b> = load vp<[[PTR_B]]>
 ; CHECK-NEXT:   WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
 ; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
-; CHECK-NEXT:   PARTIAL-REDUCE vp<%6> = add ir<%mul>, ir<[[ACC]]>
+; CHECK-NEXT:   PARTIAL-REDUCE ir<[[REDUCE]]> = add ir<%mul>, ir<[[ACC]]>
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -97,14 +97,14 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, vp<%6>
+; CHECK-NEXT:   EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
 ; CHECK-NEXT:   EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
 ; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<%1>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%9>)
+; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]>)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:

>From 258a79cfaf358c2e0210d54cf91b3b333e661908 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 28 Nov 2024 14:15:11 +0000
Subject: [PATCH 51/56] Move getPartialReductionKind

---
 llvm/include/llvm/Analysis/TargetTransformInfo.h | 10 ++--------
 llvm/lib/Analysis/TargetTransformInfo.cpp        |  9 +++++++++
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f3205fec70fea4..4e05773b447abb 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -24,7 +24,6 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/AtomicOrdering.h"
@@ -213,14 +212,9 @@ class TargetTransformInfo {
 public:
   enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend };
 
+  /// Get the kind of extension that an instruction represents.
   static PartialReductionExtendKind
-  getPartialReductionExtendKind(Instruction *I) {
-    if (isa<SExtInst>(I))
-      return PR_SignExtend;
-    if (isa<ZExtInst>(I))
-      return PR_ZeroExtend;
-    return PR_None;
-  }
+  getPartialReductionExtendKind(Instruction *I);
 
   /// Construct a TTI object using a type implementing the \c Concept
   /// API below.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 02df8017a5380f..83d858fc9de677 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -968,6 +968,15 @@ InstructionCost TargetTransformInfo::getShuffleCost(
   return Cost;
 }
 
+TargetTransformInfo::PartialReductionExtendKind
+TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) {
+  if (isa<SExtInst>(I))
+    return PR_SignExtend;
+  if (isa<ZExtInst>(I))
+    return PR_ZeroExtend;
+  return PR_None;
+}
+
 TTI::CastContextHint
 TargetTransformInfo::getCastContextHint(const Instruction *I) {
   if (!I)

>From 1a87238f0e3b5974360a1f074efb885240ffef01 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 2 Dec 2024 10:00:39 +0000
Subject: [PATCH 52/56] Address review

---
 llvm/include/llvm/Analysis/TargetTransformInfo.h | 6 ++++++
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  | 2 +-
 llvm/lib/Transforms/Vectorize/VPlan.h            | 8 ++++----
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 4e05773b447abb..d3aa3a76d0bf10 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1266,6 +1266,12 @@ class TargetTransformInfo {
   /// \return if target want to issue a prefetch in address space \p AS.
   bool shouldPrefetchAddressSpace(unsigned AS) const;
 
+  /// \return The cost of a partial reduction, which is a reduction from a
+  /// vector to another vector with fewer elements of larger size. They are
+  /// represented by the llvm.experimental.partial.reduce.add intrinsic, which
+  /// takes an accumulator and a binary operation operand that itself is fed by
+  /// two extends. An example of an operation that uses a partial reduction is a
+  /// dot product, which reduces a vector to another of 4 times fewer elements.
   InstructionCost
   getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
                           ElementCount VF, PartialReductionExtendKind OpAExtend,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f74ea7ee6e8fec..b184bb36754c11 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8786,7 +8786,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
       assert(RdxDesc.getRecurrenceStartValue() ==
              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
 
-      // If the PHI is used by a partial reduction, set the scale factor
+      // If the PHI is used by a partial reduction, set the scale factor.
       std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
           getScaledReductionForInstr(RdxDesc.getLoopExitInstr());
       unsigned ScaleFactor = Pair ? Pair->second : 1;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e2466e9916bed3..28951459583920 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2384,15 +2384,15 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
 /// vector operand are added together and passed to the next iteration as the
 /// next accumulator. After the loop body, the accumulator is reduced to a
 /// scalar value.
-class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
+class VPPartialReductionRecipe : public VPSingleDefRecipe {
   Instruction &ReductionInst;
 
 public:
   template <typename IterT>
   VPPartialReductionRecipe(Instruction &ReductionInst,
                            iterator_range<IterT> Operands)
-      : VPRecipeWithIRFlags(VPDef::VPPartialReductionSC, Operands,
-                            ReductionInst),
+      : VPSingleDefRecipe(VPDef::VPPartialReductionSC, Operands,
+                            &ReductionInst),
         ReductionInst(ReductionInst) {
     assert(isa<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe()) &&
            "Unexpected operand order for partial reduction recipe");
@@ -2404,7 +2404,7 @@ class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
 
   VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
 
-  /// Generate the reduction in the loop
+  /// Generate the reduction in the loop.
   void execute(VPTransformState &State) override;
 
   /// Return the cost of this VPPartialReductionRecipe.

>From 02efbedddd038e7994c743fa5f046784ab963bad Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 2 Dec 2024 11:32:37 +0000
Subject: [PATCH 53/56] Format

---
 llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 28951459583920..664b8d66456e90 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2392,7 +2392,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
   VPPartialReductionRecipe(Instruction &ReductionInst,
                            iterator_range<IterT> Operands)
       : VPSingleDefRecipe(VPDef::VPPartialReductionSC, Operands,
-                            &ReductionInst),
+                          &ReductionInst),
         ReductionInst(ReductionInst) {
     assert(isa<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe()) &&
            "Unexpected operand order for partial reduction recipe");

>From 3e6210735a9b1f59dab9a7acb3047962fd7ca63e Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 2 Dec 2024 17:12:49 +0000
Subject: [PATCH 54/56] Remove ReductionInst field

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |  2 +-
 llvm/lib/Transforms/Vectorize/VPlan.h           |  8 +++-----
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp  | 13 +++++++------
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b184bb36754c11..861697ad7acf73 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8860,7 +8860,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
 
   SmallVector<VPValue *, 2> OrderedOperands = {BinOp, Phi};
   return new VPPartialReductionRecipe(
-      *Reduction, make_range(OrderedOperands.begin(), OrderedOperands.end()));
+      Reduction, make_range(OrderedOperands.begin(), OrderedOperands.end()));
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 664b8d66456e90..1d6a8eebe653b1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2385,21 +2385,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
 /// next accumulator. After the loop body, the accumulator is reduced to a
 /// scalar value.
 class VPPartialReductionRecipe : public VPSingleDefRecipe {
-  Instruction &ReductionInst;
 
 public:
   template <typename IterT>
-  VPPartialReductionRecipe(Instruction &ReductionInst,
+  VPPartialReductionRecipe(Instruction *ReductionInst,
                            iterator_range<IterT> Operands)
       : VPSingleDefRecipe(VPDef::VPPartialReductionSC, Operands,
-                          &ReductionInst),
-        ReductionInst(ReductionInst) {
+                          ReductionInst) {
     assert(isa<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe()) &&
            "Unexpected operand order for partial reduction recipe");
   }
   ~VPPartialReductionRecipe() override = default;
   VPPartialReductionRecipe *clone() override {
-    return new VPPartialReductionRecipe(ReductionInst, operands());
+    return new VPPartialReductionRecipe(getUnderlyingInstr(), operands());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 35b80c07393fed..5068e0083f7334 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -316,16 +316,16 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
   auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
   auto *ExtTy = Ctx.Types.inferScalarType(ExtAR->getOperand(0));
 
-  return Ctx.TTI.getPartialReductionCost(ReductionInst.getOpcode(), ExtTy,
-                                         PhiType, VF, GetExtendKind(ExtAR),
-                                         GetExtendKind(ExtBR), Opcode);
+  return Ctx.TTI.getPartialReductionCost(
+      getUnderlyingInstr()->getOpcode(), ExtTy, PhiType, VF,
+      GetExtendKind(ExtAR), GetExtendKind(ExtBR), Opcode);
 }
 
 void VPPartialReductionRecipe::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());
   auto &Builder = State.Builder;
 
-  assert(ReductionInst.getOpcode() == Instruction::Add &&
+  assert(getUnderlyingInstr()->getOpcode() == Instruction::Add &&
          "Unhandled partial reduction opcode");
 
   Value *BinOpVal = State.get(getOperand(0));
@@ -339,7 +339,7 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
       {PhiVal, BinOpVal}, nullptr, Twine("partial.reduce"));
 
   State.set(this, V);
-  State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
+  State.addMetadata(V, getUnderlyingInstr());
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -347,7 +347,8 @@ void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
   O << Indent << "PARTIAL-REDUCE ";
   printAsOperand(O, SlotTracker);
-  O << " = " << Instruction::getOpcodeName(ReductionInst.getOpcode()) << " ";
+  O << " = " << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode())
+    << " ";
   printOperands(O, SlotTracker);
 }
 #endif

>From b0e8374c0db3f5e5f7042c3d9fd0448229ee0136 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 5 Dec 2024 14:00:44 +0000
Subject: [PATCH 55/56] Move TargetTransformInfo forward declaration

---
 llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b2141697f0ca5a..cf653e2d3e6584 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -21,8 +21,8 @@ namespace llvm {
 class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
 class TargetLibraryInfo;
-struct HistogramInfo;
 class TargetTransformInfo;
+struct HistogramInfo;
 
 /// A chain of instructions that form a partial reduction.
 /// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))),

>From 31db826dfb1929f5cd743d396b2e87114b74183c Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 6 Dec 2024 13:22:43 +0000
Subject: [PATCH 56/56] Address review

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  3 +-
 .../LoopVectorize/AArch64/vplan-printing.ll   | 60 -------------------
 2 files changed, 1 insertion(+), 62 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 861697ad7acf73..b662dc3887e826 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8680,9 +8680,8 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
 
   // Build up a set of partial reduction bin ops for efficient use checking.
   SmallSet<User *, 4> PartialReductionBinOps;
-  for (const auto &[PartialRdx, _] : PartialReductionChains) {
+  for (const auto &[PartialRdx, _] : PartialReductionChains)
     PartialReductionBinOps.insert(PartialRdx.BinOp);
-  }
 
   auto ExtendIsOnlyUsedByPartialReductions =
       [&PartialReductionBinOps](Instruction *Extend) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index aa21519fe72833..7253bec777188f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -7,66 +7,6 @@ target triple = "aarch64-none-unknown-elf"
 ; Tests for printing VPlans that are enabled under AArch64
 
 define i32 @print_partial_reduction(ptr %a, ptr %b) {
-; CHECK-LABEL: Checking a loop in 'print_partial_reduction'
-; CHECK:      VPlan 'Initial VPlan for VF={2,4},UF>=1' {
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<0> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT:   EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<%add>
-; CHECK-NEXT:   vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:   CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT:   vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
-; CHECK-NEXT:   WIDEN ir<%load.a> = load vp<[[PTR_A]]>
-; CHECK-NEXT:   WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
-; CHECK-NEXT:   CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT:   vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
-; CHECK-NEXT:   WIDEN ir<%load.b> = load vp<[[PTR_B:%.+]]>
-; CHECK-NEXT:   WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
-; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
-; CHECK-NEXT:   WIDEN ir<%add> = add ir<%mul>, ir<[[ACC]]>
-; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<%add>
-; CHECK-NEXT:   EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
-; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<%1>
-; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
-; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT:   EMIT vp<%bc.merge.rdx> = resume-phi vp<[[RED_RESULT]]>, ir<0>
-; CHECK-NEXT: Successor(s): ir-bb<for.body>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-; CHECK-NEXT:   IR   %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx>)
-; CHECK-NEXT:   IR   %gep.a = getelementptr i8, ptr %a, i64 %iv
-; CHECK-NEXT:   IR   %load.a = load i8, ptr %gep.a, align 1
-; CHECK-NEXT:   IR   %ext.a = zext i8 %load.a to i32
-; CHECK-NEXT:   IR   %gep.b = getelementptr i8, ptr %b, i64 %iv
-; CHECK-NEXT:   IR   %load.b = load i8, ptr %gep.b, align 1
-; CHECK-NEXT:   IR   %ext.b = zext i8 %load.b to i32
-; CHECK-NEXT:   IR   %mul = mul i32 %ext.b, %ext.a
-; CHECK-NEXT:   IR   %add = add i32 %mul, %accum
-; CHECK-NEXT:   IR   %iv.next = add i64 %iv, 1
-; CHECK-NEXT:   IR   %exitcond.not = icmp eq i64 %iv.next, 0
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
 ; CHECK:      VPlan 'Initial VPlan for VF={8,16},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count



More information about the llvm-commits mailing list