[llvm] [LoopVectorizer] Add support for chaining partial reductions (PR #120272)
Nicholas Guy via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 17 09:18:35 PST 2024
https://github.com/NickGuy-Arm created https://github.com/llvm/llvm-project/pull/120272
Currently depends on #92418, so this will be mangled with that patch and will remain as a draft until it is merged
>From c3dd548d49549143d9f0f29b1bb24973872ede1f Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Fri, 17 May 2024 11:17:26 +0100
Subject: [PATCH 1/9] [LoopVectorizer] Add support for partial reductions
---
.../llvm/Analysis/TargetTransformInfo.h | 39 +
.../llvm/Analysis/TargetTransformInfoImpl.h | 9 +
llvm/lib/Analysis/TargetTransformInfo.cpp | 17 +
.../AArch64/AArch64TargetTransformInfo.h | 56 +
.../Transforms/Vectorize/LoopVectorize.cpp | 132 +-
.../Transforms/Vectorize/VPRecipeBuilder.h | 59 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 62 +-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 8 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 74 +-
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
.../partial-reduce-dot-product-epilogue.ll | 120 ++
.../partial-reduce-dot-product-neon.ll | 1374 ++++++++++++++++
.../AArch64/partial-reduce-dot-product.ll | 1455 +++++++++++++++++
.../LoopVectorize/AArch64/vplan-printing.ll | 90 +
14 files changed, 3475 insertions(+), 21 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index c4d5459d250924..cd8e9b7887b664 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -211,6 +211,12 @@ typedef TargetTransformInfo TTI;
/// for IR-level transformations.
class TargetTransformInfo {
public:
+ enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend };
+
+ /// Get the kind of extension that an instruction represents.
+ static PartialReductionExtendKind
+ getPartialReductionExtendKind(Instruction *I);
+
/// Construct a TTI object using a type implementing the \c Concept
/// API below.
///
@@ -1274,6 +1280,18 @@ class TargetTransformInfo {
/// \return if target want to issue a prefetch in address space \p AS.
bool shouldPrefetchAddressSpace(unsigned AS) const;
+ /// \return The cost of a partial reduction, which is a reduction from a
+ /// vector to another vector with fewer elements of larger size. They are
+ /// represented by the llvm.experimental.partial.reduce.add intrinsic, which
+ /// takes an accumulator and a binary operation operand that itself is fed by
+ /// two extends. An example of an operation that uses a partial reduction is a
+ /// dot product, which reduces a vector to another of 4 times fewer elements.
+ InstructionCost
+ getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
+ ElementCount VF, PartialReductionExtendKind OpAExtend,
+ PartialReductionExtendKind OpBExtend,
+ std::optional<unsigned> BinOp = std::nullopt) const;
+
/// \return The maximum interleave factor that any transform should try to
/// perform for this target. This number depends on the level of parallelism
/// and the number of execution units in the CPU.
@@ -2098,6 +2116,18 @@ class TargetTransformInfo::Concept {
/// \return if target want to issue a prefetch in address space \p AS.
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;
+ /// \return The cost of a partial reduction, which is a reduction from a
+ /// vector to another vector with fewer elements of larger size. They are
+ /// represented by the llvm.experimental.partial.reduce.add intrinsic, which
+ /// takes an accumulator and a binary operation operand that itself is fed by
+ /// two extends. An example of an operation that uses a partial reduction is a
+ /// dot product, which reduces a vector to another of 4 times fewer elements.
+ virtual InstructionCost
+ getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
+ ElementCount VF, PartialReductionExtendKind OpAExtend,
+ PartialReductionExtendKind OpBExtend,
+ std::optional<unsigned> BinOp) const = 0;
+
virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0;
virtual InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
@@ -2772,6 +2802,15 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.shouldPrefetchAddressSpace(AS);
}
+ InstructionCost getPartialReductionCost(
+ unsigned Opcode, Type *InputType, Type *AccumType, ElementCount VF,
+ PartialReductionExtendKind OpAExtend,
+ PartialReductionExtendKind OpBExtend,
+ std::optional<unsigned> BinOp = std::nullopt) const override {
+ return Impl.getPartialReductionCost(Opcode, InputType, AccumType, VF,
+ OpAExtend, OpBExtend, BinOp);
+ }
+
unsigned getMaxInterleaveFactor(ElementCount VF) override {
return Impl.getMaxInterleaveFactor(VF);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 48ebffff8cbfc2..885fe4390e5681 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -580,6 +580,15 @@ class TargetTransformInfoImplBase {
bool enableWritePrefetching() const { return false; }
bool shouldPrefetchAddressSpace(unsigned AS) const { return !AS; }
+ InstructionCost
+ getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
+ ElementCount VF,
+ TTI::PartialReductionExtendKind OpAExtend,
+ TTI::PartialReductionExtendKind OpBExtend,
+ std::optional<unsigned> BinOp = std::nullopt) const {
+ return InstructionCost::getInvalid();
+ }
+
unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
InstructionCost getArithmeticInstrCost(
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index d4b6c08c5a32b2..efd92ebf921506 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -858,6 +858,14 @@ bool TargetTransformInfo::shouldPrefetchAddressSpace(unsigned AS) const {
return TTIImpl->shouldPrefetchAddressSpace(AS);
}
+InstructionCost TargetTransformInfo::getPartialReductionCost(
+ unsigned Opcode, Type *InputType, Type *AccumType, ElementCount VF,
+ PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend,
+ std::optional<unsigned> BinOp) const {
+ return TTIImpl->getPartialReductionCost(Opcode, InputType, AccumType, VF,
+ OpAExtend, OpBExtend, BinOp);
+}
+
unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
return TTIImpl->getMaxInterleaveFactor(VF);
}
@@ -969,6 +977,15 @@ InstructionCost TargetTransformInfo::getShuffleCost(
return Cost;
}
+TargetTransformInfo::PartialReductionExtendKind
+TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) {
+ if (isa<SExtInst>(I))
+ return PR_SignExtend;
+ if (isa<ZExtInst>(I))
+ return PR_ZeroExtend;
+ return PR_None;
+}
+
TTI::CastContextHint
TargetTransformInfo::getCastContextHint(const Instruction *I) {
if (!I)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 83b86e31565e49..c9d7e68c8346ba 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/InstructionCost.h"
#include <cstdint>
#include <optional>
@@ -357,6 +358,61 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return BaseT::isLegalNTLoad(DataType, Alignment);
}
+ InstructionCost
+ getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
+ ElementCount VF,
+ TTI::PartialReductionExtendKind OpAExtend,
+ TTI::PartialReductionExtendKind OpBExtend,
+ std::optional<unsigned> BinOp) const {
+
+ InstructionCost Invalid = InstructionCost::getInvalid();
+ InstructionCost Cost(TTI::TCC_Basic);
+
+ if (Opcode != Instruction::Add)
+ return Invalid;
+
+ EVT InputEVT = EVT::getEVT(InputType);
+ EVT AccumEVT = EVT::getEVT(AccumType);
+
+ if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
+ return Invalid;
+ if (VF.isFixed() && !ST->isNeonAvailable() && !ST->hasDotProd())
+ return Invalid;
+
+ if (InputEVT == MVT::i8) {
+ switch (VF.getKnownMinValue()) {
+ default:
+ return Invalid;
+ case 8:
+ if (AccumEVT == MVT::i32)
+ Cost *= 2;
+ else if (AccumEVT != MVT::i64)
+ return Invalid;
+ break;
+ case 16:
+ if (AccumEVT == MVT::i64)
+ Cost *= 2;
+ else if (AccumEVT != MVT::i32)
+ return Invalid;
+ break;
+ }
+ } else if (InputEVT == MVT::i16) {
+ // FIXME: Allow i32 accumulator but increase cost, as we would extend
+ // it to i64.
+ if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64)
+ return Invalid;
+ } else
+ return Invalid;
+
+ if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None)
+ return Invalid;
+
+ if (!BinOp || (*BinOp) != Instruction::Mul)
+ return Invalid;
+
+ return Cost;
+ }
+
bool enableOrderedReductions() const { return true; }
InstructionCost getInterleavedMemoryOpCost(
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 61f7bd84902815..379ba270631d0e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8791,6 +8791,103 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
return Recipe;
}
+/// Find all possible partial reductions in the loop and track all of those that
+/// are valid so recipes can be formed later.
+void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
+ // Find all possible partial reductions.
+ SmallVector<std::pair<PartialReductionChain, unsigned>, 1>
+ PartialReductionChains;
+ for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
+ if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
+ getScaledReduction(Phi, RdxDesc, Range))
+ PartialReductionChains.push_back(*Pair);
+
+ // A partial reduction is invalid if any of its extends are used by
+ // something that isn't another partial reduction. This is because the
+ // extends are intended to be lowered along with the reduction itself.
+
+ // Build up a set of partial reduction bin ops for efficient use checking.
+ SmallSet<User *, 4> PartialReductionBinOps;
+ for (const auto &[PartialRdx, _] : PartialReductionChains)
+ PartialReductionBinOps.insert(PartialRdx.BinOp);
+
+ auto ExtendIsOnlyUsedByPartialReductions =
+ [&PartialReductionBinOps](Instruction *Extend) {
+ return all_of(Extend->users(), [&](const User *U) {
+ return PartialReductionBinOps.contains(U);
+ });
+ };
+
+ // Check if each use of a chain's two extends is a partial reduction
+ // and only add those that don't have non-partial reduction users.
+ for (auto Pair : PartialReductionChains) {
+ PartialReductionChain Chain = Pair.first;
+ if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
+ ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
+ ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair));
+ }
+}
+
+std::optional<std::pair<PartialReductionChain, unsigned>>
+VPRecipeBuilder::getScaledReduction(PHINode *PHI,
+ const RecurrenceDescriptor &Rdx,
+ VFRange &Range) {
+ // TODO: Allow scaling reductions when predicating. The select at
+ // the end of the loop chooses between the phi value and most recent
+ // reduction result, both of which have different VFs to the active lane
+ // mask when scaling.
+ if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
+ return std::nullopt;
+
+ auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
+ if (!Update)
+ return std::nullopt;
+
+ Value *Op = Update->getOperand(0);
+ if (Op == PHI)
+ Op = Update->getOperand(1);
+
+ auto *BinOp = dyn_cast<BinaryOperator>(Op);
+ if (!BinOp || !BinOp->hasOneUse())
+ return std::nullopt;
+
+ using namespace llvm::PatternMatch;
+ Value *A, *B;
+ if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
+ !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
+ return std::nullopt;
+
+ Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
+ Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
+
+ // Check that the extends extend from the same type.
+ if (A->getType() != B->getType())
+ return std::nullopt;
+
+ TTI::PartialReductionExtendKind OpAExtend =
+ TargetTransformInfo::getPartialReductionExtendKind(ExtA);
+ TTI::PartialReductionExtendKind OpBExtend =
+ TargetTransformInfo::getPartialReductionExtendKind(ExtB);
+
+ PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp);
+
+ unsigned TargetScaleFactor =
+ PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
+ A->getType()->getPrimitiveSizeInBits());
+
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) {
+ InstructionCost Cost = TTI->getPartialReductionCost(
+ Update->getOpcode(), A->getType(), PHI->getType(), VF,
+ OpAExtend, OpBExtend, std::make_optional(BinOp->getOpcode()));
+ return Cost.isValid();
+ },
+ Range))
+ return std::make_pair(Chain, TargetScaleFactor);
+
+ return std::nullopt;
+}
+
VPRecipeBase *
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
ArrayRef<VPValue *> Operands,
@@ -8815,9 +8912,14 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
Legal->getReductionVars().find(Phi)->second;
assert(RdxDesc.getRecurrenceStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
- PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
- CM.isInLoopReduction(Phi),
- CM.useOrderedReductions(RdxDesc));
+
+ // If the PHI is used by a partial reduction, set the scale factor.
+ std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
+ getScaledReductionForInstr(RdxDesc.getLoopExitInstr());
+ unsigned ScaleFactor = Pair ? Pair->second : 1;
+ PhiRecipe = new VPReductionPHIRecipe(
+ Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
+ CM.useOrderedReductions(RdxDesc), ScaleFactor);
} else {
// TODO: Currently fixed-order recurrences are modeled as chains of
// first-order recurrences. If there are no users of the intermediate
@@ -8849,6 +8951,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);
+ if (getScaledReductionForInstr(Instr))
+ return tryToCreatePartialReduction(Instr, Operands);
+
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8869,6 +8974,21 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return tryToWiden(Instr, Operands, VPBB);
}
+VPRecipeBase *
+VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
+ ArrayRef<VPValue *> Operands) {
+ assert(Operands.size() == 2 &&
+ "Unexpected number of operands for partial reduction");
+
+ VPValue *BinOp = Operands[0];
+ VPValue *Phi = Operands[1];
+ if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe()))
+ std::swap(BinOp, Phi);
+
+ return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi,
+ Reduction);
+}
+
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -9185,7 +9305,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
- VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
+ VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
+ Builder);
// ---------------------------------------------------------------------------
// Pre-construction: record ingredients whose recipes we'll need to further
@@ -9231,6 +9352,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
return Legal->blockNeedsPredication(BB) || NeedsBlends;
});
+
+ RecipeBuilder.collectScaledReductions(Range);
+
auto *MiddleVPBB = Plan->getMiddleBlock();
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 5d4a3b555981ce..cf653e2d3e6584 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -21,8 +21,28 @@ namespace llvm {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class TargetLibraryInfo;
+class TargetTransformInfo;
struct HistogramInfo;
+/// A chain of instructions that form a partial reduction.
+/// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))),
+/// accumulator).
+struct PartialReductionChain {
+ PartialReductionChain(Instruction *Reduction, Instruction *ExtendA,
+ Instruction *ExtendB, Instruction *BinOp)
+ : Reduction(Reduction), ExtendA(ExtendA), ExtendB(ExtendB), BinOp(BinOp) {
+ }
+ /// The top-level binary operation that forms the reduction to a scalar
+ /// after the loop body.
+ Instruction *Reduction;
+ /// The extension of each of the inner binary operation's operands.
+ Instruction *ExtendA;
+ Instruction *ExtendB;
+
+ /// The binary operation using the extends that is then reduced.
+ Instruction *BinOp;
+};
+
/// Helper class to create VPRecipies from IR instructions.
class VPRecipeBuilder {
/// The VPlan new recipes are added to.
@@ -34,6 +54,9 @@ class VPRecipeBuilder {
/// Target Library Info.
const TargetLibraryInfo *TLI;
+ // Target Transform Info.
+ const TargetTransformInfo *TTI;
+
/// The legality analysis.
LoopVectorizationLegality *Legal;
@@ -63,6 +86,11 @@ class VPRecipeBuilder {
/// created.
SmallVector<VPHeaderPHIRecipe *, 4> PhisToFix;
+ /// The set of reduction exit instructions that will be scaled to
+ /// a smaller VF via partial reductions, paired with the scaling factor.
+ DenseMap<const Instruction *, std::pair<PartialReductionChain, unsigned>>
+ ScaledReductionExitInstrs;
+
/// Check if \p I can be widened at the start of \p Range and possibly
/// decrease the range such that the returned value holds for the entire \p
/// Range. The function should not be called for memory instructions or calls.
@@ -111,13 +139,35 @@ class VPRecipeBuilder {
VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
ArrayRef<VPValue *> Operands);
+ /// Examines reduction operations to see if the target can use a cheaper
+ /// operation with a wider per-iteration input VF and narrower PHI VF.
+ /// Returns null if no scaled reduction was found, otherwise a pair with a
+ /// struct containing reduction information and the scaling factor between the
+ /// number of elements in the input and output.
+ std::optional<std::pair<PartialReductionChain, unsigned>>
+ getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx,
+ VFRange &Range);
+
public:
VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM,
PredicatedScalarEvolution &PSE, VPBuilder &Builder)
- : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM),
- PSE(PSE), Builder(Builder) {}
+ : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal),
+ CM(CM), PSE(PSE), Builder(Builder) {}
+
+ std::optional<std::pair<PartialReductionChain, unsigned>>
+ getScaledReductionForInstr(const Instruction *ExitInst) {
+ auto It = ScaledReductionExitInstrs.find(ExitInst);
+ return It == ScaledReductionExitInstrs.end()
+ ? std::nullopt
+ : std::make_optional(It->second);
+ }
+
+ /// Find all possible partial reductions in the loop and track all of those
+ /// that are valid so recipes can be formed later.
+ void collectScaledReductions(VFRange &Range);
/// Create and return a widened recipe for \p I if one can be created within
/// the given VF \p Range.
@@ -125,6 +175,11 @@ class VPRecipeBuilder {
ArrayRef<VPValue *> Operands,
VFRange &Range, VPBasicBlock *VPBB);
+ /// Create and return a partial reduction recipe for a reduction instruction
+ /// along with binary operation and reduction phi operands.
+ VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
+ ArrayRef<VPValue *> Operands);
+
/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 12208a7968338b..2dc388ac0248f5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -889,6 +889,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenPointerInductionSC:
case VPRecipeBase::VPReductionPHISC:
case VPRecipeBase::VPScalarCastSC:
+ case VPRecipeBase::VPPartialReductionSC:
return true;
case VPRecipeBase::VPBranchOnMaskSC:
case VPRecipeBase::VPInterleaveSC:
@@ -2393,23 +2394,28 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// The phi is part of an ordered reduction. Requires IsInLoop to be true.
bool IsOrdered;
+ /// When expanding the reduction PHI, the plan's VF element count is divided
+ /// by this factor to form the reduction phi's VF.
+ unsigned VFScaleFactor = 1;
+
public:
/// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
/// RdxDesc.
VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
VPValue &Start, bool IsInLoop = false,
- bool IsOrdered = false)
+ bool IsOrdered = false, unsigned VFScaleFactor = 1)
: VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
- RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
+ RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
+ VFScaleFactor(VFScaleFactor) {
assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
}
~VPReductionPHIRecipe() override = default;
VPReductionPHIRecipe *clone() override {
- auto *R =
- new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
- *getOperand(0), IsInLoop, IsOrdered);
+ auto *R = new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()),
+ RdxDesc, *getOperand(0), IsInLoop,
+ IsOrdered, VFScaleFactor);
R->addOperand(getBackedgeValue());
return R;
}
@@ -2440,6 +2446,50 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
bool isInLoop() const { return IsInLoop; }
};
+/// A recipe for forming partial reductions. In the loop, an accumulator and
+/// vector operand are added together and passed to the next iteration as the
+/// next accumulator. After the loop body, the accumulator is reduced to a
+/// scalar value.
+class VPPartialReductionRecipe : public VPSingleDefRecipe {
+ unsigned Opcode;
+
+public:
+ VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
+ VPValue *Op1)
+ : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
+ ReductionInst) {}
+ VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
+ Instruction *ReductionInst = nullptr)
+ : VPSingleDefRecipe(VPDef::VPPartialReductionSC,
+ ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
+ Opcode(Opcode) {
+ assert(isa<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe()) &&
+ "Unexpected operand order for partial reduction recipe");
+ }
+ ~VPPartialReductionRecipe() override = default;
+ VPPartialReductionRecipe *clone() override {
+ return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1));
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
+
+ /// Generate the reduction in the loop.
+ void execute(VPTransformState &State) override;
+
+ /// Return the cost of this VPPartialReductionRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+ /// Get the binary op's opcode.
+ unsigned getOpcode() const { return Opcode; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
/// instructions.
class VPBlendRecipe : public VPSingleDefRecipe {
@@ -2647,7 +2697,7 @@ class VPReductionRecipe : public VPSingleDefRecipe {
return R && classof(R);
}
- /// Generate the reduction in the loop
+ /// Generate the reduction in the loop.
void execute(VPTransformState &State) override;
/// Return the cost of VPReductionRecipe.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 0d981ff5826ed8..0422b6bc079e92 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -225,10 +225,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
[](const auto *R) { return R->getScalarType(); })
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
- VPReverseVectorPointerRecipe, VPWidenCanonicalIVRecipe>(
- [this](const VPRecipeBase *R) {
- return inferScalarType(R->getOperand(0));
- })
+ VPReverseVectorPointerRecipe, VPWidenCanonicalIVRecipe,
+ VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
+ return inferScalarType(R->getOperand(0));
+ })
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPWidenEVLRecipe,
VPReplicateRecipe, VPWidenCallRecipe, VPWidenMemoryRecipe,
VPWidenSelectRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ab95b647f211b7..e92d709618f5ed 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -292,6 +292,66 @@ InstructionCost VPRecipeBase::computeCost(ElementCount VF,
llvm_unreachable("subclasses should implement computeCost");
}
+InstructionCost
+VPPartialReductionRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ std::optional<unsigned> Opcode = std::nullopt;
+ VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe();
+ if (auto *WidenR = dyn_cast<VPWidenRecipe>(BinOpR))
+ Opcode = std::make_optional(WidenR->getOpcode());
+
+ VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe();
+ VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe();
+
+ auto GetExtendKind = [](VPRecipeBase *R) {
+ auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
+ if (!WidenCastR)
+ return TargetTransformInfo::PR_None;
+ if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
+ return TargetTransformInfo::PR_ZeroExtend;
+ if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
+ return TargetTransformInfo::PR_SignExtend;
+ return TargetTransformInfo::PR_None;
+ };
+
+ auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
+ auto *ExtTy = Ctx.Types.inferScalarType(ExtAR->getOperand(0));
+
+ return Ctx.TTI.getPartialReductionCost(getOpcode(), ExtTy, PhiType, VF,
+ GetExtendKind(ExtAR),
+ GetExtendKind(ExtBR), Opcode);
+}
+
+void VPPartialReductionRecipe::execute(VPTransformState &State) {
+ State.setDebugLocFrom(getDebugLoc());
+ auto &Builder = State.Builder;
+
+ assert(getOpcode() == Instruction::Add &&
+ "Unhandled partial reduction opcode");
+
+ Value *BinOpVal = State.get(getOperand(0));
+ Value *PhiVal = State.get(getOperand(1));
+ assert(PhiVal && BinOpVal && "Phi and Mul must be set");
+
+ Type *RetTy = PhiVal->getType();
+
+ CallInst *V = Builder.CreateIntrinsic(
+ RetTy, Intrinsic::experimental_vector_partial_reduce_add,
+ {PhiVal, BinOpVal}, nullptr, Twine("partial.reduce"));
+
+ State.set(this, V);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "PARTIAL-REDUCE ";
+ printAsOperand(O, SlotTracker);
+ O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
assert(OpType == OperationType::FPMathOp &&
"recipe doesn't have fast math flags");
@@ -3366,6 +3426,8 @@ void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
void VPReductionPHIRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
+ auto VF = State.VF.divideCoefficientBy(VFScaleFactor);
+
// Reductions do not have to start at zero. They can start with
// any loop invariant values.
VPValue *StartVPV = getStartValue();
@@ -3375,9 +3437,9 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
// this value when we vectorize all of the instructions that use the PHI.
- bool ScalarPHI = State.VF.isScalar() || IsInLoop;
- Type *VecTy = ScalarPHI ? StartV->getType()
- : VectorType::get(StartV->getType(), State.VF);
+ bool ScalarPHI = VF.isScalar() || IsInLoop;
+ Type *VecTy =
+ ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF);
BasicBlock *HeaderBB = State.CFG.PrevBB;
assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
@@ -3425,13 +3487,13 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
// Create start and identity vector values for the reduction in the
// preheader.
// TODO: Introduce recipes in VPlan preheader to create initial values.
- Iden = Builder.CreateVectorSplat(State.VF, Iden);
+ Iden = Builder.CreateVectorSplat(VF, Iden);
IRBuilderBase::InsertPointGuard IPBuilder(Builder);
Builder.SetInsertPoint(VectorPH->getTerminator());
Constant *Zero = Builder.getInt32(0);
StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
} else {
- Iden = Builder.CreateVectorSplat(State.VF, Iden);
+ Iden = Builder.CreateVectorSplat(VF, Iden);
}
}
}
@@ -3449,6 +3511,8 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
printAsOperand(O, SlotTracker);
O << " = phi ";
printOperands(O, SlotTracker);
+ if (VFScaleFactor != 1)
+ O << " (VF scaled by 1/" << VFScaleFactor << ")";
}
#endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 957a602091c733..7aaf4002b8b3e7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -329,6 +329,7 @@ class VPDef {
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
+ VPPartialReductionSC,
VPReplicateSC,
VPScalarCastSC,
VPScalarIVStepsSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
new file mode 100644
index 00000000000000..7b598d1edc72a3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @dotp(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @dotp(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP11]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP15]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT6:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[TMP18]], [[SCALAR_PH]] ], [ [[TMP27:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX2]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-NEXT: [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
+; CHECK-NEXT: [[TMP25:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP26:%.*]] = mul <vscale x 4 x i32> [[TMP25]], [[TMP22]]
+; CHECK-NEXT: [[TMP27]] = add <vscale x 4 x i32> [[TMP26]], [[VEC_PHI3]]
+; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX2]], [[TMP17]]
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP27]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i32 [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: for.exit:
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY1]] ], [ [[TMP9]], [[MIDDLE_BLOCK1]] ], [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[ADD_LCSSA]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
new file mode 100644
index 00000000000000..36fcb444d14a11
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -0,0 +1,1374 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @dotp(ptr %a, ptr %b) {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-MAXBW-LABEL: define i32 @dotp(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_different_types(ptr %a, ptr %b) {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3
+; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5
+; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6
+; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7
+; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9
+; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10
+; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11
+; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12
+; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13
+; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14
+; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15
+; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
+; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]]
+; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
+; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
+; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
+; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
+; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP37]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP38]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP39]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP40]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP41]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP42]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP43]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP44]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP45]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP46]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP47]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP48]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP49]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
+; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
+; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3
+; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4
+; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5
+; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6
+; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7
+; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9
+; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10
+; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11
+; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
+; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
+; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
+; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP56]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP57]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP58]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP59]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP60]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP61]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP62]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP63]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP64]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP65]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
+; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
+; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3
+; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4
+; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5
+; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6
+; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7
+; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9
+; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10
+; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11
+; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13
+; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14
+; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15
+; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]]
+; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]]
+; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2
+; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2
+; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2
+; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2
+; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2
+; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2
+; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2
+; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2
+; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2
+; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2
+; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2
+; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2
+; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2
+; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2
+; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2
+; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2
+; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0
+; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1
+; CHECK-MAXBW-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2
+; CHECK-MAXBW-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3
+; CHECK-MAXBW-NEXT: [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4
+; CHECK-MAXBW-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5
+; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6
+; CHECK-MAXBW-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7
+; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8
+; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9
+; CHECK-MAXBW-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10
+; CHECK-MAXBW-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11
+; CHECK-MAXBW-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12
+; CHECK-MAXBW-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13
+; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14
+; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15
+; CHECK-MAXBW-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]]
+; CHECK-MAXBW-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i16, ptr %gep.b, align 2
+ %ext.b = zext i16 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_not_phi(ptr %a, ptr %b) {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %ext.b
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled(
+; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
+; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled(
+; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
+; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
+; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
+; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ]
+ %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ]
+ %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ]
+ %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ]
+ %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv
+ %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv
+ %offset.1 = or disjoint i64 %iv, 1
+ %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1
+ %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1
+ %offset.2 = or disjoint i64 %iv, 2
+ %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2
+ %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2
+ %offset.3 = or disjoint i64 %iv, 3
+ %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3
+ %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3
+ %load.a0 = load i8, ptr %gep.a0, align 1
+ %ext.a0 = sext i8 %load.a0 to i32
+ %load.b0 = load i8, ptr %gep.b0, align 1
+ %ext.b0 = sext i8 %load.b0 to i32
+ %mul.a0 = mul nsw i32 %ext.b0, %ext.a0
+ %add.a0 = add nsw i32 %mul.a0, %accum0
+ %load.a1 = load i8, ptr %gep.a1, align 1
+ %ext.a1 = sext i8 %load.a1 to i32
+ %load.b1 = load i8, ptr %gep.b1, align 1
+ %ext.b1 = sext i8 %load.b1 to i32
+ %mul.a1 = mul nsw i32 %ext.a1, %ext.b1
+ %add.a1 = add nsw i32 %mul.a1, %accum1
+ %load.a2 = load i8, ptr %gep.a2, align 1
+ %ext.a2 = sext i8 %load.a2 to i32
+ %load.b2 = load i8, ptr %gep.b2, align 1
+ %ext.b2 = sext i8 %load.b2 to i32
+ %mul.a2 = mul nsw i32 %ext.a2, %ext.b2
+ %add.a2 = add nsw i32 %mul.a2, %accum2
+ %load.a3 = load i8, ptr %gep.a3, align 1
+ %ext.a3 = sext i8 %load.a3 to i32
+ %load.b3 = load i8, ptr %gep.b3, align 1
+ %ext.b3 = sext i8 %load.b3 to i32
+ %mul.a3 = mul nsw i32 %ext.a3, %ext.b3
+ %add.a3 = add nsw i32 %mul.a3, %accum3
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %num_in
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body
+ %result0 = add nsw i32 %add.a0, %add.a1
+ %result1 = add nsw i32 %add.a2, %add.a3
+ %result = add nsw i32 %result0, %result1
+ ret i32 %result
+}
+
+define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated(
+; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated(
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated(
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = sext i8 %load.a to i32
+ %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = sext i8 %load.b to i32
+ %mul = mul nsw i32 %ext.b, %ext.a
+ %add = add nsw i32 %mul, %accum
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
+; CHECK-MAXBW-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = sext i8 %load.a to i32
+ %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv
+ %load.b = load i8, ptr %gep.a2, align 1
+ %ext.b = sext i8 %load.b to i32
+ %mul = mul nsw i32 %ext.b, %ext.a
+ %add = add nsw i32 %mul, %accum
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7
+
+exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP10]], i32 15
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
+; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
+; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-MAXBW: for.exit:
+; CHECK-MAXBW-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
+; CHECK-MAXBW-NEXT: ret i32 [[RESULT]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ %result = add i32 %add, %ext.b
+ ret i32 %result
+}
+
+!7 = distinct !{!7, !8, !9, !10}
+!8 = !{!"llvm.loop.mustprogress"}
+!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+!10 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
new file mode 100644
index 00000000000000..fdf1ce82505970
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -0,0 +1,1455 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
+; RUN: opt -passes=loop-vectorize -force-target-instruction-cost=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @dotp(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY1:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP23]], [[TMP19]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP24]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE1]])
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY1:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP20]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP28]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul <16 x i32> [[TMP11]], [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul <16 x i32> [[TMP12]], [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP13]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP29]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX1:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX1]])
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; CHECK-MAXBW-LABEL: define i32 @dotp(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP23]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP24]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP25]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP26]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP27]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP28]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = load i16, ptr [[TMP29]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = load i16, ptr [[TMP30]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3
+; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP41]], i32 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 5
+; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 6
+; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 7
+; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 9
+; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP55]], i32 10
+; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP56]], i32 11
+; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12
+; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13
+; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14
+; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15
+; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
+; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]]
+; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
+; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
+; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
+; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
+; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP39]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP40]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP41]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP42]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP43]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP44]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP45]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP46]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP47]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP48]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP49]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP50]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP51]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP53]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP54]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
+; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
+; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3
+; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4
+; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5
+; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6
+; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7
+; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9
+; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10
+; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11
+; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
+; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
+; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
+; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP55]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP56]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP57]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP58]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP59]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP60]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP61]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP62]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP63]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP64]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP65]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP66]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP67]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP68]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP139]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP140]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
+; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
+; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3
+; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4
+; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5
+; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6
+; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7
+; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9
+; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10
+; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11
+; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13
+; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14
+; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15
+; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]]
+; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]]
+; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
+; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP37]], align 2
+; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP38]], align 2
+; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP39]], align 2
+; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP40]], align 2
+; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP41]], align 2
+; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP42]], align 2
+; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP43]], align 2
+; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP44]], align 2
+; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP45]], align 2
+; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP46]], align 2
+; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP47]], align 2
+; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP48]], align 2
+; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP49]], align 2
+; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP50]], align 2
+; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP51]], align 2
+; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
+; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
+; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
+; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3
+; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4
+; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5
+; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6
+; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7
+; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8
+; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9
+; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10
+; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11
+; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12
+; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13
+; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14
+; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15
+; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]]
+; CHECK-MAXBW-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i16, ptr %gep.b, align 2
+ %ext.b = zext i16 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 16 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> [[VECTOR_RECUR]], <vscale x 16 x i32> [[TMP16]], i32 -1)
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 16 x i32> [[TMP16]], [[TMP17]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 16 x i32> [[TMP22]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25]] = mul <vscale x 16 x i32> [[TMP23]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> [[TMP24]], <vscale x 16 x i32> [[TMP25]], i32 -1)
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 16 x i32> [[TMP25]], [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
+; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP25]], i32 -1)
+; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul <vscale x 16 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add <vscale x 16 x i32> [[TMP16]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 16 x i32> [[TMP21]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 16 x i32> [[TMP22]], [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP21]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %ext.b
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled(
+; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
+; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled(
+; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP1]]
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP6]], 1
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = or disjoint i64 [[TMP6]], 2
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = or disjoint i64 [[TMP6]], 3
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x i8>, ptr [[TMP24]], align 1
+; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD9]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 8 x i32> [[TMP29]], [[TMP23]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE11]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI7]], <vscale x 8 x i32> [[TMP31]])
+; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 8 x i8>, ptr [[TMP32]], align 1
+; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD12]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 8 x i8>, ptr [[TMP38]], align 1
+; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD14]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = mul nsw <vscale x 8 x i32> [[TMP37]], [[TMP43]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI6]], <vscale x 8 x i32> [[TMP45]])
+; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 8 x i8>, ptr [[TMP46]], align 1
+; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD18]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 8 x i8>, ptr [[TMP52]], align 1
+; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD20]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = mul nsw <vscale x 8 x i32> [[TMP51]], [[TMP57]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI5]], <vscale x 8 x i32> [[TMP59]])
+; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD24:%.*]] = load <vscale x 8 x i8>, ptr [[TMP60]], align 1
+; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD24]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD26:%.*]] = load <vscale x 8 x i8>, ptr [[TMP66]], align 1
+; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD26]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = mul nsw <vscale x 8 x i32> [[TMP65]], [[TMP71]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI4]], <vscale x 8 x i32> [[TMP73]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ]
+ %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ]
+ %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ]
+ %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ]
+ %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv
+ %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv
+ %offset.1 = or disjoint i64 %iv, 1
+ %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1
+ %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1
+ %offset.2 = or disjoint i64 %iv, 2
+ %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2
+ %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2
+ %offset.3 = or disjoint i64 %iv, 3
+ %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3
+ %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3
+ %load.a0 = load i8, ptr %gep.a0, align 1
+ %ext.a0 = sext i8 %load.a0 to i32
+ %load.b0 = load i8, ptr %gep.b0, align 1
+ %ext.b0 = sext i8 %load.b0 to i32
+ %mul.a0 = mul nsw i32 %ext.b0, %ext.a0
+ %add.a0 = add nsw i32 %mul.a0, %accum0
+ %load.a1 = load i8, ptr %gep.a1, align 1
+ %ext.a1 = sext i8 %load.a1 to i32
+ %load.b1 = load i8, ptr %gep.b1, align 1
+ %ext.b1 = sext i8 %load.b1 to i32
+ %mul.a1 = mul nsw i32 %ext.a1, %ext.b1
+ %add.a1 = add nsw i32 %mul.a1, %accum1
+ %load.a2 = load i8, ptr %gep.a2, align 1
+ %ext.a2 = sext i8 %load.a2 to i32
+ %load.b2 = load i8, ptr %gep.b2, align 1
+ %ext.b2 = sext i8 %load.b2 to i32
+ %mul.a2 = mul nsw i32 %ext.a2, %ext.b2
+ %add.a2 = add nsw i32 %mul.a2, %accum2
+ %load.a3 = load i8, ptr %gep.a3, align 1
+ %ext.a3 = sext i8 %load.a3 to i32
+ %load.b3 = load i8, ptr %gep.b3, align 1
+ %ext.b3 = sext i8 %load.b3 to i32
+ %mul.a3 = mul nsw i32 %ext.a3, %ext.b3
+ %add.a3 = add nsw i32 %mul.a3, %accum3
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %num_in
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body
+ %result0 = add nsw i32 %add.a0, %add.a1
+ %result1 = add nsw i32 %add.a2, %add.a3
+ %result = add nsw i32 %result0, %result1
+ ret i32 %result
+}
+
+define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated(
+; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated(
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 32
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP12]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated(
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw <vscale x 8 x i32> [[TMP20]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = sext i8 %load.a to i32
+ %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = sext i8 %load.b to i32
+ %mul = mul nsw i32 %ext.b, %ext.a
+ %add = add nsw i32 %mul, %accum
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[N]])
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add <16 x i32> [[TMP10]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP11]], <16 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i64 [[N]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[N]])
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add <16 x i32> [[TMP10]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP11]], <16 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP13]], i32 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = sext i8 %load.a to i32
+ %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv
+ %load.b = load i8, ptr %gep.a2, align 1
+ %ext.b = sext i8 %load.b to i32
+ %mul = mul nsw i32 %ext.b, %ext.a
+ %add = add nsw i32 %mul, %accum
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7
+
+exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP10]] = add <16 x i32> [[TMP9]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP8]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul <16 x i32> [[TMP11]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul <16 x i32> [[TMP12]], [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP15]] = add <16 x i32> [[TMP13]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16]] = add <16 x i32> [[TMP14]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ %result = add i32 %add, %ext.b
+ ret i32 %result
+}
+
+!7 = distinct !{!7, !8, !9, !10}
+!8 = !{!"llvm.loop.mustprogress"}
+!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+!10 = !{!"llvm.loop.vectorize.enable", i1 true}
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
new file mode 100644
index 00000000000000..7253bec777188f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -0,0 +1,90 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+; Tests for printing VPlans that are enabled under AArch64
+
+define i32 @print_partial_reduction(ptr %a, ptr %b) {
+; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<0> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4)
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
+; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]>
+; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
+; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
+; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]>
+; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
+; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
+; CHECK-NEXT: PARTIAL-REDUCE ir<[[REDUCE]]> = add ir<%mul>, ir<[[ACC]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
+; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<%1>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: EMIT vp<%bc.merge.rdx> = resume-phi vp<[[RED_RESULT]]>, ir<0>
+; CHECK-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body>:
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT: IR %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx>)
+; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv
+; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1
+; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32
+; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv
+; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1
+; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32
+; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a
+; CHECK-NEXT: IR %add = add i32 %mul, %accum
+; CHECK-NEXT: IR %iv.next = add i64 %iv, 1
+; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 0
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret i32 %add
+}
>From 0a2ae415dee02632b1901dce463da2e55ddfaf59 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 10 Dec 2024 10:51:10 +0000
Subject: [PATCH 2/9] Rebase
---
.../LoopVectorize/AArch64/fully-unrolled-cost.ll | 12 ++++++------
.../AArch64/partial-reduce-dot-product-epilogue.ll | 4 ++--
.../AArch64/partial-reduce-dot-product.ll | 2 +-
.../LoopVectorize/AArch64/vplan-printing.ll | 13 ++++++++-----
4 files changed, 17 insertions(+), 14 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index 1cfb507a74344d..924546addc61f8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -11,10 +11,10 @@ define i64 @test(ptr %a, ptr %b) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 26
+; CHECK: Cost for VF 8: 23
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 48
+; CHECK: Cost for VF 16: 42
; CHECK: LV: Selecting VF: 16
entry:
br label %for.body
@@ -45,11 +45,11 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 26
+; CHECK: Cost for VF 8: 23
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 49
+; CHECK: Cost for VF 16: 43
; CHECK: LV: Selecting VF: vscale x 2
entry:
br label %for.body
@@ -82,11 +82,11 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 27
+; CHECK: Cost for VF 8: 24
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 48
+; CHECK: Cost for VF 16: 42
; CHECK: LV: Selecting VF: 16
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 7b598d1edc72a3..cf923f15cdffdf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -42,8 +42,8 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]]
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP15]]
@@ -75,7 +75,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i32 [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i32 [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY1]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index fdf1ce82505970..20fcaa834437ee 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1309,7 +1309,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
; CHECK-MAXBW-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 7253bec777188f..1b28cc3e5dbfba 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -12,6 +12,9 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: Live-in ir<0> = original trip-count
; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
@@ -43,17 +46,13 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
; CHECK-NEXT: EMIT vp<%bc.merge.rdx> = resume-phi vp<[[RED_RESULT]]>, ir<0>
; CHECK-NEXT: Successor(s): ir-bb<for.body>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body>:
; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-; CHECK-NEXT: IR %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx>)
+; CHECK-NEXT: IR %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv
; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1
; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32
@@ -65,6 +64,10 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: IR %iv.next = add i64 %iv, 1
; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 0
; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]> from middle.block)
+; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
entry:
>From 9d589de02941b8574972e2fd846659b1a19978e8 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 13 Dec 2024 11:31:48 +0000
Subject: [PATCH 3/9] Edit fully unrolled cost test to not trip assertion
---
.../LoopVectorize/AArch64/fully-unrolled-cost.ll | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index 924546addc61f8..c3e8c895fce24a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -11,10 +11,10 @@ define i64 @test(ptr %a, ptr %b) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 23
+; CHECK: Cost for VF 8: 30
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 42
+; CHECK: Cost for VF 16: 56
; CHECK: LV: Selecting VF: 16
entry:
br label %for.body
@@ -31,8 +31,8 @@ for.body: ; preds = %entry, %for.body
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv
%1 = load i8, ptr %arrayidx2, align 1
%conv3 = zext i8 %1 to i64
- %mul = mul nuw nsw i64 %conv3, %conv
- %add = add i64 %mul, %sum
+ %div = udiv i64 %conv3, %conv
+ %add = add i64 %div, %sum
%i.iv.next = add nuw nsw i64 %i.iv, 1
%exitcond.not = icmp eq i64 %i.iv.next, 16
br i1 %exitcond.not, label %exit, label %for.body
@@ -45,11 +45,11 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 23
+; CHECK: Cost for VF 8: 30
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 43
+; CHECK: Cost for VF 16: 57
; CHECK: LV: Selecting VF: vscale x 2
entry:
br label %for.body
@@ -64,8 +64,8 @@ for.body: ; preds = %entry, %for.body
%arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next
%1 = load i8, ptr %arrayidx2, align 1
%conv3 = zext i8 %1 to i64
- %mul = mul nuw nsw i64 %conv3, %conv
- %add = add i64 %sum, %mul
+ %div = udiv i64 %conv3, %conv
+ %add = add i64 %sum, %div
%exitcond.not = icmp eq i64 %i.iv.next, 16
br i1 %exitcond.not, label %exit, label %for.body
>From 48dfa47b9944c289fdd63c7a14439eb3d4225ed0 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 16 Dec 2024 15:17:58 +0000
Subject: [PATCH 4/9] Fix neon and dotprod logic
---
.../AArch64/AArch64TargetTransformInfo.h | 2 +-
.../partial-reduce-dot-product-epilogue.ll | 221 ++++----
.../partial-reduce-dot-product-neon.ll | 494 +++++++++++++++++-
.../AArch64/partial-reduce-dot-product.ll | 68 +--
.../LoopVectorize/AArch64/vplan-printing.ll | 2 +-
5 files changed, 634 insertions(+), 153 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index c9d7e68c8346ba..2a31cacc203f44 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -376,7 +376,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
return Invalid;
- if (VF.isFixed() && !ST->isNeonAvailable() && !ST->hasDotProd())
+ if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd()))
return Invalid;
if (InputEVT == MVT::i8) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index cf923f15cdffdf..f83eb3362cb704 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -1,99 +1,140 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S < %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefix=CHECK-NODOTPROD
+; RUN: opt -mattr=+dotprod -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefix=CHECK-DOTPROD
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
define i32 @dotp(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @dotp(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: iter.check:
-; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP11]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[ENTRY:%.*]]
-; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
-; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]]
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
-; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP15]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[TMP18]], [[SCALAR_PH]] ], [ [[TMP27:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX2]], 0
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
-; CHECK-NEXT: [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
-; CHECK-NEXT: [[TMP25:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP26:%.*]] = mul <vscale x 4 x i32> [[TMP25]], [[TMP22]]
-; CHECK-NEXT: [[TMP27]] = add <vscale x 4 x i32> [[TMP26]], [[VEC_PHI3]]
-; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX2]], [[TMP17]]
-; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP27]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i32 [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY1]] ]
-; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY1]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
-; CHECK-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
-; CHECK-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
-; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
-; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: for.exit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY1]] ], [ [[TMP9]], [[MIDDLE_BLOCK1]] ], [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[ADD_LCSSA]]
+; CHECK-NODOTPROD-LABEL: define i32 @dotp(
+; CHECK-NODOTPROD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NODOTPROD-NEXT: iter.check:
+; CHECK-NODOTPROD-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NODOTPROD-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NODOTPROD-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-NODOTPROD-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-NODOTPROD: vector.main.loop.iter.check:
+; CHECK-NODOTPROD-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NODOTPROD: vector.ph:
+; CHECK-NODOTPROD-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NODOTPROD: vector.body:
+; CHECK-NODOTPROD-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NODOTPROD-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NODOTPROD-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NODOTPROD-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NODOTPROD-NEXT: [[TMP10]] = add <16 x i32> [[TMP9]], [[VEC_PHI]]
+; CHECK-NODOTPROD-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NODOTPROD-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NODOTPROD-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NODOTPROD: middle.block:
+; CHECK-NODOTPROD-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP10]])
+; CHECK-NODOTPROD-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-NODOTPROD: vec.epilog.iter.check:
+; CHECK-NODOTPROD-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NODOTPROD-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NODOTPROD-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP14]]
+; CHECK-NODOTPROD-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NODOTPROD: vec.epilog.ph:
+; CHECK-NODOTPROD-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NODOTPROD-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NODOTPROD-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NODOTPROD-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NODOTPROD-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP16]]
+; CHECK-NODOTPROD-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-NODOTPROD-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NODOTPROD-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4
+; CHECK-NODOTPROD-NEXT: [[TMP19:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NODOTPROD-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-NODOTPROD: vec.epilog.vector.body:
+; CHECK-NODOTPROD-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[TMP19]], [[VEC_EPILOG_PH]] ], [ [[TMP28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[TMP20:%.*]] = add i64 [[INDEX2]], 0
+; CHECK-NODOTPROD-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP20]]
+; CHECK-NODOTPROD-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP21]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP22]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP23:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
+; CHECK-NODOTPROD-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP24]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP25]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP26:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP27:%.*]] = mul <vscale x 4 x i32> [[TMP26]], [[TMP23]]
+; CHECK-NODOTPROD-NEXT: [[TMP28]] = add <vscale x 4 x i32> [[TMP27]], [[VEC_PHI3]]
+; CHECK-NODOTPROD-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX2]], [[TMP18]]
+; CHECK-NODOTPROD-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
+; CHECK-NODOTPROD-NEXT: br i1 [[TMP29]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; CHECK-DOTPROD-LABEL: define i32 @dotp(
+; CHECK-DOTPROD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-DOTPROD-NEXT: iter.check:
+; CHECK-DOTPROD-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-DOTPROD-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-DOTPROD-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-DOTPROD-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-DOTPROD: vector.main.loop.iter.check:
+; CHECK-DOTPROD-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-DOTPROD: vector.ph:
+; CHECK-DOTPROD-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-DOTPROD: vector.body:
+; CHECK-DOTPROD-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-DOTPROD-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-DOTPROD-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-DOTPROD-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-DOTPROD-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; CHECK-DOTPROD-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-DOTPROD-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-DOTPROD-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-DOTPROD-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-DOTPROD-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-DOTPROD-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-DOTPROD-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]]
+; CHECK-DOTPROD-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
+; CHECK-DOTPROD-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-DOTPROD-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-DOTPROD-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-DOTPROD: middle.block:
+; CHECK-DOTPROD-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-DOTPROD-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-DOTPROD: vec.epilog.iter.check:
+; CHECK-DOTPROD-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-DOTPROD-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-DOTPROD-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]]
+; CHECK-DOTPROD-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-DOTPROD: vec.epilog.ph:
+; CHECK-DOTPROD-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-DOTPROD-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-DOTPROD-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-DOTPROD-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-DOTPROD-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP15]]
+; CHECK-DOTPROD-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-DOTPROD-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-DOTPROD-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-DOTPROD-NEXT: [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-DOTPROD-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-DOTPROD: vec.epilog.vector.body:
+; CHECK-DOTPROD-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-DOTPROD-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-DOTPROD-NEXT: [[TMP19:%.*]] = add i64 [[INDEX2]], 0
+; CHECK-DOTPROD-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP19]]
+; CHECK-DOTPROD-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-DOTPROD-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-DOTPROD-NEXT: [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-DOTPROD-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-DOTPROD-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0
+; CHECK-DOTPROD-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
+; CHECK-DOTPROD-NEXT: [[TMP25:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
+; CHECK-DOTPROD-NEXT: [[TMP26:%.*]] = mul <vscale x 4 x i32> [[TMP25]], [[TMP22]]
+; CHECK-DOTPROD-NEXT: [[TMP27]] = add <vscale x 4 x i32> [[TMP26]], [[VEC_PHI3]]
+; CHECK-DOTPROD-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX2]], [[TMP17]]
+; CHECK-DOTPROD-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
+; CHECK-DOTPROD-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index 36fcb444d14a11..708ebb43c69cdc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -1,14 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
-; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
-; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon -S < %s | FileCheck %s --check-prefixes=CHECK-NODOTPROD
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
define i32 @dotp(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-LABEL: define i32 @dotp(
-; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
@@ -35,7 +36,7 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
-; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
@@ -72,7 +73,7 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
;
; CHECK-MAXBW-LABEL: define i32 @dotp(
-; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
@@ -98,6 +99,43 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
;
+; CHECK-NODOTPROD-LABEL: define i32 @dotp(
+; CHECK-NODOTPROD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NODOTPROD-NEXT: entry:
+; CHECK-NODOTPROD-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NODOTPROD: vector.ph:
+; CHECK-NODOTPROD-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NODOTPROD: vector.body:
+; CHECK-NODOTPROD-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NODOTPROD-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-NODOTPROD-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-NODOTPROD-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-NODOTPROD-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-NODOTPROD-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]]
+; CHECK-NODOTPROD-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]]
+; CHECK-NODOTPROD-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NODOTPROD-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NODOTPROD-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NODOTPROD: middle.block:
+; CHECK-NODOTPROD-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]]
+; CHECK-NODOTPROD-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-NODOTPROD-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+;
entry:
br label %for.body
@@ -122,7 +160,7 @@ for.exit: ; preds = %for.body
define i32 @not_dotp_different_types(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types(
-; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
@@ -206,7 +244,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types(
-; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
@@ -361,7 +399,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types(
-; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
@@ -444,6 +482,165 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
;
+; CHECK-NODOTPROD-LABEL: define i32 @not_dotp_different_types(
+; CHECK-NODOTPROD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NODOTPROD-NEXT: entry:
+; CHECK-NODOTPROD-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NODOTPROD: vector.ph:
+; CHECK-NODOTPROD-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NODOTPROD: vector.body:
+; CHECK-NODOTPROD-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NODOTPROD-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NODOTPROD-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NODOTPROD-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NODOTPROD-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NODOTPROD-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NODOTPROD-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NODOTPROD-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NODOTPROD-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NODOTPROD-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-NODOTPROD-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-NODOTPROD-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-NODOTPROD-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NODOTPROD-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-NODOTPROD-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-NODOTPROD-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-NODOTPROD-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
+; CHECK-NODOTPROD-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17
+; CHECK-NODOTPROD-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18
+; CHECK-NODOTPROD-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19
+; CHECK-NODOTPROD-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20
+; CHECK-NODOTPROD-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21
+; CHECK-NODOTPROD-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22
+; CHECK-NODOTPROD-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23
+; CHECK-NODOTPROD-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24
+; CHECK-NODOTPROD-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25
+; CHECK-NODOTPROD-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26
+; CHECK-NODOTPROD-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27
+; CHECK-NODOTPROD-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28
+; CHECK-NODOTPROD-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29
+; CHECK-NODOTPROD-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
+; CHECK-NODOTPROD-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
+; CHECK-NODOTPROD-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
+; CHECK-NODOTPROD-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NODOTPROD-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NODOTPROD-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-NODOTPROD-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-NODOTPROD-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-NODOTPROD-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NODOTPROD-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-NODOTPROD-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-NODOTPROD-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-NODOTPROD-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-NODOTPROD-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-NODOTPROD-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-NODOTPROD-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-NODOTPROD-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-NODOTPROD-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-NODOTPROD-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
+; CHECK-NODOTPROD-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-NODOTPROD-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
+; CHECK-NODOTPROD-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-NODOTPROD-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
+; CHECK-NODOTPROD-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]]
+; CHECK-NODOTPROD-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]]
+; CHECK-NODOTPROD-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
+; CHECK-NODOTPROD-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]]
+; CHECK-NODOTPROD-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]]
+; CHECK-NODOTPROD-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]]
+; CHECK-NODOTPROD-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
+; CHECK-NODOTPROD-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
+; CHECK-NODOTPROD-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-NODOTPROD-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
+; CHECK-NODOTPROD-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
+; CHECK-NODOTPROD-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP37]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP38]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP39]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP40]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP41]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP42]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP43]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP44]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP45]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP46]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP47]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP48]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP49]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
+; CHECK-NODOTPROD-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
+; CHECK-NODOTPROD-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
+; CHECK-NODOTPROD-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3
+; CHECK-NODOTPROD-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4
+; CHECK-NODOTPROD-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5
+; CHECK-NODOTPROD-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6
+; CHECK-NODOTPROD-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7
+; CHECK-NODOTPROD-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8
+; CHECK-NODOTPROD-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9
+; CHECK-NODOTPROD-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10
+; CHECK-NODOTPROD-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11
+; CHECK-NODOTPROD-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12
+; CHECK-NODOTPROD-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
+; CHECK-NODOTPROD-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
+; CHECK-NODOTPROD-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
+; CHECK-NODOTPROD-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP56]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP57]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP58]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP59]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP60]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP61]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP62]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP63]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP64]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP65]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2
+; CHECK-NODOTPROD-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
+; CHECK-NODOTPROD-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
+; CHECK-NODOTPROD-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
+; CHECK-NODOTPROD-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3
+; CHECK-NODOTPROD-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4
+; CHECK-NODOTPROD-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5
+; CHECK-NODOTPROD-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6
+; CHECK-NODOTPROD-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7
+; CHECK-NODOTPROD-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8
+; CHECK-NODOTPROD-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9
+; CHECK-NODOTPROD-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10
+; CHECK-NODOTPROD-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11
+; CHECK-NODOTPROD-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12
+; CHECK-NODOTPROD-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13
+; CHECK-NODOTPROD-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14
+; CHECK-NODOTPROD-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15
+; CHECK-NODOTPROD-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]]
+; CHECK-NODOTPROD-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]]
+; CHECK-NODOTPROD-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]]
+; CHECK-NODOTPROD-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
+; CHECK-NODOTPROD-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NODOTPROD-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NODOTPROD-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NODOTPROD: middle.block:
+; CHECK-NODOTPROD-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
+; CHECK-NODOTPROD-NEXT: [[TMP140:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-NODOTPROD-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+;
entry:
br label %for.body
@@ -468,7 +665,7 @@ for.exit: ; preds = %for.body
define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried(
-; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
@@ -493,7 +690,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried(
-; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
@@ -518,7 +715,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried(
-; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
@@ -542,6 +739,35 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
;
+; CHECK-NODOTPROD-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-NODOTPROD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NODOTPROD-NEXT: entry:
+; CHECK-NODOTPROD-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NODOTPROD: vector.ph:
+; CHECK-NODOTPROD-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NODOTPROD: vector.body:
+; CHECK-NODOTPROD-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NODOTPROD-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NODOTPROD-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-NODOTPROD-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NODOTPROD-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NODOTPROD-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NODOTPROD-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NODOTPROD: middle.block:
+; CHECK-NODOTPROD-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15
+; CHECK-NODOTPROD-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
+; CHECK-NODOTPROD-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+;
entry:
br label %for.body
@@ -566,7 +792,7 @@ for.exit: ; preds = %for.body
define i32 @not_dotp_not_phi(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi(
-; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
@@ -590,7 +816,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi(
-; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
@@ -614,7 +840,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi(
-; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
@@ -637,6 +863,34 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
;
+; CHECK-NODOTPROD-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-NODOTPROD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NODOTPROD-NEXT: entry:
+; CHECK-NODOTPROD-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NODOTPROD: vector.ph:
+; CHECK-NODOTPROD-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NODOTPROD: vector.body:
+; CHECK-NODOTPROD-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NODOTPROD-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NODOTPROD-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-NODOTPROD-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NODOTPROD-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NODOTPROD-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NODOTPROD: middle.block:
+; CHECK-NODOTPROD-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
+; CHECK-NODOTPROD-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
+; CHECK-NODOTPROD-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+;
entry:
br label %for.body
@@ -661,7 +915,7 @@ for.exit: ; preds = %for.body
define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled(
-; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -731,7 +985,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
-; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -801,7 +1055,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
;
; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled(
-; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -870,6 +1124,76 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
;
+; CHECK-NODOTPROD-LABEL: define i32 @dotp_unrolled(
+; CHECK-NODOTPROD-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NODOTPROD-NEXT: entry:
+; CHECK-NODOTPROD-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-NODOTPROD-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NODOTPROD: vector.ph:
+; CHECK-NODOTPROD-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-NODOTPROD-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-NODOTPROD-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NODOTPROD: vector.body:
+; CHECK-NODOTPROD-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI3:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NODOTPROD-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-NODOTPROD-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-NODOTPROD-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-NODOTPROD-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
+; CHECK-NODOTPROD-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-NODOTPROD-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NODOTPROD-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-NODOTPROD-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-NODOTPROD-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-NODOTPROD-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NODOTPROD-NEXT: [[TMP17]] = add <16 x i32> [[TMP16]], [[VEC_PHI3]]
+; CHECK-NODOTPROD-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP21]]
+; CHECK-NODOTPROD-NEXT: [[TMP23]] = add <16 x i32> [[TMP22]], [[VEC_PHI2]]
+; CHECK-NODOTPROD-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD7]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP26]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP27]]
+; CHECK-NODOTPROD-NEXT: [[TMP29]] = add <16 x i32> [[TMP28]], [[VEC_PHI1]]
+; CHECK-NODOTPROD-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP30]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP31:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP31]], [[TMP33]]
+; CHECK-NODOTPROD-NEXT: [[TMP35]] = add <16 x i32> [[TMP34]], [[VEC_PHI]]
+; CHECK-NODOTPROD-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NODOTPROD-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NODOTPROD-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NODOTPROD: middle.block:
+; CHECK-NODOTPROD-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP35]])
+; CHECK-NODOTPROD-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP29]])
+; CHECK-NODOTPROD-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP23]])
+; CHECK-NODOTPROD-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP17]])
+; CHECK-NODOTPROD-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
+; CHECK-NODOTPROD-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+;
entry:
br label %for.body
@@ -927,7 +1251,7 @@ exit: ; preds = %for.body
define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated(
-; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -958,7 +1282,7 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated(
-; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -999,7 +1323,7 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated(
-; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -1029,6 +1353,47 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
;
+; CHECK-NODOTPROD-LABEL: define i32 @not_dotp_predicated(
+; CHECK-NODOTPROD-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NODOTPROD-NEXT: entry:
+; CHECK-NODOTPROD-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
+; CHECK-NODOTPROD-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NODOTPROD: vector.ph:
+; CHECK-NODOTPROD-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
+; CHECK-NODOTPROD-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NODOTPROD-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NODOTPROD: vector.body:
+; CHECK-NODOTPROD-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NODOTPROD-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NODOTPROD-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-NODOTPROD-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-NODOTPROD-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-NODOTPROD-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]]
+; CHECK-NODOTPROD-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]]
+; CHECK-NODOTPROD-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NODOTPROD-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NODOTPROD-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NODOTPROD: middle.block:
+; CHECK-NODOTPROD-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]]
+; CHECK-NODOTPROD-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-NODOTPROD-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NODOTPROD-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+;
entry:
br label %for.body
@@ -1053,7 +1418,7 @@ exit: ; preds = %for.body
define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma(
-; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
@@ -1089,7 +1454,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
-; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
@@ -1125,7 +1490,7 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
-; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
@@ -1159,6 +1524,43 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
; CHECK-MAXBW-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+;
+; CHECK-NODOTPROD-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-NODOTPROD-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NODOTPROD-NEXT: entry:
+; CHECK-NODOTPROD-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NODOTPROD: vector.ph:
+; CHECK-NODOTPROD-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; CHECK-NODOTPROD-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-NODOTPROD-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NODOTPROD-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; CHECK-NODOTPROD-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-NODOTPROD-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-NODOTPROD-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NODOTPROD: vector.body:
+; CHECK-NODOTPROD-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-NODOTPROD-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NODOTPROD-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NODOTPROD-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NODOTPROD-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NODOTPROD-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NODOTPROD-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NODOTPROD-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NODOTPROD-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NODOTPROD-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NODOTPROD-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-NODOTPROD-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-NODOTPROD-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-NODOTPROD-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NODOTPROD-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-NODOTPROD-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-NODOTPROD-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-NODOTPROD-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NODOTPROD-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
+; CHECK-NODOTPROD-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+;
entry:
br label %for.body
@@ -1183,7 +1585,7 @@ exit: ; preds = %for.body
define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user(
-; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
@@ -1234,7 +1636,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
-; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
@@ -1295,7 +1697,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]]
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
-; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
@@ -1345,6 +1747,44 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
; CHECK-MAXBW-NEXT: ret i32 [[RESULT]]
;
+; CHECK-NODOTPROD-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-NODOTPROD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NODOTPROD-NEXT: entry:
+; CHECK-NODOTPROD-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NODOTPROD: vector.ph:
+; CHECK-NODOTPROD-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NODOTPROD: vector.body:
+; CHECK-NODOTPROD-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NODOTPROD-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NODOTPROD-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-NODOTPROD-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NODOTPROD-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-NODOTPROD-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NODOTPROD-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NODOTPROD-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NODOTPROD-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-NODOTPROD-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-NODOTPROD-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]]
+; CHECK-NODOTPROD-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]]
+; CHECK-NODOTPROD-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NODOTPROD-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NODOTPROD-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NODOTPROD: middle.block:
+; CHECK-NODOTPROD-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]]
+; CHECK-NODOTPROD-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-NODOTPROD-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP10]], i32 15
+; CHECK-NODOTPROD-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 20fcaa834437ee..7a63fe128160a5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -15,7 +15,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY1:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP8:%.*]], [[VECTOR_BODY1]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX1]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]]
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
@@ -26,12 +26,12 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP23]], [[TMP19]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP9]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP24]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE1]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
@@ -63,8 +63,8 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY1:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY1]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP15:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP14:%.*]], [[VECTOR_BODY1]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX1]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]]
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
@@ -82,14 +82,14 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul <16 x i32> [[TMP11]], [[TMP25]]
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul <16 x i32> [[TMP12]], [[TMP26]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP13]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP29]])
+; CHECK-INTERLEAVED-NEXT: [[TMP15]] = add <16 x i32> [[TMP13]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP29]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX1:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX1]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
; CHECK-INTERLEAVED: scalar.ph:
; CHECK-INTERLEAVED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
@@ -833,10 +833,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
@@ -856,7 +856,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = add <16 x i32> [[TMP16]], [[VEC_PHI3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
@@ -864,7 +864,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP34]] = add <16 x i32> [[TMP21]], [[VEC_PHI2]]
; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
@@ -872,7 +872,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP33]] = add <16 x i32> [[TMP26]], [[VEC_PHI1]]
; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
@@ -880,7 +880,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP35]] = add <16 x i32> [[TMP31]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -896,10 +896,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
@@ -919,7 +919,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT: [[TMP36]] = add <16 x i32> [[TMP16]], [[VEC_PHI3]]
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
@@ -927,7 +927,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT: [[TMP34]] = add <16 x i32> [[TMP21]], [[VEC_PHI2]]
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
@@ -935,7 +935,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add <16 x i32> [[TMP26]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
@@ -943,7 +943,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
+; CHECK-INTERLEAVED-NEXT: [[TMP35]] = add <16 x i32> [[TMP31]], [[VEC_PHI]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1084,7 +1084,7 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
@@ -1095,12 +1095,12 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP9]] = add <16 x i32> [[TMP7]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP9]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
;
@@ -1115,8 +1115,8 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
@@ -1134,14 +1134,14 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP4]]
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP10]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP12]])
+; CHECK-INTERLEAVED-NEXT: [[TMP15]] = add <16 x i32> [[TMP11]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 1b28cc3e5dbfba..06aaf29b382a22 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -mattr=+neon,+dotprod -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -disable-output %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
>From 5999a64689c7150f008d24277351364df234fe23 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 16 Dec 2024 17:31:35 +0000
Subject: [PATCH 5/9] Mark partial reductions as additional simplifications
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 379ba270631d0e..ebc0a5c5a89ce3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7575,6 +7575,11 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
}
continue;
}
+ // A partial reduction is very differently costed compared to its
+ // underlying add instruction that the legacy cost model sees, so consider
+ // it an additional simplification.
+ if (dyn_cast<VPPartialReductionRecipe>(&R))
+ continue;
if (Instruction *UI = GetInstructionForCost(&R))
SeenInstrs.insert(UI);
}
>From b1c34897ed7f4f9873d1f1d2e1e8d5e997840d6f Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Tue, 17 Dec 2024 09:44:46 +0000
Subject: [PATCH 6/9] Remove -force-target-instruction-cost
---
.../AArch64/partial-reduce-dot-product.ll | 652 +++++++++++-------
1 file changed, 394 insertions(+), 258 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 7a63fe128160a5..d79b6af97a9020 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
-; RUN: opt -passes=loop-vectorize -force-target-instruction-cost=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -10,31 +10,40 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-LABEL: define i32 @dotp(
; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY1:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP8:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP14:%.*]], [[VECTOR_BODY1]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX1]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]]
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP23]], [[TMP19]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP9]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
-; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP24]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP18]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
-; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
-; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: for.body:
@@ -58,41 +67,54 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY1:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP15:%.*]], [[VECTOR_BODY1]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP14:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP23:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP24:%.*]], [[VECTOR_BODY1]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX1]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]]
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP20]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP28]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul <16 x i32> [[TMP11]], [[TMP25]]
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul <16 x i32> [[TMP12]], [[TMP26]]
-; CHECK-INTERLEAVED-NEXT: [[TMP15]] = add <16 x i32> [[TMP13]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP29]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32
-; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP27]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP29]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
; CHECK-INTERLEAVED: scalar.ph:
-; CHECK-INTERLEAVED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: for.body:
@@ -546,35 +568,35 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 16 x i32> [[TMP15]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> [[VECTOR_RECUR]], <vscale x 16 x i32> [[TMP16]], i32 -1)
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 16 x i32> [[TMP16]], [[TMP17]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1)
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -583,46 +605,46 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP18]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP21]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 16 x i32> [[TMP22]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25]] = mul <vscale x 16 x i32> [[TMP23]], [[TMP16]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> [[TMP24]], <vscale x 16 x i32> [[TMP25]], i32 -1)
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 16 x i32> [[TMP25]], [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1)
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -691,34 +713,34 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul <vscale x 16 x i32> [[TMP15]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add <vscale x 16 x i32> [[TMP16]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -727,38 +749,38 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP20]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 16 x i32> [[TMP21]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 16 x i32> [[TMP22]], [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -825,18 +847,24 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled(
; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul i64 [[TMP20]], 4
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
@@ -850,56 +878,66 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = add <16 x i32> [[TMP16]], [[VEC_PHI3]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = mul nsw <vscale x 4 x i32> [[TMP21]], [[TMP36]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP38]], [[VEC_PHI3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP34]] = add <16 x i32> [[TMP21]], [[VEC_PHI2]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD6]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <vscale x 4 x i32> [[TMP25]], [[TMP42]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = add <vscale x 4 x i32> [[TMP28]], [[VEC_PHI2]]
; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD7]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP33]] = add <16 x i32> [[TMP26]], [[VEC_PHI1]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <vscale x 4 x i32> [[TMP31]], [[TMP33]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP35]] = add <vscale x 4 x i32> [[TMP34]], [[VEC_PHI1]]
; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP35]] = add <16 x i32> [[TMP31]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = mul nsw <vscale x 4 x i32> [[TMP37]], [[TMP39]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP41]] = add <vscale x 4 x i32> [[TMP40]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]]
; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP34]], 8
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
@@ -913,38 +951,86 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP56]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP36]] = add <16 x i32> [[TMP16]], [[VEC_PHI3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP72]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <vscale x 4 x i32> [[TMP28]], [[TMP66]]
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 4 x i32> [[TMP82]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP50]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add <vscale x 4 x i32> [[TMP31]], [[VEC_PHI7]]
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = mul i64 [[TMP35]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP36]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 4 x i8>, ptr [[TMP37]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD11]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD12]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[TMP34]] = add <16 x i32> [[TMP21]], [[VEC_PHI2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP42]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <vscale x 4 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 4 x i8>, ptr [[TMP43]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD13]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD14]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = mul nsw <vscale x 4 x i32> [[TMP38]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = mul nsw <vscale x 4 x i32> [[TMP39]], [[TMP45]]
+; CHECK-INTERLEAVED-NEXT: [[TMP48]] = add <vscale x 4 x i32> [[TMP46]], [[VEC_PHI4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP49]] = add <vscale x 4 x i32> [[TMP47]], [[VEC_PHI5]]
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP52]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <vscale x 4 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x i8>, ptr [[TMP53]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD15]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD16]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
-; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add <16 x i32> [[TMP26]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP58]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD17:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 4 x i8>, ptr [[TMP59]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD17]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD18]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nsw <vscale x 4 x i32> [[TMP54]], [[TMP60]]
+; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw <vscale x 4 x i32> [[TMP55]], [[TMP61]]
+; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add <vscale x 4 x i32> [[TMP62]], [[VEC_PHI2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP65]] = add <vscale x 4 x i32> [[TMP63]], [[VEC_PHI3]]
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = mul i64 [[TMP67]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP68]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <vscale x 4 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 4 x i8>, ptr [[TMP69]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD19]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD20]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
-; CHECK-INTERLEAVED-NEXT: [[TMP35]] = add <16 x i32> [[TMP31]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = mul i64 [[TMP73]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP74]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <vscale x 4 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD22:%.*]] = load <vscale x 4 x i8>, ptr [[TMP75]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD21]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD22]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = mul nsw <vscale x 4 x i32> [[TMP70]], [[TMP76]]
+; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = mul nsw <vscale x 4 x i32> [[TMP71]], [[TMP77]]
+; CHECK-INTERLEAVED-NEXT: [[TMP80]] = add <vscale x 4 x i32> [[TMP78]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP81]] = add <vscale x 4 x i32> [[TMP79]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP40]]
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
;
@@ -1076,72 +1162,88 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated(
; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP7]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul i64 [[TMP11]], 4
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP9]] = add <16 x i32> [[TMP7]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <vscale x 4 x i32> [[TMP12]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP9]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated(
; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP14]]
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP15]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP4]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP10]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP15]] = add <16 x i32> [[TMP11]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <vscale x 4 x i32> [[TMP19]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw <vscale x 4 x i32> [[TMP20]], [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP21]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
;
@@ -1206,70 +1308,84 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
+; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], 16
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
-; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[N]])
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP6]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add <16 x i32> [[TMP10]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP12]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP11]], <16 x i32> [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP2]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP13]], i32 0
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
+; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i64 [[N]], 16
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
-; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[N]])
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add <16 x i32> [[TMP10]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP11]], <16 x i32> [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP2]])
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP13]], i32 0
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1339,59 +1455,79 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user(
; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP11]], 4
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP10]] = add <16 x i32> [[TMP9]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP14]], 8
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP8]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul <16 x i32> [[TMP11]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul <16 x i32> [[TMP12]], [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP15]] = add <16 x i32> [[TMP13]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP16]] = add <16 x i32> [[TMP14]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP20]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP21]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
>From 16bf8192ecc8aee7bf2c91f18a78a2bd78d19460 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Tue, 17 Dec 2024 10:24:49 +0000
Subject: [PATCH 7/9] Return true from planContainsAdditionalSimplifications
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ebc0a5c5a89ce3..52aabc58767f77 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7575,11 +7575,9 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
}
continue;
}
- // A partial reduction is very differently costed compared to its
- // underlying add instruction that the legacy cost model sees, so consider
- // it an additional simplification.
+ // The VPlan-based cost model is more accurate for partial reduction and comparing against the legacy cost isn't desirable.
if (dyn_cast<VPPartialReductionRecipe>(&R))
- continue;
+ return true;
if (Instruction *UI = GetInstructionForCost(&R))
SeenInstrs.insert(UI);
}
>From ab3e00861a1c6f87f5d5cae914655b0bf8738636 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Tue, 17 Dec 2024 11:17:21 +0000
Subject: [PATCH 8/9] Add cost disagreement test
---
.../Transforms/Vectorize/LoopVectorize.cpp | 3 +-
.../AArch64/partial-reduce-dot-product.ll | 142 ++++++++++++++++++
2 files changed, 144 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 52aabc58767f77..3616c25f5c244d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7575,7 +7575,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
}
continue;
}
- // The VPlan-based cost model is more accurate for partial reduction and comparing against the legacy cost isn't desirable.
+ // The VPlan-based cost model is more accurate for partial reduction and
+ // comparing against the legacy cost isn't desirable.
if (dyn_cast<VPPartialReductionRecipe>(&R))
return true;
if (Instruction *UI = GetInstructionForCost(&R))
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index d79b6af97a9020..67583ba6334291 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1584,6 +1584,148 @@ for.exit: ; preds = %for.body
ret i32 %result
}
+define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i64 @dotp_cost_disagreement(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD1]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP13]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP20]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP21]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 2 x i64> [[VEC_PHI1]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP9]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i64> @llvm.experimental.vector.partial.reduce.add.nxv1i64.nxv8i64(<vscale x 1 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[TMP14]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+ %sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %i.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i64
+ %i.iv.next = add nuw nsw i64 %i.iv, 1
+ %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next
+ %1 = load i8, ptr %arrayidx2, align 1
+ %conv3 = zext i8 %1 to i64
+ %mul = mul nuw nsw i64 %conv3, %conv
+ %add = add i64 %sum, %mul
+ %exitcond.not = icmp eq i64 %i.iv.next, 16
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body
+ ret i64 %add
+}
+
!7 = distinct !{!7, !8, !9, !10}
!8 = !{!"llvm.loop.mustprogress"}
!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
>From d5d2c0d63eaa5d5dac88e988da2e41caa53ae0e0 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Tue, 17 Dec 2024 17:07:45 +0000
Subject: [PATCH 9/9] [LoopVectorizer] Add support for chaining partial
reductions
---
.../AArch64/AArch64TargetTransformInfo.h | 2 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 52 +-
.../Transforms/Vectorize/VPRecipeBuilder.h | 4 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 5 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 14 +-
.../AArch64/partial-reduction-chained.ll | 568 ++++++++++++++++++
6 files changed, 619 insertions(+), 26 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/partial-reduction-chained.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 2a31cacc203f44..76f9f14c35dbc7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -368,7 +368,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost Invalid = InstructionCost::getInvalid();
InstructionCost Cost(TTI::TCC_Basic);
- if (Opcode != Instruction::Add)
+ if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
return Invalid;
EVT InputEVT = EVT::getEVT(InputType);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3616c25f5c244d..83a5b793a1f567 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8799,12 +8799,10 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
/// are valid so recipes can be formed later.
void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
// Find all possible partial reductions.
- SmallVector<std::pair<PartialReductionChain, unsigned>, 1>
+ SmallVector<std::pair<PartialReductionChain, unsigned>>
PartialReductionChains;
for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
- if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
- getScaledReduction(Phi, RdxDesc, Range))
- PartialReductionChains.push_back(*Pair);
+ PartialReductionChains.append(getScaledReduction(Phi, RdxDesc.getLoopExitInstr(), Range));
// A partial reduction is invalid if any of its extends are used by
// something that isn't another partial reduction. This is because the
@@ -8832,48 +8830,65 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
}
}
-std::optional<std::pair<PartialReductionChain, unsigned>>
-VPRecipeBuilder::getScaledReduction(PHINode *PHI,
- const RecurrenceDescriptor &Rdx,
+SmallVector<std::pair<PartialReductionChain, unsigned>>
+VPRecipeBuilder::getScaledReduction(Instruction *PHI,
+ Instruction *RdxExitInstr,
VFRange &Range) {
+ SmallVector<std::pair<PartialReductionChain, unsigned>> Chains;
+
+ if(!CM.TheLoop->contains(RdxExitInstr))
+ return Chains;
+
// TODO: Allow scaling reductions when predicating. The select at
// the end of the loop chooses between the phi value and most recent
// reduction result, both of which have different VFs to the active lane
// mask when scaling.
- if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
- return std::nullopt;
+ if (CM.blockNeedsPredicationForAnyReason(RdxExitInstr->getParent()))
+ return Chains;
- auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
+ auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
if (!Update)
- return std::nullopt;
+ return Chains;
Value *Op = Update->getOperand(0);
if (Op == PHI)
Op = Update->getOperand(1);
+ if (auto *OpInst = dyn_cast<Instruction>(Op)) {
+ auto SR0 = getScaledReduction(PHI, OpInst, Range);
+ if(!SR0.empty()) {
+ Chains.append(SR0);
+ PHI = SR0.rbegin()->first.Reduction;
+
+ Op = Update->getOperand(0);
+ if (Op == PHI)
+ Op = Update->getOperand(1);
+ }
+ }
+
auto *BinOp = dyn_cast<BinaryOperator>(Op);
if (!BinOp || !BinOp->hasOneUse())
- return std::nullopt;
+ return Chains;
using namespace llvm::PatternMatch;
Value *A, *B;
if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
!match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
- return std::nullopt;
+ return Chains;
Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
// Check that the extends extend from the same type.
if (A->getType() != B->getType())
- return std::nullopt;
+ return Chains;
TTI::PartialReductionExtendKind OpAExtend =
TargetTransformInfo::getPartialReductionExtendKind(ExtA);
TTI::PartialReductionExtendKind OpBExtend =
TargetTransformInfo::getPartialReductionExtendKind(ExtB);
- PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp);
+ PartialReductionChain Chain(RdxExitInstr, ExtA, ExtB, BinOp);
unsigned TargetScaleFactor =
PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
@@ -8887,9 +8902,9 @@ VPRecipeBuilder::getScaledReduction(PHINode *PHI,
return Cost.isValid();
},
Range))
- return std::make_pair(Chain, TargetScaleFactor);
+ Chains.push_back(std::make_pair(Chain, TargetScaleFactor));
- return std::nullopt;
+ return Chains;
}
VPRecipeBase *
@@ -8986,7 +9001,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
VPValue *BinOp = Operands[0];
VPValue *Phi = Operands[1];
- if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe()))
+ VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
+ if (isa<VPReductionPHIRecipe>(BinOpRecipe) || isa<VPPartialReductionRecipe>(BinOpRecipe))
std::swap(BinOp, Phi);
return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi,
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index cf653e2d3e6584..81c4d6a21768b0 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -144,8 +144,8 @@ class VPRecipeBuilder {
/// Returns null if no scaled reduction was found, otherwise a pair with a
/// struct containing reduction information and the scaling factor between the
/// number of elements in the input and output.
- std::optional<std::pair<PartialReductionChain, unsigned>>
- getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx,
+ SmallVector<std::pair<PartialReductionChain, unsigned>>
+ getScaledReduction(Instruction *PHI, Instruction *RdxExitInstr,
VFRange &Range);
public:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 2dc388ac0248f5..ed10b4ad58c95d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2463,12 +2463,13 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
: VPSingleDefRecipe(VPDef::VPPartialReductionSC,
ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
Opcode(Opcode) {
- assert(isa<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe()) &&
+ auto *DefiningRecipe = getOperand(1)->getDefiningRecipe();
+ assert((isa<VPReductionPHIRecipe>(DefiningRecipe) || isa<VPPartialReductionRecipe>(DefiningRecipe)) &&
"Unexpected operand order for partial reduction recipe");
}
~VPPartialReductionRecipe() override = default;
VPPartialReductionRecipe *clone() override {
- return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1));
+ return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), getUnderlyingInstr());
}
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index e92d709618f5ed..37ae217774d1b7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -326,13 +326,21 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
auto &Builder = State.Builder;
- assert(getOpcode() == Instruction::Add &&
- "Unhandled partial reduction opcode");
-
Value *BinOpVal = State.get(getOperand(0));
Value *PhiVal = State.get(getOperand(1));
assert(PhiVal && BinOpVal && "Phi and Mul must be set");
+ auto Opcode = getOpcode();
+
+ // Currently we don't have a partial_reduce_sub intrinsic,
+ // so mimic the behaviour by negating the second operand
+ if(Opcode == Instruction::Sub) {
+ BinOpVal = Builder.CreateSub(Constant::getNullValue(BinOpVal->getType()), BinOpVal);
+ Opcode = Instruction::Add;
+ }
+
+ assert(Opcode == Instruction::Add && "Unhandled partial reduction opcode");
+
Type *RetTy = PhiVal->getType();
CallInst *V = Builder.CreateIntrinsic(
diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll b/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll
new file mode 100644
index 00000000000000..96602baea16fe5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll
@@ -0,0 +1,568 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -o - %s --passes=loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -scalable-vectorization=on | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) vscale_range(1,16)
+define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-LABEL: define i32 @chained_partial_reduce_add_sub(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP18]]
+; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP29]])
+; CHECK-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP27]]
+; CHECK-NEXT: [[TMP33:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP31]]
+; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]], <vscale x 16 x i32> [[TMP33]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE9]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
+; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AC]]
+; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]]
+; CHECK-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_DB]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %res.0.lcssa = phi i32 [ %sub, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %sub, %for.body ]
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %add = add nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %sub = sub i32 %add, %mul.ac
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-LABEL: define i32 @chained_partial_reduce_add_add(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP18]]
+; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP29]])
+; CHECK-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP27]]
+; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]], <vscale x 16 x i32> [[TMP31]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE9]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
+; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AC]]
+; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]]
+; CHECK-NEXT: [[SUB]] = add i32 [[ADD]], [[MUL_DB]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %res.0.lcssa = phi i32 [ %add.2, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %add.2, %for.body ]
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %add = add nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %add.2 = add i32 %add, %mul.ac
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-LABEL: define i32 @chained_partial_reduce_sub_add(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP18]]
+; CHECK-NEXT: [[TMP31:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP29]]
+; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP31]])
+; CHECK-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP27]]
+; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]], <vscale x 16 x i32> [[TMP33]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE9]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY1]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
+; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]]
+; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]]
+; CHECK-NEXT: [[SUB]] = add i32 [[SUB1]], [[MUL_DB]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %res.0.lcssa = phi i32 [ %add, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %add, %for.body ]
+
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %sub = sub nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %add = add i32 %sub, %mul.ac
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-LABEL: define i32 @chained_partial_reduce_sub_sub(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP18]]
+; CHECK-NEXT: [[TMP31:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP29]]
+; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP31]])
+; CHECK-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP25]], [[TMP27]]
+; CHECK-NEXT: [[TMP35:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP33]]
+; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]], <vscale x 16 x i32> [[TMP35]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE9]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY1]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32
+; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
+; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]]
+; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]]
+; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]]
+; CHECK-NEXT: [[ADD]] = sub i32 [[SUB]], [[MUL_DB]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %res.0.lcssa = phi i32 [ %sub.2, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]
+
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %sub = sub nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %sub.2 = sub i32 %sub, %mul.ac
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
+; CHECK-NEXT: [[TMP19:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP21:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP22:%.*]] = mul nsw <vscale x 16 x i32> [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[TMP23:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP22]]
+; CHECK-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP23]])
+; CHECK-NEXT: [[TMP24:%.*]] = mul nsw <vscale x 16 x i32> [[TMP19]], [[TMP21]]
+; CHECK-NEXT: [[PARTIAL_REDUCE4:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP24]])
+; CHECK-NEXT: [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP26:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP25]]
+; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE4]], <vscale x 16 x i32> [[TMP26]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE5]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
+; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
+; CHECK-NEXT: [[E_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1
+; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32
+; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32
+; CHECK-NEXT: [[E_EXT:%.*]] = sext i8 [[E_VAL]] to i32
+; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[C_EXT]], [[D_EXT]]
+; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]]
+; CHECK-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[C_EXT]], [[E_EXT]]
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AB]]
+; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[D_EXT]], [[E_EXT]]
+; CHECK-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_DB]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %res.0.lcssa = phi i32 [ %sub.2, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]
+
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %sub = sub nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %add = add nsw i32 %sub, %mul.ac
+ %mul.bc = mul nsw i32 %b.ext, %c.ext
+ %sub.2 = sub i32 %add, %mul.bc
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+attributes #0 = { mustprogress noinline nounwind vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-x3" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" }
+
+
+!0 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!1 = distinct !{!0}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META4]] = distinct !{[[META5:![0-9]+]]}
+; CHECK: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+;.
More information about the llvm-commits
mailing list