[llvm] [LoopVectorizer] Add support for partial reductions (PR #92418)
Sam Tebbs via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 15 05:50:13 PDT 2024
https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/92418
From 03ce622ed017db24733ebecd41d5c4ccdac8a50e Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Fri, 17 May 2024 11:17:26 +0100
Subject: [PATCH 1/4] [LoopVectorizer] Add support for partial reductions
---
.../llvm/Analysis/TargetTransformInfo.h | 33 +
.../llvm/Analysis/TargetTransformInfoImpl.h | 9 +
llvm/lib/Analysis/TargetTransformInfo.cpp | 8 +
.../AArch64/AArch64TargetTransformInfo.h | 37 +
.../Transforms/Vectorize/LoopVectorize.cpp | 256 +++-
.../Transforms/Vectorize/VPRecipeBuilder.h | 4 +
llvm/lib/Transforms/Vectorize/VPlan.h | 52 +-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 7 +-
llvm/lib/Transforms/Vectorize/VPlanAnalysis.h | 2 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 44 +-
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
.../CodeGen/AArch64/partial-reduce-sdot.ll | 99 ++
.../AArch64/partial-reduce-dot-product.ll | 1322 +++++++++++++++++
.../LoopVectorize/AArch64/vplan-printing.ll | 77 +
14 files changed, 1935 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0459941fe05cdc..f4c3535438c4da 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -24,6 +24,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/IR/FMF.h"
#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
@@ -210,6 +211,17 @@ typedef TargetTransformInfo TTI;
/// for IR-level transformations.
class TargetTransformInfo {
public:
+ enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend };
+
+ static PartialReductionExtendKind
+ getPartialReductionExtendKind(Instruction *I) {
+ if (isa<SExtInst>(I))
+ return PR_SignExtend;
+ if (isa<ZExtInst>(I))
+ return PR_ZeroExtend;
+ return PR_None;
+ }
+
/// Construct a TTI object using a type implementing the \c Concept
/// API below.
///
@@ -1260,6 +1272,12 @@ class TargetTransformInfo {
/// \return if target want to issue a prefetch in address space \p AS.
bool shouldPrefetchAddressSpace(unsigned AS) const;
+ InstructionCost
+ getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
+ ElementCount VF, PartialReductionExtendKind OpAExtend,
+ PartialReductionExtendKind OpBExtend,
+ std::optional<unsigned> BinOp = std::nullopt) const;
+
/// \return The maximum interleave factor that any transform should try to
/// perform for this target. This number depends on the level of parallelism
/// and the number of execution units in the CPU.
@@ -2063,6 +2081,12 @@ class TargetTransformInfo::Concept {
/// \return if target want to issue a prefetch in address space \p AS.
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;
+ virtual InstructionCost
+ getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
+ ElementCount VF, PartialReductionExtendKind OpAExtend,
+ PartialReductionExtendKind OpBExtend,
+ std::optional<unsigned> BinOp) const = 0;
+
virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0;
virtual InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
@@ -2716,6 +2740,15 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.shouldPrefetchAddressSpace(AS);
}
+ InstructionCost getPartialReductionCost(
+ unsigned Opcode, Type *InputType, Type *AccumType, ElementCount VF,
+ PartialReductionExtendKind OpAExtend,
+ PartialReductionExtendKind OpBExtend,
+ std::optional<unsigned> BinOp = std::nullopt) const override {
+ return Impl.getPartialReductionCost(Opcode, InputType, AccumType, VF,
+ OpAExtend, OpBExtend, BinOp);
+ }
+
unsigned getMaxInterleaveFactor(ElementCount VF) override {
return Impl.getMaxInterleaveFactor(VF);
}
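For readers skimming the header changes: the new hook is queried from the loop vectorizer's cost model (see addPartialReductionIfSupported further down). Below is a minimal sketch of that call, assuming the caller already has the two extend instructions, the accumulator phi and a candidate VF in hand; the helper name queryPartialReductionCost is illustrative and not part of the patch.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;

// Sketch: classify the two extends, then ask the target whether an
// add-of-mul-of-extends chain with these types has a valid partial-reduction
// cost for the given VF. An invalid cost means "do not form one".
static InstructionCost
queryPartialReductionCost(const TargetTransformInfo &TTI, Instruction *ExtA,
                          Instruction *ExtB, PHINode *Phi, ElementCount VF) {
  auto KindA = TargetTransformInfo::getPartialReductionExtendKind(ExtA);
  auto KindB = TargetTransformInfo::getPartialReductionExtendKind(ExtB);
  return TTI.getPartialReductionCost(
      Instruction::Add, /*InputType=*/ExtA->getOperand(0)->getType(),
      /*AccumType=*/Phi->getType(), VF, KindA, KindB,
      /*BinOp=*/std::make_optional<unsigned>(Instruction::Mul));
}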
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index dbdfb4d8cdfa32..57b3ca3587bcb2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -570,6 +570,15 @@ class TargetTransformInfoImplBase {
bool enableWritePrefetching() const { return false; }
bool shouldPrefetchAddressSpace(unsigned AS) const { return !AS; }
+ InstructionCost
+ getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
+ ElementCount VF,
+ TTI::PartialReductionExtendKind OpAExtend,
+ TTI::PartialReductionExtendKind OpBExtend,
+ std::optional<unsigned> BinOp = std::nullopt) const {
+ return InstructionCost::getInvalid();
+ }
+
unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
InstructionCost getArithmeticInstrCost(
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index a47462b61e03b2..ac1879cdd4b864 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -850,6 +850,14 @@ bool TargetTransformInfo::shouldPrefetchAddressSpace(unsigned AS) const {
return TTIImpl->shouldPrefetchAddressSpace(AS);
}
+InstructionCost TargetTransformInfo::getPartialReductionCost(
+ unsigned Opcode, Type *InputType, Type *AccumType, ElementCount VF,
+ PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend,
+ std::optional<unsigned> BinOp) const {
+ return TTIImpl->getPartialReductionCost(Opcode, InputType, AccumType, VF,
+ OpAExtend, OpBExtend, BinOp);
+}
+
unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
return TTIImpl->getMaxInterleaveFactor(VF);
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 1d09d67f6ec9e3..a2b6fbbd6bb824 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -342,6 +342,43 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return BaseT::isLegalNTLoad(DataType, Alignment);
}
+ InstructionCost
+ getPartialReductionCost(unsigned Opcode, Type *InputType, Type *AccumType,
+ ElementCount VF,
+ TTI::PartialReductionExtendKind OpAExtend,
+ TTI::PartialReductionExtendKind OpBExtend,
+ std::optional<unsigned> BinOp) const {
+ InstructionCost Invalid = InstructionCost::getInvalid();
+
+ if (Opcode != Instruction::Add)
+ return Invalid;
+
+ EVT InputEVT = EVT::getEVT(InputType);
+ EVT AccumEVT = EVT::getEVT(AccumType);
+
+ if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
+ return Invalid;
+ if (VF.isFixed() && !ST->isNeonAvailable() && !ST->hasDotProd())
+ return Invalid;
+
+ if (InputEVT == MVT::i8) {
+ if (AccumEVT != MVT::i32)
+ return Invalid;
+ } else if (InputEVT == MVT::i16) {
+ if (AccumEVT != MVT::i64)
+ return Invalid;
+ } else
+ return Invalid;
+
+ if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None)
+ return Invalid;
+
+ if (!BinOp || (*BinOp) != Instruction::Mul)
+ return Invalid;
+
+ return InstructionCost::getMin();
+ }
+
bool enableOrderedReductions() const { return true; }
InstructionCost getInterleavedMemoryOpCost(
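In short, the AArch64 hook only prices the two dot-product shapes. A rough standalone restatement of the conditions above, phrased in bit widths rather than EVTs; the predicate name is illustrative, and the real hook additionally consults the subtarget (SVE/streaming SVE for scalable VFs, NEON or the dot-product feature for fixed VFs).

#include "llvm/IR/Instruction.h"
using namespace llvm;

// Sketch: the chain must be add(mul(ext, ext), acc) with both operands
// extended, and only i8 -> i32 and i16 -> i64 accumulation is accepted.
static bool isSupportedPartialReductionShape(unsigned Opcode,
                                             unsigned InputBits,
                                             unsigned AccumBits,
                                             bool BothOperandsExtended,
                                             bool InnerOpIsMul) {
  if (Opcode != Instruction::Add || !BothOperandsExtended || !InnerOpIsMul)
    return false;
  return (InputBits == 8 && AccumBits == 32) ||
         (InputBits == 16 && AccumBits == 64);
}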
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 027ee21527d228..523614b5d835b7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1520,7 +1520,139 @@ class LoopVectorizationCostModel {
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
TTI::TargetCostKind CostKind) const;
+ /// A chain of instructions that form a partial reduction.
+  /// Designed to match: reduction_bin_op (bin_op (extend (A), extend (B)),
+  /// accumulator)
+ struct PartialReductionChain {
+ /// The top-level binary operation that forms the reduction to a scalar
+ /// after the loop body
+ Instruction *Reduction;
+ /// The inner binary operation that forms the reduction to a vector value
+ /// within the loop body
+ Instruction *BinOp;
+ /// The extension of each of the inner binary operation's operands
+ Instruction *ExtendA;
+ Instruction *ExtendB;
+
+ /// The accumulator that is reduced to a scalar after the loop body
+ Value *Accumulator;
+
+ /// The scaling factor between the size of the reduction type and the
+ /// (possibly extended) inputs
+ unsigned ScaleFactor;
+ };
+
+ using PartialReductionList = DenseMap<Instruction *, PartialReductionChain>;
+
+ PartialReductionList getPartialReductionChains() {
+ return PartialReductionChains;
+ }
+
+ std::optional<PartialReductionChain>
+ getInstructionsPartialReduction(Instruction *I) const {
+ auto PairIt = PartialReductionChains.find(I);
+ if (PairIt == PartialReductionChains.end())
+ return std::nullopt;
+ return PairIt->second;
+ }
+
+ void removePartialReduction(Instruction *Instr) {
+ PartialReductionChains.erase(Instr);
+ }
+
+ void addPartialReductionIfSupported(Instruction *Instr, ElementCount VF) {
+
+ // Try to commutatively match:
+ // bin_op (one_use bin_op (z_or_sext, z_or_sext), phi)
+
+ auto *Root = dyn_cast<BinaryOperator>(Instr);
+ if (!Root)
+ return;
+
+ auto *BinOp = dyn_cast<BinaryOperator>(Root->getOperand(0));
+ auto *Phi = dyn_cast<PHINode>(Root->getOperand(1));
+ if (!BinOp) {
+ BinOp = dyn_cast<BinaryOperator>(Root->getOperand(1));
+ Phi = dyn_cast<PHINode>(Root->getOperand(0));
+ }
+ if (!BinOp || !BinOp->hasOneUse()) {
+ LLVM_DEBUG(
+ dbgs() << "Root was not a one-use binary operator, cannot create a "
+ "partial reduction.\n");
+ return;
+ }
+ if (!Phi) {
+ LLVM_DEBUG(dbgs() << "Expected Phi node was not a phi, cannot create a "
+ "partial reduction.\n");
+ return;
+ }
+
+ auto IsSextOrZext = [](Instruction *I) {
+ return I && (I->getOpcode() == Instruction::ZExt ||
+ I->getOpcode() == Instruction::SExt);
+ };
+
+ auto *ExtA = dyn_cast<Instruction>(BinOp->getOperand(0));
+ auto *ExtB = dyn_cast<Instruction>(BinOp->getOperand(1));
+ if (!IsSextOrZext(ExtA) || !IsSextOrZext(ExtB)) {
+ LLVM_DEBUG(dbgs() << "Expected extends were not extends, cannot create a "
+ "partial reduction.\n");
+ return;
+ }
+
+ Value *A = ExtA->getOperand(0);
+ Value *B = ExtB->getOperand(0);
+ // Check that the extends extend from the same type
+ if (A->getType() != B->getType()) {
+ LLVM_DEBUG(dbgs() << "Extends don't extend from the same type, cannot "
+ "create a partial reduction.\n");
+ return;
+ }
+
+ // Check that the extends extend to the same type
+ if (ExtA->getType() != ExtB->getType()) {
+ LLVM_DEBUG(
+ dbgs() << "Extends don't extend to the same type, cannot create "
+ "a partial reduction.\n");
+ return;
+ }
+
+ // Check that the second phi value is the instruction we're looking at
+ Instruction *MaybeAdd = dyn_cast<Instruction>(
+ Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
+ if (!MaybeAdd || MaybeAdd != Instr) {
+ LLVM_DEBUG(dbgs() << "Second PHI value is not the root binop, cannot "
+ "create a partial reduction.\n");
+ return;
+ }
+
+ TTI::PartialReductionExtendKind OpAExtend =
+ TargetTransformInfo::getPartialReductionExtendKind(ExtA);
+ TTI::PartialReductionExtendKind OpBExtend =
+ TargetTransformInfo::getPartialReductionExtendKind(ExtB);
+ InstructionCost Cost = TTI.getPartialReductionCost(
+ Instr->getOpcode(), A->getType(), Phi->getType(), VF, OpAExtend,
+ OpBExtend, std::make_optional(BinOp->getOpcode()));
+ if (Cost == InstructionCost::getInvalid())
+ return;
+
+ PartialReductionChain Chain;
+ Chain.Reduction = Instr;
+ Chain.BinOp = BinOp;
+ Chain.ExtendA = ExtA;
+ Chain.ExtendB = ExtB;
+ Chain.Accumulator = Phi;
+
+ unsigned InputSizeBits = A->getType()->getScalarSizeInBits();
+ unsigned ResultSizeBits = Chain.Reduction->getType()->getScalarSizeInBits();
+ Chain.ScaleFactor = ResultSizeBits / InputSizeBits;
+
+ PartialReductionChains[Instr] = Chain;
+ }
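For reference, the source-level loop shape this matcher targets is an 8-bit dot product accumulated into 32 bits, i.e. the same pattern exercised by the new tests further down. A small illustrative C++ version (not taken from the patch):

#include <cstdint>

// The two uint8_t -> int32_t conversions are the extends (zero-extends here),
// the multiply is the one-use inner bin-op, and `acc` is the accumulator phi
// that the top-level add feeds back into across iterations.
int32_t dotp(const uint8_t *a, const uint8_t *b, int64_t n) {
  int32_t acc = 0;
  for (int64_t i = 0; i < n; ++i)
    acc += int32_t(a[i]) * int32_t(b[i]);
  return acc;
}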
+
private:
+ PartialReductionList PartialReductionChains;
+
unsigned NumPredStores = 0;
/// \return An upper bound for the vectorization factors for both
@@ -4614,6 +4746,11 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
return false;
}
+ // Prevent epilogue vectorization if a partial reduction is involved
+ // TODO Is there a cleaner way to check this?
+ if (CM.getPartialReductionChains().size() > 0)
+ return false;
+
// Epilogue vectorization code has not been audited to ensure it handles
// non-latch exits properly. It may be fine, but it needs to be audited and
// tested.
@@ -6934,6 +7071,18 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
+
+ // Ignore any values that we know will be flattened
+ for (auto It : getPartialReductionChains()) {
+ PartialReductionChain Chain = It.second;
+ SmallVector<Value *> PartialReductionValues{Chain.Reduction, Chain.BinOp,
+ Chain.ExtendA, Chain.ExtendB,
+ Chain.Accumulator};
+ ValuesToIgnore.insert(PartialReductionValues.begin(),
+ PartialReductionValues.end());
+ VecValuesToIgnore.insert(PartialReductionValues.begin(),
+ PartialReductionValues.end());
+ }
}
void LoopVectorizationCostModel::collectInLoopReductions() {
@@ -7050,6 +7199,47 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
+
+ for (auto ReductionVar : Legal->getReductionVars()) {
+ auto *ReductionExitInstr = ReductionVar.second.getLoopExitInstr();
+ CM.addPartialReductionIfSupported(ReductionExitInstr, UserVF);
+ }
+
+ // Wider-than-legal vector types (coming from extends in partial reductions)
+ // should only be used by partial reductions so that they are lowered properly
+
+ // Build up a set of partial reduction bin ops for efficient use checking
+ SmallSet<Instruction *, 4> PartialReductionBinOps;
+ for (auto It : CM.getPartialReductionChains()) {
+ if (It.second.BinOp)
+ PartialReductionBinOps.insert(It.second.BinOp);
+ }
+
+ auto ExtendIsOnlyUsedByPartialReductions =
+ [PartialReductionBinOps](Instruction *Extend) {
+ for (auto *Use : Extend->users()) {
+ Instruction *UseInstr = dyn_cast<Instruction>(Use);
+ if (!PartialReductionBinOps.contains(UseInstr))
+ return false;
+ }
+ return true;
+ };
+
+ // Check if each use of a chain's two extends is a partial reduction
+ // and remove those that have non-partial reduction users
+ SmallSet<Instruction *, 4> PartialReductionsToRemove;
+ for (auto It : CM.getPartialReductionChains()) {
+ LoopVectorizationCostModel::PartialReductionChain Chain = It.second;
+ if (!ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) ||
+ !ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) {
+ PartialReductionsToRemove.insert(Chain.Reduction);
+ LLVM_DEBUG(dbgs() << "Removing the partial reduction for an instruction "
+ "with an extend used by something other than a "
+ "partial reduction "
+ << *Chain.Reduction << "\n");
+ }
+ }
+
CM.collectValuesToIgnore();
CM.collectElementTypesForWidening();
@@ -7074,6 +7264,23 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
if (CM.foldTailByMasking())
Legal->prepareToFoldTailByMasking();
+ for (auto Pair : CM.getPartialReductionChains()) {
+ // TODO: Allow creating partial reductions when predicating. The select at
+ // the end of the loop chooses between the phi value and most recent partial
+ // reduction result, both of which have different VFs to the active lane
+ // mask.
+ Instruction *Instr = Pair.first;
+ if (CM.blockNeedsPredicationForAnyReason(Instr->getParent())) {
+ LLVM_DEBUG(dbgs() << "LV: Removing the partial reduction for an "
+ "instruction in a predicated block: "
+ << *Instr << "\n");
+ PartialReductionsToRemove.insert(Instr);
+ }
+ }
+
+ for (auto *Insn : PartialReductionsToRemove)
+ CM.removePartialReduction(Insn);
+
ElementCount MaxUserVF =
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
if (UserVF) {
@@ -8593,6 +8800,18 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
return Recipe;
}
+unsigned getScaleFactorForReductionPhi(PHINode *Phi,
+ LoopVectorizationCostModel &CM) {
+ for (auto *User : Phi->users()) {
+ if (auto *I = dyn_cast<Instruction>(User)) {
+ if (auto Chain = CM.getInstructionsPartialReduction(I)) {
+ return Chain->ScaleFactor;
+ }
+ }
+ }
+ return 1;
+}
+
VPRecipeBase *
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
ArrayRef<VPValue *> Operands,
@@ -8617,9 +8836,12 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
Legal->getReductionVars().find(Phi)->second;
assert(RdxDesc.getRecurrenceStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
- PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
- CM.isInLoopReduction(Phi),
- CM.useOrderedReductions(RdxDesc));
+
+ // If the PHI is used by a partial reduction, set the scale factor
+ unsigned ScaleFactor = getScaleFactorForReductionPhi(Phi, CM);
+ PhiRecipe = new VPReductionPHIRecipe(
+ Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
+ CM.useOrderedReductions(RdxDesc), ScaleFactor);
} else {
// TODO: Currently fixed-order recurrences are modeled as chains of
// first-order recurrences. If there are no users of the intermediate
@@ -8671,6 +8893,24 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return tryToWiden(Instr, Operands, VPBB);
}
+VPRecipeBase *
+VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
+ unsigned ScaleFactor,
+ ArrayRef<VPValue *> Operands) {
+ assert(Operands.size() == 2 &&
+ "Unexpected number of operands for partial reduction");
+
+ VPValue *BinOp = Operands[0];
+ VPValue *Phi = Operands[1];
+ VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
+ if (isa<VPReductionPHIRecipe>(BinOpRecipe))
+ std::swap(BinOp, Phi);
+
+ SmallVector<VPValue *, 2> OrderedOperands = {BinOp, Phi};
+ return new VPPartialReductionRecipe(
+ *Reduction, make_range(OrderedOperands.begin(), OrderedOperands.end()));
+}
+
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -9064,8 +9304,14 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
continue;
}
- VPRecipeBase *Recipe =
- RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
+ VPRecipeBase *Recipe = nullptr;
+
+ if (auto Chain = CM.getInstructionsPartialReduction(Instr))
+ Recipe = RecipeBuilder.tryToCreatePartialReduction(
+ Chain->Reduction, Chain->ScaleFactor, Operands);
+ else if (!Recipe)
+ Recipe =
+ RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
if (!Recipe)
Recipe = RecipeBuilder.handleReplication(Instr, Range);
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 5d4a3b555981ce..136766eb455fde 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -125,6 +125,10 @@ class VPRecipeBuilder {
ArrayRef<VPValue *> Operands,
VFRange &Range, VPBasicBlock *VPBB);
+ VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
+ unsigned ScaleFactor,
+ ArrayRef<VPValue *> Operands);
+
/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 6a61ef63c2a054..b06c90daa7d89b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -900,6 +900,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenPointerInductionSC:
case VPRecipeBase::VPReductionPHISC:
case VPRecipeBase::VPScalarCastSC:
+ case VPRecipeBase::VPPartialReductionSC:
return true;
case VPRecipeBase::VPBranchOnMaskSC:
case VPRecipeBase::VPInterleaveSC:
@@ -2219,23 +2220,31 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// The phi is part of an ordered reduction. Requires IsInLoop to be true.
bool IsOrdered;
+ /// The scaling difference between the size of the output of the entire
+ /// reduction and the size of the input.
+
+ /// When expanding the reduction PHI, the plan's VF element count is divided
+ /// by this factor to form the reduction phi's VF.
+ unsigned VFScaleFactor = 1;
+
public:
/// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
/// RdxDesc.
VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
VPValue &Start, bool IsInLoop = false,
- bool IsOrdered = false)
+ bool IsOrdered = false, unsigned VFScaleFactor = 1)
: VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
- RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
+ RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
+ VFScaleFactor(VFScaleFactor) {
assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
}
~VPReductionPHIRecipe() override = default;
VPReductionPHIRecipe *clone() override {
- auto *R =
- new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
- *getOperand(0), IsInLoop, IsOrdered);
+ auto *R = new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()),
+ RdxDesc, *getOperand(0), IsInLoop,
+ IsOrdered, VFScaleFactor);
R->addOperand(getBackedgeValue());
return R;
}
@@ -2266,6 +2275,39 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
bool isInLoop() const { return IsInLoop; }
};
+/// A recipe for forming partial reductions. In the loop, an accumulator and
+/// vector operand are added together and passed to the next iteration as the
+/// next accumulator. After the loop body, the accumulator is reduced to a
+/// scalar value.
+class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
+ unsigned Opcode;
+ Instruction &Reduction;
+
+public:
+ template <typename IterT>
+ VPPartialReductionRecipe(Instruction &I, iterator_range<IterT> Operands)
+ : VPRecipeWithIRFlags(VPDef::VPPartialReductionSC, Operands, I),
+ Opcode(I.getOpcode()), Reduction(I) {
+ assert(isa<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe()) &&
+ "Unexpected operand order for partial reduction recipe");
+ }
+ ~VPPartialReductionRecipe() override = default;
+ VPPartialReductionRecipe *clone() override {
+ auto Ops = operands();
+ return new VPPartialReductionRecipe(Reduction,
+ make_range(Ops.begin(), Ops.end()));
+ }
+ VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
+ /// Generate the reduction in the loop
+ void execute(VPTransformState &State) override;
+ unsigned getOpcode() { return Opcode; }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
/// instructions.
class VPBlendRecipe : public VPSingleDefRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 5a5b3ac19c46ad..b63cce0b7a21ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -233,6 +233,11 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
llvm_unreachable("Unhandled opcode");
}
+Type *
+VPTypeAnalysis::inferScalarTypeForRecipe(const VPPartialReductionRecipe *R) {
+ return R->getUnderlyingInstr()->getType();
+}
+
Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
if (Type *CachedTy = CachedTypes.lookup(V))
return CachedTy;
@@ -266,7 +271,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPWidenEVLRecipe,
VPReplicateRecipe, VPWidenCallRecipe, VPWidenMemoryRecipe,
- VPWidenSelectRecipe>(
+ VPWidenSelectRecipe, VPPartialReductionRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
.Case<VPWidenIntrinsicRecipe>([](const VPWidenIntrinsicRecipe *R) {
return R->getResultType();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index cc21870bee2e3b..a34d9629eff9dd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -27,6 +27,7 @@ struct VPWidenSelectRecipe;
class VPReplicateRecipe;
class VPRecipeBase;
class VPlan;
+class VPPartialReductionRecipe;
class Type;
/// An analysis for type-inference for VPValues.
@@ -53,6 +54,7 @@ class VPTypeAnalysis {
Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R);
Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R);
+ Type *inferScalarTypeForRecipe(const VPPartialReductionRecipe *R);
public:
VPTypeAnalysis(Type *CanonicalIVTy)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 368d6e58a5578e..67dd0a64913951 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -327,6 +327,38 @@ InstructionCost VPSingleDefRecipe::computeCost(ElementCount VF,
return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}
+void VPPartialReductionRecipe::execute(VPTransformState &State) {
+ State.setDebugLocFrom(getDebugLoc());
+ auto &Builder = State.Builder;
+
+ assert(Opcode == Instruction::Add && "Unhandled partial reduction opcode");
+
+ Value *BinOpVal = State.get(getOperand(0), 0);
+ Value *PhiVal = State.get(getOperand(1), 0);
+ assert(PhiVal && BinOpVal && "Phi and Mul must be set");
+
+ Type *RetTy = PhiVal->getType();
+
+ CallInst *V = Builder.CreateIntrinsic(
+ RetTy, Intrinsic::experimental_vector_partial_reduce_add,
+ {PhiVal, BinOpVal}, nullptr, Twine("partial.reduce"));
+
+ // Use this vector value for all users of the original instruction.
+ State.set(this, V, 0);
+ State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "PARTIAL-REDUCE ";
+ printAsOperand(O, SlotTracker);
+ O << " = " << Instruction::getOpcodeName(Opcode);
+ printFlags(O);
+ printOperands(O, SlotTracker);
+}
+#endif
+
FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
assert(OpType == OperationType::FPMathOp &&
"recipe doesn't have fast math flags");
@@ -3174,6 +3206,8 @@ void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
void VPReductionPHIRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
+ auto VF = State.VF.divideCoefficientBy(VFScaleFactor);
+
// Reductions do not have to start at zero. They can start with
// any loop invariant values.
VPValue *StartVPV = getStartValue();
@@ -3183,9 +3217,9 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
// this value when we vectorize all of the instructions that use the PHI.
- bool ScalarPHI = State.VF.isScalar() || IsInLoop;
- Type *VecTy = ScalarPHI ? StartV->getType()
- : VectorType::get(StartV->getType(), State.VF);
+ bool ScalarPHI = VF.isScalar() || IsInLoop;
+ Type *VecTy =
+ ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF);
BasicBlock *HeaderBB = State.CFG.PrevBB;
assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
@@ -3219,13 +3253,13 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
// Create start and identity vector values for the reduction in the
// preheader.
// TODO: Introduce recipes in VPlan preheader to create initial values.
- Iden = Builder.CreateVectorSplat(State.VF, Iden);
+ Iden = Builder.CreateVectorSplat(VF, Iden);
IRBuilderBase::InsertPointGuard IPBuilder(Builder);
Builder.SetInsertPoint(VectorPH->getTerminator());
Constant *Zero = Builder.getInt32(0);
StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
} else {
- Iden = Builder.CreateVectorSplat(State.VF, Iden);
+ Iden = Builder.CreateVectorSplat(VF, Iden);
}
}
}
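A quick worked example of the VFScaleFactor division introduced above, using the numbers from the dot-product tests below: i8 inputs reduced into an i32 accumulator give a scale factor of 32 / 8 = 4, so a plan VF of <vscale x 16> produces the <vscale x 4 x i32> reduction phi seen in the CHECK lines. Sketch only; the helper name is illustrative.

#include "llvm/Support/TypeSize.h"
using namespace llvm;

// For an i8 -> i32 partial reduction, VFScaleFactor = 32 / 8 = 4, so a plan
// VF of <vscale x 16> yields a <vscale x 4> reduction phi.
static ElementCount reductionPhiVF(ElementCount PlanVF, unsigned InputBits,
                                   unsigned ResultBits) {
  unsigned VFScaleFactor = ResultBits / InputBits;  // 4 for i8 -> i32
  return PlanVF.divideCoefficientBy(VFScaleFactor); // <vscale x 4>
}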
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index f2978b0a758b6a..2e39a83dbfd922 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -342,6 +342,7 @@ class VPDef {
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
+ VPPartialReductionSC,
VPReplicateSC,
VPScalarCastSC,
VPScalarIVStepsSC,
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
new file mode 100644
index 00000000000000..fc6e3239a1b43c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define void @dotp(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define void @dotp(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP25]], align 1
+; CHECK-NEXT: [[TMP27:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP29:%.*]] = mul <vscale x 16 x i32> [[TMP27]], [[TMP19]]
+; CHECK-NEXT: [[TMP14]] = add <vscale x 16 x i32> [[TMP29]], [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[TMP20:%.*]] = lshr i32 [[ADD_LCSSA]], 0
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ACC_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP18]] to i32
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP22]] to i32
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[CONV3]], [[CONV]]
+; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACC_010]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %0 = lshr i32 %add, 0
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %acc.010 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr i8, ptr %a, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %1 to i32
+ %arrayidx2 = getelementptr i8, ptr %b, i64 %indvars.iv
+ %2 = load i8, ptr %arrayidx2, align 1
+ %conv3 = zext i8 %2 to i32
+ %mul = mul i32 %conv3, %conv
+ %add = add i32 %mul, %acc.010
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+
+; uselistorder directives
+ uselistorder i32 %add, { 1, 0 }
+}
+
+attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
new file mode 100644
index 00000000000000..bcac4d674123d8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -0,0 +1,1322 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVE1
+; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVED
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @dotp(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP13]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP15]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 16 x i32> [[TMP19]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 16 x i32> [[TMP20]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP22]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %result = lshr i32 %add, 0
+ ret i32 %result
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_different_types(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3
+; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5
+; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6
+; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7
+; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9
+; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10
+; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11
+; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12
+; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13
+; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14
+; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15
+; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-INTERLEAVED-LABEL: define void @not_dotp_different_types(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
+; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]]
+; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
+; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
+; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
+; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
+; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP37]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP38]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP39]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP40]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP41]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP42]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP43]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP44]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP45]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP46]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP47]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP48]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP49]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
+; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
+; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3
+; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4
+; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5
+; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6
+; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7
+; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9
+; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10
+; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11
+; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
+; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
+; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
+; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP56]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP57]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP58]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP59]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP60]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP61]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP62]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP63]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP64]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP65]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
+; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
+; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3
+; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4
+; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5
+; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6
+; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7
+; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9
+; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10
+; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11
+; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13
+; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14
+; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15
+; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]]
+; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]]
+; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
+; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %0 = lshr i32 %add, 0
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i16, ptr %gep.b, align 2
+ %ext.b = zext i16 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
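+; Descriptive comment (not in the generated checks): the accumulator phi below
+; takes %mul rather than %add, so the add is not loop-carried through the phi
+; and the vectorizer falls back to a first-order recurrence (vector.splice)
+; instead of forming a partial reduction.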
+define void @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1)
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1)
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %0 = lshr i32 %add, 0
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
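+; Descriptive comment (not in the generated checks): here the add accumulates
+; into %ext.b rather than into the %accum phi, so there is no reduction chain
+; through the phi and no partial reduction is formed; the checks show only the
+; widened mul/add.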
+define void @not_dotp_not_phi(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_not_phi(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_phi(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP22]], [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %0 = lshr i32 %add, 0
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %ext.b
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
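+; Descriptive comment (not in the generated checks): four independent
+; sign-extended dot-product chains share one loop body; each accumulator phi
+; is lowered to its own llvm.experimental.vector.partial.reduce.add call.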
+define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %w, ptr %scales, ptr %u, ptr %v) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @dotp_unrolled(
+; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[W:%.*]], ptr [[SCALES:%.*]], ptr [[U:%.*]], ptr [[V:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CMP154:%.*]] = icmp sgt i32 [[NUM_OUT]], 3
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP154]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END98:%.*]]
+; CHECK-INTERLEAVE1: for.body.lr.ph:
+; CHECK-INTERLEAVE1-NEXT: [[DIV:%.*]] = sdiv i32 [[NUM_OUT]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = shl nsw i32 [[DIV]], 2
+; CHECK-INTERLEAVE1-NEXT: [[CMP11145:%.*]] = icmp sgt i32 [[NUM_IN]], 0
+; CHECK-INTERLEAVE1-NEXT: [[IDXPROM44:%.*]] = sext i32 [[NUM_IN]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP11145]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK-INTERLEAVE1: for.body.preheader:
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_END98]]
+; CHECK-INTERLEAVE1: for.body.us.preheader:
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY_US:%.*]]
+; CHECK-INTERLEAVE1: for.body.us:
+; CHECK-INTERLEAVE1-NEXT: [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT165:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[INDVARS_IV164]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = or disjoint i64 [[INDVARS_IV164]], 1
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX3_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3_US]], align 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDVARS_IV164]], 2
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX6_US]], align 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[INDVARS_IV164]], 3
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX9_US]], align 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE181:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI172:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE179:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI173:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE177:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI174:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD175:%.*]] = load <vscale x 16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD175]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP17]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI174]], <vscale x 16 x i32> [[TMP21]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD176:%.*]] = load <vscale x 16 x i8>, ptr [[TMP23]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD176]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP20]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE177]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI173]], <vscale x 16 x i32> [[TMP25]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD178:%.*]] = load <vscale x 16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD178]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP28]], [[TMP20]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE179]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI172]], <vscale x 16 x i32> [[TMP29]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD180:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD180]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP32]], [[TMP20]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE181]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP33]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE177_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE177]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE179_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE179]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE181_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE181]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE181_LCSSA]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE179_LCSSA]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE177_LCSSA]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_LCSSA]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX182:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX183:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX184:%.*]] = phi i32 [ [[TMP38]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[BC_RESUME_VAL]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 16
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP39]], [[TMP41]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK7]], label [[SCALAR_PH6:%.*]], label [[VECTOR_PH8:%.*]]
+; CHECK-INTERLEAVE1: vector.ph8:
+; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[TMP39]], [[TMP43]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC10:%.*]] = sub i64 [[TMP39]], [[N_MOD_VF9]]
+; CHECK-INTERLEAVE1-NEXT: [[IND_END:%.*]] = add i64 [[BC_RESUME_VAL]], [[N_VEC10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX182]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX183]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX184]], i32 0
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY12:%.*]]
+; CHECK-INTERLEAVE1: vector.body12:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX13:%.*]] = phi i64 [ 0, [[VECTOR_PH8]] ], [ [[INDEX_NEXT27:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI14:%.*]] = phi <vscale x 4 x i32> [ [[TMP46]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE26:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI15:%.*]] = phi <vscale x 4 x i32> [ [[TMP47]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE24:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI16:%.*]] = phi <vscale x 4 x i32> [ [[TMP48]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE22:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI17:%.*]] = phi <vscale x 4 x i32> [ [[TMP49]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE20:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[BC_RESUME_VAL]], [[INDEX13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP50]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP51]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 16 x i8>, ptr [[TMP52]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD18]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP50]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD19:%.*]] = load <vscale x 16 x i8>, ptr [[TMP55]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD19]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = mul nsw <vscale x 16 x i32> [[TMP56]], [[TMP53]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE20]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI17]], <vscale x 16 x i32> [[TMP57]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP50]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD21:%.*]] = load <vscale x 16 x i8>, ptr [[TMP59]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD21]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = mul nsw <vscale x 16 x i32> [[TMP60]], [[TMP56]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE22]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI16]], <vscale x 16 x i32> [[TMP61]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP50]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[TMP62]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD23:%.*]] = load <vscale x 16 x i8>, ptr [[TMP63]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD23]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = mul nsw <vscale x 16 x i32> [[TMP64]], [[TMP56]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE24]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI15]], <vscale x 16 x i32> [[TMP65]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP50]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr [[TMP66]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD25:%.*]] = load <vscale x 16 x i8>, ptr [[TMP67]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD25]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP69:%.*]] = mul nsw <vscale x 16 x i32> [[TMP68]], [[TMP56]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE26]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI14]], <vscale x 16 x i32> [[TMP69]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT27]] = add nuw i64 [[INDEX13]], [[TMP45]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC10]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY12]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block5:
+; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE26]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE24]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE22]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP74:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE20]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N28:%.*]] = icmp eq i64 [[TMP39]], [[N_VEC10]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N28]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US_LOOPEXIT:%.*]], label [[SCALAR_PH6]]
+;
+; CHECK-INTERLEAVED-LABEL: define void @dotp_unrolled(
+; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[W:%.*]], ptr [[SCALES:%.*]], ptr [[U:%.*]], ptr [[V:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[CMP154:%.*]] = icmp sgt i32 [[NUM_OUT]], 3
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP154]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END98:%.*]]
+; CHECK-INTERLEAVED: for.body.lr.ph:
+; CHECK-INTERLEAVED-NEXT: [[DIV:%.*]] = sdiv i32 [[NUM_OUT]], 4
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = shl nsw i32 [[DIV]], 2
+; CHECK-INTERLEAVED-NEXT: [[CMP11145:%.*]] = icmp sgt i32 [[NUM_IN]], 0
+; CHECK-INTERLEAVED-NEXT: [[IDXPROM44:%.*]] = sext i32 [[NUM_IN]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP11145]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK-INTERLEAVED: for.body.preheader:
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_END98]]
+; CHECK-INTERLEAVED: for.body.us.preheader:
+; CHECK-INTERLEAVED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY_US:%.*]]
+; CHECK-INTERLEAVED: for.body.us:
+; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT165:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[INDVARS_IV164]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = or disjoint i64 [[INDVARS_IV164]], 1
+; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX3_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3_US]], align 8
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDVARS_IV164]], 2
+; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX6_US]], align 8
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[INDVARS_IV164]], 3
+; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX9_US]], align 8
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE181:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI172:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE179:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI173:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE177:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI174:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD175:%.*]] = load <vscale x 16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD175]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI174]], <vscale x 16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD176:%.*]] = load <vscale x 16 x i8>, ptr [[TMP23]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD176]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE177]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI173]], <vscale x 16 x i32> [[TMP25]])
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD178:%.*]] = load <vscale x 16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD178]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP28]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE179]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI172]], <vscale x 16 x i32> [[TMP29]])
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD180:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD180]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP32]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE181]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP33]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE177_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE177]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE179_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE179]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE181_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE181]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE181_LCSSA]])
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE179_LCSSA]])
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE177_LCSSA]])
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_LCSSA]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX182:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX183:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX184:%.*]] = phi i32 [ [[TMP38]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[BC_RESUME_VAL]]
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 32
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP39]], [[TMP41]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK7]], label [[SCALAR_PH6:%.*]], label [[VECTOR_PH8:%.*]]
+; CHECK-INTERLEAVED: vector.ph8:
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[TMP39]], [[TMP43]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC10:%.*]] = sub i64 [[TMP39]], [[N_MOD_VF9]]
+; CHECK-INTERLEAVED-NEXT: [[IND_END:%.*]] = add i64 [[BC_RESUME_VAL]], [[N_VEC10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX182]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX183]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX184]], i32 0
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY12:%.*]]
+; CHECK-INTERLEAVED: vector.body12:
+; CHECK-INTERLEAVED-NEXT: [[INDEX13:%.*]] = phi i64 [ 0, [[VECTOR_PH8]] ], [ [[INDEX_NEXT40:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI14:%.*]] = phi <vscale x 4 x i32> [ [[TMP46]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE38:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI15:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE39:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI16:%.*]] = phi <vscale x 4 x i32> [ [[TMP47]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE34:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI17:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE35:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI18:%.*]] = phi <vscale x 4 x i32> [ [[TMP48]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE30:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI19:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE31:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI20:%.*]] = phi <vscale x 4 x i32> [ [[TMP49]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE26:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI21:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE27:%.*]], [[VECTOR_BODY12]] ]
+; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[BC_RESUME_VAL]], [[INDEX13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP50]]
+; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP51]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP51]], i64 [[TMP54]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD22:%.*]] = load <vscale x 16 x i8>, ptr [[TMP52]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD23:%.*]] = load <vscale x 16 x i8>, ptr [[TMP55]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD22]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD23]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP50]]
+; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i64 [[TMP61]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <vscale x 16 x i8>, ptr [[TMP59]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <vscale x 16 x i8>, ptr [[TMP62]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD24]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD25]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = mul nsw <vscale x 16 x i32> [[TMP63]], [[TMP56]]
+; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = mul nsw <vscale x 16 x i32> [[TMP64]], [[TMP57]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE26]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI20]], <vscale x 16 x i32> [[TMP65]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE27]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI21]], <vscale x 16 x i32> [[TMP66]])
+; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP50]]
+; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[TMP67]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = getelementptr inbounds i8, ptr [[TMP67]], i64 [[TMP70]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD28:%.*]] = load <vscale x 16 x i8>, ptr [[TMP68]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD29:%.*]] = load <vscale x 16 x i8>, ptr [[TMP71]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD28]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD29]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = mul nsw <vscale x 16 x i32> [[TMP72]], [[TMP63]]
+; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = mul nsw <vscale x 16 x i32> [[TMP73]], [[TMP64]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE30]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI18]], <vscale x 16 x i32> [[TMP74]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE31]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI19]], <vscale x 16 x i32> [[TMP75]])
+; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP50]]
+; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = getelementptr inbounds i8, ptr [[TMP76]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = mul i64 [[TMP78]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = getelementptr inbounds i8, ptr [[TMP76]], i64 [[TMP79]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD32:%.*]] = load <vscale x 16 x i8>, ptr [[TMP77]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD33:%.*]] = load <vscale x 16 x i8>, ptr [[TMP80]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD32]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD33]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = mul nsw <vscale x 16 x i32> [[TMP81]], [[TMP63]]
+; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = mul nsw <vscale x 16 x i32> [[TMP82]], [[TMP64]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE34]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI16]], <vscale x 16 x i32> [[TMP83]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE35]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI17]], <vscale x 16 x i32> [[TMP84]])
+; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP50]]
+; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = getelementptr inbounds i8, ptr [[TMP85]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = getelementptr inbounds i8, ptr [[TMP85]], i64 [[TMP88]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD36:%.*]] = load <vscale x 16 x i8>, ptr [[TMP86]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD37:%.*]] = load <vscale x 16 x i8>, ptr [[TMP89]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD36]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD37]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = mul nsw <vscale x 16 x i32> [[TMP90]], [[TMP63]]
+; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = mul nsw <vscale x 16 x i32> [[TMP91]], [[TMP64]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE38]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI14]], <vscale x 16 x i32> [[TMP92]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE39]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI15]], <vscale x 16 x i32> [[TMP93]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT40]] = add nuw i64 [[INDEX13]], [[TMP45]]
+; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = icmp eq i64 [[INDEX_NEXT40]], [[N_VEC10]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP94]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY12]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block5:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE39]], [[PARTIAL_REDUCE38]]
+; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX41:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE35]], [[PARTIAL_REDUCE34]]
+; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX41]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX42:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE31]], [[PARTIAL_REDUCE30]]
+; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX42]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX43:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE27]], [[PARTIAL_REDUCE26]]
+; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX43]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N44:%.*]] = icmp eq i64 [[TMP39]], [[N_VEC10]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N44]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US_LOOPEXIT:%.*]], label [[SCALAR_PH6]]
+;
+entry:
+ %cmp154 = icmp sgt i32 %num_out, 3
+ br i1 %cmp154, label %for.body.lr.ph, label %for.end98
+
+for.body.lr.ph: ; preds = %entry
+ %div = sdiv i32 %num_out, 4
+ %mul = shl nsw i32 %div, 2
+ %cmp11145 = icmp sgt i32 %num_in, 0
+ %idxprom44 = sext i32 %num_in to i64
+ %0 = zext nneg i32 %mul to i64
+ br i1 %cmp11145, label %for.body.us.preheader, label %for.body.preheader
+
+for.body.preheader: ; preds = %for.body.lr.ph
+ br label %for.end98
+
+for.body.us.preheader: ; preds = %for.body.lr.ph
+ %wide.trip.count = zext nneg i32 %num_in to i64
+ br label %for.body.us
+
+for.body.us: ; preds = %for.body.us.preheader, %for.cond10.for.cond.cleanup_crit_edge.us
+ %indvars.iv164 = phi i64 [ 0, %for.body.us.preheader ], [ %indvars.iv.next165, %for.cond10.for.cond.cleanup_crit_edge.us ]
+ %arrayidx.us = getelementptr inbounds ptr, ptr %w, i64 %indvars.iv164
+ %1 = load ptr, ptr %arrayidx.us, align 8
+ %2 = or disjoint i64 %indvars.iv164, 1
+ %arrayidx3.us = getelementptr inbounds ptr, ptr %w, i64 %2
+ %3 = load ptr, ptr %arrayidx3.us, align 8
+ %4 = or disjoint i64 %indvars.iv164, 2
+ %arrayidx6.us = getelementptr inbounds ptr, ptr %w, i64 %4
+ %5 = load ptr, ptr %arrayidx6.us, align 8
+ %6 = or disjoint i64 %indvars.iv164, 3
+ %arrayidx9.us = getelementptr inbounds ptr, ptr %w, i64 %6
+ %7 = load ptr, ptr %arrayidx9.us, align 8
+ %8 = call i64 @llvm.vscale.i64()
+ %9 = mul i64 %8, 16
+ %min.iters.check = icmp ult i64 %wide.trip.count, %9
+ br i1 %min.iters.check, label %scalar.ph, label %vector.ph
+
+vector.ph: ; preds = %for.body.us
+ %10 = call i64 @llvm.vscale.i64()
+ %11 = mul i64 %10, 16
+ %n.mod.vf = urem i64 %wide.trip.count, %11
+ %n.vec = sub i64 %wide.trip.count, %n.mod.vf
+ %12 = call i64 @llvm.vscale.i64()
+ %13 = mul i64 %12, 16
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce181, %vector.body ]
+ %vec.phi172 = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce179, %vector.body ]
+ %vec.phi173 = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce177, %vector.body ]
+ %vec.phi174 = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce, %vector.body ]
+ %14 = add i64 %index, 0
+ %15 = getelementptr inbounds i8, ptr %1, i64 %14
+ %16 = getelementptr inbounds i8, ptr %15, i32 0
+ %wide.load = load <vscale x 16 x i8>, ptr %16, align 1
+ %17 = sext <vscale x 16 x i8> %wide.load to <vscale x 16 x i32>
+ %18 = getelementptr inbounds i8, ptr %u, i64 %14
+ %19 = getelementptr inbounds i8, ptr %18, i32 0
+ %wide.load175 = load <vscale x 16 x i8>, ptr %19, align 1
+ %20 = sext <vscale x 16 x i8> %wide.load175 to <vscale x 16 x i32>
+ %21 = mul nsw <vscale x 16 x i32> %20, %17
+ %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi174, <vscale x 16 x i32> %21)
+ %22 = getelementptr inbounds i8, ptr %3, i64 %14
+ %23 = getelementptr inbounds i8, ptr %22, i32 0
+ %wide.load176 = load <vscale x 16 x i8>, ptr %23, align 1
+ %24 = sext <vscale x 16 x i8> %wide.load176 to <vscale x 16 x i32>
+ %25 = mul nsw <vscale x 16 x i32> %24, %20
+ %partial.reduce177 = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi173, <vscale x 16 x i32> %25)
+ %26 = getelementptr inbounds i8, ptr %5, i64 %14
+ %27 = getelementptr inbounds i8, ptr %26, i32 0
+ %wide.load178 = load <vscale x 16 x i8>, ptr %27, align 1
+ %28 = sext <vscale x 16 x i8> %wide.load178 to <vscale x 16 x i32>
+ %29 = mul nsw <vscale x 16 x i32> %28, %20
+ %partial.reduce179 = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi172, <vscale x 16 x i32> %29)
+ %30 = getelementptr inbounds i8, ptr %7, i64 %14
+ %31 = getelementptr inbounds i8, ptr %30, i32 0
+ %wide.load180 = load <vscale x 16 x i8>, ptr %31, align 1
+ %32 = sext <vscale x 16 x i8> %wide.load180 to <vscale x 16 x i32>
+ %33 = mul nsw <vscale x 16 x i32> %32, %20
+ %partial.reduce181 = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %33)
+ %index.next = add nuw i64 %index, %13
+ %34 = icmp eq i64 %index.next, %n.vec
+ br i1 %34, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %35 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce181)
+ %36 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce179)
+ %37 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce177)
+ %38 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce)
+ %cmp.n = icmp eq i64 %wide.trip.count, %n.vec
+ br i1 %cmp.n, label %for.cond10.for.cond.cleanup_crit_edge.us, label %scalar.ph
+
+scalar.ph: ; preds = %middle.block, %for.body.us
+ %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.us ]
+ %bc.merge.rdx = phi i32 [ %35, %middle.block ], [ 0, %for.body.us ]
+ %bc.merge.rdx182 = phi i32 [ %36, %middle.block ], [ 0, %for.body.us ]
+ %bc.merge.rdx183 = phi i32 [ %37, %middle.block ], [ 0, %for.body.us ]
+ %bc.merge.rdx184 = phi i32 [ %38, %middle.block ], [ 0, %for.body.us ]
+ br label %for.body12.us
+
+for.body12.us: ; preds = %scalar.ph, %for.body12.us
+ %indvars.iv161 = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next162, %for.body12.us ]
+ %total3.0149.us = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %add43.us, %for.body12.us ]
+ %total2.0148.us = phi i32 [ %bc.merge.rdx182, %scalar.ph ], [ %add35.us, %for.body12.us ]
+ %total1.0147.us = phi i32 [ %bc.merge.rdx183, %scalar.ph ], [ %add27.us, %for.body12.us ]
+ %total0.0146.us = phi i32 [ %bc.merge.rdx184, %scalar.ph ], [ %add19.us, %for.body12.us ]
+ %arrayidx14.us = getelementptr inbounds i8, ptr %1, i64 %indvars.iv161
+ %39 = load i8, ptr %arrayidx14.us, align 1
+ %conv.us = sext i8 %39 to i32
+ %arrayidx16.us = getelementptr inbounds i8, ptr %u, i64 %indvars.iv161
+ %40 = load i8, ptr %arrayidx16.us, align 1
+ %conv17.us = sext i8 %40 to i32
+ %mul18.us = mul nsw i32 %conv17.us, %conv.us
+ %add19.us = add nsw i32 %mul18.us, %total0.0146.us
+ %arrayidx21.us = getelementptr inbounds i8, ptr %3, i64 %indvars.iv161
+ %41 = load i8, ptr %arrayidx21.us, align 1
+ %conv22.us = sext i8 %41 to i32
+ %mul26.us = mul nsw i32 %conv22.us, %conv17.us
+ %add27.us = add nsw i32 %mul26.us, %total1.0147.us
+ %arrayidx29.us = getelementptr inbounds i8, ptr %5, i64 %indvars.iv161
+ %42 = load i8, ptr %arrayidx29.us, align 1
+ %conv30.us = sext i8 %42 to i32
+ %mul34.us = mul nsw i32 %conv30.us, %conv17.us
+ %add35.us = add nsw i32 %mul34.us, %total2.0148.us
+ %arrayidx37.us = getelementptr inbounds i8, ptr %7, i64 %indvars.iv161
+ %43 = load i8, ptr %arrayidx37.us, align 1
+ %conv38.us = sext i8 %43 to i32
+ %mul42.us = mul nsw i32 %conv38.us, %conv17.us
+ %add43.us = add nsw i32 %mul42.us, %total3.0149.us
+ %indvars.iv.next162 = add nuw nsw i64 %indvars.iv161, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next162, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond10.for.cond.cleanup_crit_edge.us, label %for.body12.us
+
+for.cond10.for.cond.cleanup_crit_edge.us: ; preds = %middle.block, %for.body12.us
+ %add19.us.lcssa = phi i32 [ %add19.us, %for.body12.us ], [ %38, %middle.block ]
+ %add27.us.lcssa = phi i32 [ %add27.us, %for.body12.us ], [ %37, %middle.block ]
+ %add35.us.lcssa = phi i32 [ %add35.us, %for.body12.us ], [ %36, %middle.block ]
+ %add43.us.lcssa = phi i32 [ %add43.us, %for.body12.us ], [ %35, %middle.block ]
+ %arrayidx45.us = getelementptr inbounds i8, ptr %1, i64 %idxprom44
+ %44 = load i8, ptr %arrayidx45.us, align 1
+ %conv46.us = sext i8 %44 to i32
+ %mul47.us = mul nsw i32 %conv46.us, 127
+ %add48.us = add nsw i32 %mul47.us, %add19.us.lcssa
+ %conv49.us = sitofp i32 %add48.us to float
+ %arrayidx52.us = getelementptr inbounds float, ptr %scales, i64 %indvars.iv164
+ %45 = load float, ptr %arrayidx52.us, align 4
+ %mul53.us = fmul float %45, %conv49.us
+ %arrayidx56.us = getelementptr inbounds float, ptr %v, i64 %indvars.iv164
+ store float %mul53.us, ptr %arrayidx56.us, align 4
+ %arrayidx58.us = getelementptr inbounds i8, ptr %3, i64 %idxprom44
+ %46 = load i8, ptr %arrayidx58.us, align 1
+ %conv59.us = sext i8 %46 to i32
+ %mul60.us = mul nsw i32 %conv59.us, 127
+ %add61.us = add nsw i32 %mul60.us, %add27.us.lcssa
+ %conv62.us = sitofp i32 %add61.us to float
+ %arrayidx65.us = getelementptr inbounds float, ptr %scales, i64 %2
+ %47 = load float, ptr %arrayidx65.us, align 4
+ %mul66.us = fmul float %47, %conv62.us
+ %arrayidx69.us = getelementptr inbounds float, ptr %v, i64 %2
+ store float %mul66.us, ptr %arrayidx69.us, align 4
+ %arrayidx71.us = getelementptr inbounds i8, ptr %5, i64 %idxprom44
+ %48 = load i8, ptr %arrayidx71.us, align 1
+ %conv72.us = sext i8 %48 to i32
+ %mul73.us = mul nsw i32 %conv72.us, 127
+ %add74.us = add nsw i32 %mul73.us, %add35.us.lcssa
+ %conv75.us = sitofp i32 %add74.us to float
+ %arrayidx78.us = getelementptr inbounds float, ptr %scales, i64 %4
+ %49 = load float, ptr %arrayidx78.us, align 4
+ %mul79.us = fmul float %49, %conv75.us
+ %arrayidx82.us = getelementptr inbounds float, ptr %v, i64 %4
+ store float %mul79.us, ptr %arrayidx82.us, align 4
+ %arrayidx84.us = getelementptr inbounds i8, ptr %7, i64 %idxprom44
+ %50 = load i8, ptr %arrayidx84.us, align 1
+ %conv85.us = sext i8 %50 to i32
+ %mul86.us = mul nsw i32 %conv85.us, 127
+ %add87.us = add nsw i32 %mul86.us, %add43.us.lcssa
+ %conv88.us = sitofp i32 %add87.us to float
+ %arrayidx91.us = getelementptr inbounds float, ptr %scales, i64 %6
+ %51 = load float, ptr %arrayidx91.us, align 4
+ %mul92.us = fmul float %51, %conv88.us
+ %arrayidx95.us = getelementptr inbounds float, ptr %v, i64 %6
+ store float %mul92.us, ptr %arrayidx95.us, align 4
+ %indvars.iv.next165 = add nuw nsw i64 %indvars.iv164, 4
+ %cmp.us = icmp ult i64 %indvars.iv.next165, %0
+ br i1 %cmp.us, label %for.body.us, label %for.end98
+
+for.end98: ; preds = %for.end98.loopexit171, %for.end98.loopexit, %entry
+ ret void
+}
+
+define i32 @not_dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @not_dotp_predicated(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[REM:%.*]] = srem i32 [[N]], 16
+; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[REM]], 0
+; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REM]] to i64
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP7]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK-NEXT: [[TMP8:%.*]] = sext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP10]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 8 x i32> [[TMP11]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 8 x i32> [[TMP12]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP14]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i32> [[TMP13]], <vscale x 8 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 8 x i1> [[TMP15]], i32 0
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP14]])
+; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+ %rem = srem i32 %N, 16
+ %cmp8 = icmp sgt i32 %rem, 0
+ br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %rem to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %total.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %total.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %total.09 = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sext i8 %0 to i32
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx2, align 1
+ %conv3 = sext i8 %1 to i32
+ %mul = mul nsw i32 %conv3, %conv
+ %add = add nsw i32 %mul, %total.09
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define i32 @not_dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP8_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP8:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP10]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 16 x i32> [[TMP11]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 16 x i32> [[TMP12]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP14]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP13]], <vscale x 16 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[TMP15]], i32 0
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+;
+entry:
+ %cmp8.not = icmp eq i32 %N, 0
+ br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %total.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %total.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %total.09 = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sext i8 %0 to i32
+ %arrayidx2 = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx2, align 1
+ %conv3 = sext i8 %1 to i32
+ %mul = mul nsw i32 %conv3, %conv
+ %add = add nsw i32 %mul, %total.09
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !7
+}
+
+!7 = distinct !{!7, !8, !9, !10}
+!8 = !{!"llvm.loop.mustprogress"}
+!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+!10 = !{!"llvm.loop.vectorize.enable", i1 true}
+attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
new file mode 100644
index 00000000000000..7fcb33b8584f33
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -0,0 +1,77 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+; Tests for printing VPlans that are only enabled for AArch64 targets.
+
+define void @print_partial_reduction(ptr %a, ptr %b) {
+; CHECK-LABEL: Checking a loop in 'print_partial_reduction'
+; CHECK: VPlan 'Initial VPlan for VF={2,4,8,16},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<0> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<%add>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%1> = load vp<%4>
+; CHECK-NEXT: WIDEN-CAST ir<%conv> = zext ir<%1> to i32
+; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%b>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<%5> = vector-pointer ir<%arrayidx2>
+; CHECK-NEXT: WIDEN ir<%2> = load vp<%5>
+; CHECK-NEXT: WIDEN-CAST ir<%conv3> = zext ir<%2> to i32
+; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%conv3>, ir<%conv>
+; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%mul>, ir<%acc.010>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%8> = compute-reduction-result ir<[[ACC]]>, ir<%add>
+; CHECK-NEXT: EMIT vp<%9> = extract-from-end vp<%8>, ir<1>
+; CHECK-NEXT: EMIT vp<%10> = icmp eq ir<0>, vp<%1>
+; CHECK-NEXT: EMIT branch-on-cond vp<%10>
+; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%9>)
+; CHECK-NEXT: IR %0 = lshr i32 %add.lcssa, 0
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %0 = lshr i32 %add, 0
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %acc.010 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr i8, ptr %a, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %1 to i32
+ %arrayidx2 = getelementptr i8, ptr %b, i64 %indvars.iv
+ %2 = load i8, ptr %arrayidx2, align 1
+ %conv3 = zext i8 %2 to i32
+ %mul = mul i32 %conv3, %conv
+ %add = add i32 %mul, %acc.010
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
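
For context (not part of the patch): the PARTIAL-REDUCE recipe printed above lowers to the llvm.experimental.vector.partial.reduce.add calls that appear throughout the tests in this series. Below is a minimal standalone sketch of the fixed-width form; the function name is illustrative only. As I understand the intrinsic's contract, only the overall sum of the accumulator and input elements is guaranteed, not which input elements land in which accumulator lane, which is why the middle blocks in the tests still finish with a full vector.reduce.add.

; Sixteen i32 products are folded into a four-lane i32 accumulator.
define i32 @partial_reduce_sketch(<4 x i32> %acc, <16 x i32> %wide) {
  %p = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %wide)
  ; A full add reduction of %p recovers the scalar dot-product value.
  %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %p)
  ret i32 %r
}

declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
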
>From 915c200ab5d3f345e4861dbd313529a04507db80 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 10 Oct 2024 17:14:27 +0100
Subject: [PATCH 2/4] Use a different recipe for each VF range
---
.../AArch64/AArch64TargetTransformInfo.h | 26 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 259 ++--
.../Transforms/Vectorize/VPRecipeBuilder.h | 10 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 12 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 1 +
.../Transforms/Vectorize/VPlanTransforms.h | 3 +
.../AArch64/partial-reduce-dot-product.ll | 1109 ++++++++---------
.../LoopVectorize/AArch64/vplan-printing.ll | 48 +-
9 files changed, 690 insertions(+), 780 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a2b6fbbd6bb824..25517f1ccf42ce 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/InstructionCost.h"
#include <cstdint>
#include <optional>
@@ -348,7 +349,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
TTI::PartialReductionExtendKind OpAExtend,
TTI::PartialReductionExtendKind OpBExtend,
std::optional<unsigned> BinOp) const {
+
InstructionCost Invalid = InstructionCost::getInvalid();
+ InstructionCost Cost(TTI::TCC_Basic);
if (Opcode != Instruction::Add)
return Invalid;
@@ -361,11 +364,28 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
if (VF.isFixed() && !ST->isNeonAvailable() && !ST->hasDotProd())
return Invalid;
+ // FIXME: There should be a nicer way of doing this.
if (InputEVT == MVT::i8) {
- if (AccumEVT != MVT::i32)
+ switch (VF.getKnownMinValue()) {
+ default:
return Invalid;
+ case 8:
+ if (AccumEVT == MVT::i32)
+ Cost *= 2;
+ else if (AccumEVT != MVT::i64)
+ return Invalid;
+ break;
+ case 16:
+ if (AccumEVT == MVT::i64)
+ Cost *= 2;
+ else if (AccumEVT != MVT::i32)
+ return Invalid;
+ break;
+ }
} else if (InputEVT == MVT::i16) {
- if (AccumEVT != MVT::i64)
+ // FIXME: Allow i32 accumulator but increase cost, as we would extend
+ // it to i64.
+ if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64)
return Invalid;
} else
return Invalid;
@@ -376,7 +396,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
if (!BinOp || (*BinOp) != Instruction::Mul)
return Invalid;
- return InstructionCost::getMin();
+ return Cost;
}
bool enableOrderedReductions() const { return true; }
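
For reference, a standalone sketch of the other element-type combination the updated hook accepts: i16 inputs with an i64 accumulator at VF 8. The function name, and the assumption that the accumulator is scaled down to <vscale x 2 x i64> in the same way the i8 tests scale nxv16i32 down to nxv4i32, are mine rather than taken from the patch; presumably this shape is what lets the target use its 16-bit-to-64-bit dot-product instructions.

; i16 elements are sign-extended to the i64 accumulator type, multiplied,
; and partially reduced into a quarter-lane-count accumulator.
define <vscale x 2 x i64> @partial_reduce_i16_i64(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mul = mul nsw <vscale x 8 x i64> %b.wide, %a.wide
  %red = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mul)
  ret <vscale x 2 x i64> %red
}

declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64>, <vscale x 8 x i64>)
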
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 523614b5d835b7..0e821219770efd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1556,100 +1556,6 @@ class LoopVectorizationCostModel {
return PairIt->second;
}
- void removePartialReduction(Instruction *Instr) {
- PartialReductionChains.erase(Instr);
- }
-
- void addPartialReductionIfSupported(Instruction *Instr, ElementCount VF) {
-
- // Try to commutatively match:
- // bin_op (one_use bin_op (z_or_sext, z_or_sext), phi)
-
- auto *Root = dyn_cast<BinaryOperator>(Instr);
- if (!Root)
- return;
-
- auto *BinOp = dyn_cast<BinaryOperator>(Root->getOperand(0));
- auto *Phi = dyn_cast<PHINode>(Root->getOperand(1));
- if (!BinOp) {
- BinOp = dyn_cast<BinaryOperator>(Root->getOperand(1));
- Phi = dyn_cast<PHINode>(Root->getOperand(0));
- }
- if (!BinOp || !BinOp->hasOneUse()) {
- LLVM_DEBUG(
- dbgs() << "Root was not a one-use binary operator, cannot create a "
- "partial reduction.\n");
- return;
- }
- if (!Phi) {
- LLVM_DEBUG(dbgs() << "Expected Phi node was not a phi, cannot create a "
- "partial reduction.\n");
- return;
- }
-
- auto IsSextOrZext = [](Instruction *I) {
- return I && (I->getOpcode() == Instruction::ZExt ||
- I->getOpcode() == Instruction::SExt);
- };
-
- auto *ExtA = dyn_cast<Instruction>(BinOp->getOperand(0));
- auto *ExtB = dyn_cast<Instruction>(BinOp->getOperand(1));
- if (!IsSextOrZext(ExtA) || !IsSextOrZext(ExtB)) {
- LLVM_DEBUG(dbgs() << "Expected extends were not extends, cannot create a "
- "partial reduction.\n");
- return;
- }
-
- Value *A = ExtA->getOperand(0);
- Value *B = ExtB->getOperand(0);
- // Check that the extends extend from the same type
- if (A->getType() != B->getType()) {
- LLVM_DEBUG(dbgs() << "Extends don't extend from the same type, cannot "
- "create a partial reduction.\n");
- return;
- }
-
- // Check that the extends extend to the same type
- if (ExtA->getType() != ExtB->getType()) {
- LLVM_DEBUG(
- dbgs() << "Extends don't extend to the same type, cannot create "
- "a partial reduction.\n");
- return;
- }
-
- // Check that the second phi value is the instruction we're looking at
- Instruction *MaybeAdd = dyn_cast<Instruction>(
- Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
- if (!MaybeAdd || MaybeAdd != Instr) {
- LLVM_DEBUG(dbgs() << "Second PHI value is not the root binop, cannot "
- "create a partial reduction.\n");
- return;
- }
-
- TTI::PartialReductionExtendKind OpAExtend =
- TargetTransformInfo::getPartialReductionExtendKind(ExtA);
- TTI::PartialReductionExtendKind OpBExtend =
- TargetTransformInfo::getPartialReductionExtendKind(ExtB);
- InstructionCost Cost = TTI.getPartialReductionCost(
- Instr->getOpcode(), A->getType(), Phi->getType(), VF, OpAExtend,
- OpBExtend, std::make_optional(BinOp->getOpcode()));
- if (Cost == InstructionCost::getInvalid())
- return;
-
- PartialReductionChain Chain;
- Chain.Reduction = Instr;
- Chain.BinOp = BinOp;
- Chain.ExtendA = ExtA;
- Chain.ExtendB = ExtB;
- Chain.Accumulator = Phi;
-
- unsigned InputSizeBits = A->getType()->getScalarSizeInBits();
- unsigned ResultSizeBits = Chain.Reduction->getType()->getScalarSizeInBits();
- Chain.ScaleFactor = ResultSizeBits / InputSizeBits;
-
- PartialReductionChains[Instr] = Chain;
- }
-
private:
PartialReductionList PartialReductionChains;
@@ -7071,18 +6977,6 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
-
- // Ignore any values that we know will be flattened
- for (auto It : getPartialReductionChains()) {
- PartialReductionChain Chain = It.second;
- SmallVector<Value *> PartialReductionValues{Chain.Reduction, Chain.BinOp,
- Chain.ExtendA, Chain.ExtendB,
- Chain.Accumulator};
- ValuesToIgnore.insert(PartialReductionValues.begin(),
- PartialReductionValues.end());
- VecValuesToIgnore.insert(PartialReductionValues.begin(),
- PartialReductionValues.end());
- }
}
void LoopVectorizationCostModel::collectInLoopReductions() {
@@ -7200,46 +7094,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
- for (auto ReductionVar : Legal->getReductionVars()) {
- auto *ReductionExitInstr = ReductionVar.second.getLoopExitInstr();
- CM.addPartialReductionIfSupported(ReductionExitInstr, UserVF);
- }
-
- // Wider-than-legal vector types (coming from extends in partial reductions)
- // should only be used by partial reductions so that they are lowered properly
-
- // Build up a set of partial reduction bin ops for efficient use checking
- SmallSet<Instruction *, 4> PartialReductionBinOps;
- for (auto It : CM.getPartialReductionChains()) {
- if (It.second.BinOp)
- PartialReductionBinOps.insert(It.second.BinOp);
- }
-
- auto ExtendIsOnlyUsedByPartialReductions =
- [PartialReductionBinOps](Instruction *Extend) {
- for (auto *Use : Extend->users()) {
- Instruction *UseInstr = dyn_cast<Instruction>(Use);
- if (!PartialReductionBinOps.contains(UseInstr))
- return false;
- }
- return true;
- };
-
- // Check if each use of a chain's two extends is a partial reduction
- // and remove those that have non-partial reduction users
- SmallSet<Instruction *, 4> PartialReductionsToRemove;
- for (auto It : CM.getPartialReductionChains()) {
- LoopVectorizationCostModel::PartialReductionChain Chain = It.second;
- if (!ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) ||
- !ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) {
- PartialReductionsToRemove.insert(Chain.Reduction);
- LLVM_DEBUG(dbgs() << "Removing the partial reduction for an instruction "
- "with an extend used by something other than a "
- "partial reduction "
- << *Chain.Reduction << "\n");
- }
- }
-
CM.collectValuesToIgnore();
CM.collectElementTypesForWidening();
@@ -7264,23 +7118,6 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
if (CM.foldTailByMasking())
Legal->prepareToFoldTailByMasking();
- for (auto Pair : CM.getPartialReductionChains()) {
- // TODO: Allow creating partial reductions when predicating. The select at
- // the end of the loop chooses between the phi value and most recent partial
- // reduction result, both of which have different VFs to the active lane
- // mask.
- Instruction *Instr = Pair.first;
- if (CM.blockNeedsPredicationForAnyReason(Instr->getParent())) {
- LLVM_DEBUG(dbgs() << "LV: Removing the partial reduction for an "
- "instruction in a predicated block: "
- << *Instr << "\n");
- PartialReductionsToRemove.insert(Instr);
- }
- }
-
- for (auto *Insn : PartialReductionsToRemove)
- CM.removePartialReduction(Insn);
-
ElementCount MaxUserVF =
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
if (UserVF) {
@@ -8800,16 +8637,74 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
return Recipe;
}
-unsigned getScaleFactorForReductionPhi(PHINode *Phi,
- LoopVectorizationCostModel &CM) {
- for (auto *User : Phi->users()) {
- if (auto *I = dyn_cast<Instruction>(User)) {
- if (auto Chain = CM.getInstructionsPartialReduction(I)) {
- return Chain->ScaleFactor;
- }
- }
- }
- return 1;
+/// Examines reduction operations to see if the target can use a cheaper
+/// operation with a wider per-iteration input VF and narrower PHI VF.
+/// Returns the ratio between the two VFs (1 by default).
+static unsigned getReductionScaleFactor(PHINode *PHI,
+ const RecurrenceDescriptor &Rdx,
+ const TargetTransformInfo *TTI,
+ VFRange &Range,
+ LoopVectorizationCostModel &CM) {
+ // FIXME: Should we move this to VPRecipeBuilder and cache the values needed
+ // for the TTI query?
+ unsigned DefaultScaleFactor = 1;
+
+ // TODO: Allow scaling reductions when predicating. The select at
+ // the end of the loop chooses between the phi value and most recent
+ // reduction result, both of which have different VFs to the active lane
+ // mask when scaling.
+ if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
+ return DefaultScaleFactor;
+
+ auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
+ if (!Update)
+ return DefaultScaleFactor;
+
+ Value *Op = Update->getOperand(0);
+ if (Op == PHI)
+ Op = Update->getOperand(1);
+
+ // Match dot product pattern
+ auto *BinOp = dyn_cast<BinaryOperator>(Op);
+ if (!BinOp || !BinOp->hasOneUse())
+ return DefaultScaleFactor;
+
+ auto IsSextOrZext = [](Instruction *I) {
+ return I && (I->getOpcode() == Instruction::ZExt ||
+ I->getOpcode() == Instruction::SExt);
+ };
+
+ auto *ExtA = dyn_cast<Instruction>(BinOp->getOperand(0));
+ auto *ExtB = dyn_cast<Instruction>(BinOp->getOperand(1));
+ if (!IsSextOrZext(ExtA) || !IsSextOrZext(ExtB))
+ return DefaultScaleFactor;
+
+ Value *A = ExtA->getOperand(0);
+ Value *B = ExtB->getOperand(0);
+ // Check that the extends extend from the same type
+ if (A->getType() != B->getType())
+ return DefaultScaleFactor;
+
+ unsigned TargetScaleFactor =
+ PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
+ A->getType()->getPrimitiveSizeInBits());
+
+ TTI::PartialReductionExtendKind OpAExtend =
+ TargetTransformInfo::getPartialReductionExtendKind(ExtA);
+ TTI::PartialReductionExtendKind OpBExtend =
+ TargetTransformInfo::getPartialReductionExtendKind(ExtB);
+
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) {
+ InstructionCost Cost = TTI->getPartialReductionCost(
+ Update->getOpcode(), A->getType(), PHI->getType(), VF,
+ OpAExtend, OpBExtend, std::make_optional(BinOp->getOpcode()));
+ return Cost.isValid();
+ },
+ Range))
+ return TargetScaleFactor;
+
+ return DefaultScaleFactor;
}
VPRecipeBase *
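
To make the scale factor computed by getReductionScaleFactor above concrete (a worked example of mine, not text from the patch): in the dotp tests of this series the reduction phi has type i32 and both operands are extended from i8, so

  TargetScaleFactor = bits(i32) / bits(i8) = 32 / 8 = 4

and a loop VF of 16 therefore pairs a <vscale x 16 x i32> multiply with a reduction phi of VF 16/4 = 4, i.e. the <vscale x 4 x i32> accumulator seen in the partial.reduce.add calls in the tests.
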
@@ -8838,7 +8733,11 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
// If the PHI is used by a partial reduction, set the scale factor
- unsigned ScaleFactor = getScaleFactorForReductionPhi(Phi, CM);
+ unsigned ScaleFactor =
+ getReductionScaleFactor(Phi, RdxDesc, TTI, Range, CM);
+ Instruction *ReductionInstr = RdxDesc.getLoopExitInstr();
+ if (ScaleFactor != 1)
+ Plan.addScaledReductionExitInstr(ReductionInstr);
PhiRecipe = new VPReductionPHIRecipe(
Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
CM.useOrderedReductions(RdxDesc), ScaleFactor);
@@ -8873,6 +8772,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);
+ if (Plan.isScaledReductionExitInstr(Instr))
+ return tryToCreatePartialReduction(Instr, Operands);
+
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8895,7 +8797,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
VPRecipeBase *
VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
- unsigned ScaleFactor,
ArrayRef<VPValue *> Operands) {
assert(Operands.size() == 2 &&
"Unexpected number of operands for partial reduction");
@@ -9213,7 +9114,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
bool HasNUW = Style == TailFoldingStyle::None;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
- VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
+ VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
+ Builder);
// ---------------------------------------------------------------------------
// Pre-construction: record ingredients whose recipes we'll need to further
@@ -9306,10 +9208,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPRecipeBase *Recipe = nullptr;
- if (auto Chain = CM.getInstructionsPartialReduction(Instr))
- Recipe = RecipeBuilder.tryToCreatePartialReduction(
- Chain->Reduction, Chain->ScaleFactor, Operands);
- else if (!Recipe)
+ if (!Recipe)
Recipe =
RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
if (!Recipe)
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 136766eb455fde..b93b8df8653ef2 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -22,6 +22,7 @@ class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class TargetLibraryInfo;
struct HistogramInfo;
+class TargetTransformInfo;
/// Helper class to create VPRecipies from IR instructions.
class VPRecipeBuilder {
@@ -34,6 +35,9 @@ class VPRecipeBuilder {
/// Target Library Info.
const TargetLibraryInfo *TLI;
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
/// The legality analysis.
LoopVectorizationLegality *Legal;
@@ -113,11 +117,12 @@ class VPRecipeBuilder {
public:
VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM,
PredicatedScalarEvolution &PSE, VPBuilder &Builder)
- : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM),
- PSE(PSE), Builder(Builder) {}
+ : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal),
+ CM(CM), PSE(PSE), Builder(Builder) {}
/// Create and return a widened recipe for \p I if one can be created within
/// the given VF \p Range.
@@ -126,7 +131,6 @@ class VPRecipeBuilder {
VFRange &Range, VPBasicBlock *VPBB);
VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
- unsigned ScaleFactor,
ArrayRef<VPValue *> Operands);
/// Set the recipe created for given ingredient.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index b06c90daa7d89b..ad4594ea918c59 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3639,6 +3639,10 @@ class VPlan {
/// been modeled in VPlan directly.
DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
+ /// Stores the set of reduction exit instructions that will be scaled to
+ /// a smaller VF in this plan via partial reductions.
+ SmallPtrSet<const Instruction *, 2> ScaledReductionExitInstrs;
+
public:
/// Construct a VPlan with original preheader \p Preheader, trip count \p TC
/// and \p Entry to the plan. At the moment, \p Preheader and \p Entry need to
@@ -3835,6 +3839,14 @@ class VPlan {
/// Clone the current VPlan, update all VPValues of the new VPlan and cloned
/// recipes to refer to the clones, and return it.
VPlan *duplicate();
+
+ /// Mark \p ExitInst as a reduction exit instruction whose VF will be scaled
+ /// down by a partial reduction in this plan.
+ void addScaledReductionExitInstr(const Instruction *ExitInst) {
+ ScaledReductionExitInstrs.insert(ExitInst);
+ }
+
+ /// Returns true if \p ExitInst is a reduction exit instruction that will be
+ /// scaled by a partial reduction in this plan.
+ bool isScaledReductionExitInstr(const Instruction *ExitInst) {
+ return ScaledReductionExitInstrs.contains(ExitInst);
+ }
};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 67dd0a64913951..6a05a8c9acf78f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3277,6 +3277,8 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
printAsOperand(O, SlotTracker);
O << " = phi ";
printOperands(O, SlotTracker);
+ if (VFScaleFactor != 1)
+ O << " (VF scaled by 1/" << VFScaleFactor << ")";
}
#endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 379bfc0a4394bf..4be1d26bf853e5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 3b792ee32dce6e..1dc425494858b7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -15,10 +15,12 @@
#include "VPlan.h"
#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
namespace llvm {
class InductionDescriptor;
+class RecurrenceDescriptor;
class Instruction;
class PHINode;
class ScalarEvolution;
@@ -26,6 +28,7 @@ class PredicatedScalarEvolution;
class TargetLibraryInfo;
class VPBuilder;
class VPRecipeBuilder;
+class TargetTransformInfo;
struct VPlanTransforms {
/// Replaces the VPInstructions in \p Plan with corresponding
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index bcac4d674123d8..9d7dda55a45c41 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVE1
-; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVED
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVE1
+; RUN: opt -passes=loop-vectorize -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INTERLEAVED
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
@@ -8,91 +8,151 @@ target triple = "aarch64-none-unknown-elf"
define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-LABEL: define i32 @dotp(
; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: iter.check:
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.main.loop.iter.check:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY1:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP23]], [[TMP19]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE1]])
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVE1: vec.epilog.iter.check:
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVE1: vec.epilog.ph:
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP28]], 4
; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVE1: vector.body:
-; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1: vec.epilog.vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP13]])
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul <vscale x 4 x i32> [[TMP25]], [[TMP22]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = add <vscale x 4 x i32> [[TMP26]], [[VEC_PHI3]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVE1: vec.epilog.middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[SCALAR_PH]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: iter.check:
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.main.loop.iter.check:
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY1:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP20]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP28]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul <16 x i32> [[TMP11]], [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul <16 x i32> [[TMP12]], [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP13]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP29]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX1:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX1]])
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVED: vec.epilog.iter.check:
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVED-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVED: vec.epilog.ph:
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP19]], 4
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVED: vector.body:
-; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED: vec.epilog.vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 4 x i32> [ [[TMP33]], [[VEC_EPILOG_PH]] ], [ [[BIN_RDX:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 16
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP15]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP18]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 16 x i32> [[TMP19]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 16 x i32> [[TMP20]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP21]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP22]])
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP15]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul <vscale x 4 x i32> [[TMP30]], [[TMP27]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX]] = add <vscale x 4 x i32> [[TMP31]], [[VEC_PHI7]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVED: vec.epilog.middle.block:
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[SCALAR_PH]]
;
entry:
br label %for.body
@@ -120,7 +180,12 @@ for.body: ; preds = %for.body, %entry
define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_different_types(
; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: iter.check:
+; CHECK-INTERLEAVE1-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP73]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVE1: vector.main.loop.iter.check:
; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -200,14 +265,89 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
-; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVE1: vec.epilog.iter.check:
+; CHECK-INTERLEAVE1-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP75]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: vec.epilog.ph:
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP76:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP77:%.*]] = mul i64 [[TMP76]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP77]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP78:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP79:%.*]] = mul i64 [[TMP78]], 4
+; CHECK-INTERLEAVE1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-INTERLEAVE1-NEXT: [[TMP80:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP81:%.*]] = add <vscale x 4 x i64> [[TMP80]], zeroinitializer
+; CHECK-INTERLEAVE1-NEXT: [[TMP82:%.*]] = mul <vscale x 4 x i64> [[TMP81]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-INTERLEAVE1-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT]], [[TMP82]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP83:%.*]] = mul i64 1, [[TMP79]]
+; CHECK-INTERLEAVE1-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP83]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-INTERLEAVE1-NEXT: [[TMP84:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-INTERLEAVE1-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vec.epilog.vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 4 x i32> [ [[TMP84]], [[SCALAR_PH]] ], [ [[TMP92:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP85:%.*]] = add i64 [[INDEX2]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP86:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP85]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP87:%.*]] = getelementptr i8, ptr [[TMP86]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP87]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP88:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD6]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP89:%.*]] = getelementptr i8, ptr [[B]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP89]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP90:%.*]] = zext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP91:%.*]] = mul <vscale x 4 x i32> [[TMP90]], [[TMP88]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP92]] = add <vscale x 4 x i32> [[TMP91]], [[VEC_PHI5]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX2]], [[TMP79]]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT4]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP93:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP93]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVE1: vec.epilog.middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP94:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP92]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-INTERLEAVE1: vec.epilog.scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX8:%.*]] = phi i32 [ [[TMP94]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.cond.cleanup.loopexit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ], [ [[TMP94]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP95:%.*]] = lshr i32 [[ADD_LCSSA]], 0
+; CHECK-INTERLEAVE1-NEXT: ret void
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX8]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
;
; CHECK-INTERLEAVED-LABEL: define void @not_dotp_different_types(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: iter.check:
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 4
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP38]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-INTERLEAVED: vector.main.loop.iter.check:
; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -254,54 +394,54 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
-; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
-; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
-; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
-; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
-; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
-; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]]
-; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]]
-; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]]
-; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]]
-; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]]
-; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
-; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
-; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
-; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
-; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
-; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP37]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP38]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP39]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP40]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP41]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP42]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP43]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP44]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP45]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP46]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP47]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP48]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP49]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]]
+; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
+; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
+; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
+; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
+; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
+; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP39]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP40]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP41]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP42]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP43]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP44]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP45]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP46]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP47]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP48]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP49]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP50]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP51]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP52]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP53]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP54]], align 2
; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
@@ -318,22 +458,22 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
-; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP56]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP57]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP58]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP59]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP60]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP61]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP62]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP63]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP64]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP65]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP55]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP56]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP57]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP58]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP59]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP60]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP61]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP62]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP63]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP64]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP65]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP66]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP67]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP68]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP141]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP142]], align 2
; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
@@ -358,11 +498,11 @@ define void @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
;
entry:
br label %for.body
@@ -392,88 +532,88 @@ define void @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1)
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 16 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> [[VECTOR_RECUR]], <vscale x 16 x i32> [[TMP16]], i32 -1)
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 16 x i32> [[TMP16]], [[TMP17]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
;
; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_loop_carried(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1)
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 16 x i32> [[TMP22]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25]] = mul <vscale x 16 x i32> [[TMP23]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> [[TMP24]], <vscale x 16 x i32> [[TMP25]], i32 -1)
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 16 x i32> [[TMP25]], [[TMP26]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
;
entry:
br label %for.body
@@ -503,81 +643,81 @@ define void @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul <vscale x 16 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add <vscale x 16 x i32> [[TMP16]], [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
;
; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_phi(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 0, i32 [[TMP8]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 16 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP20]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 8 x i32> [[TMP22]], [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 16 x i32> [[TMP21]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 16 x i32> [[TMP22]], [[TMP21]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
;
entry:
br label %for.body
@@ -603,370 +743,165 @@ for.body: ; preds = %for.body, %entry
}
define void @dotp_unrolled(i32 %num_out, i32 %num_in, ptr %w, ptr %scales, ptr %u, ptr %v) #0 {
-; CHECK-INTERLEAVE1-LABEL: define void @dotp_unrolled(
-; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[W:%.*]], ptr [[SCALES:%.*]], ptr [[U:%.*]], ptr [[V:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVE1-NEXT: entry:
-; CHECK-INTERLEAVE1-NEXT: [[CMP154:%.*]] = icmp sgt i32 [[NUM_OUT]], 3
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP154]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END98:%.*]]
-; CHECK-INTERLEAVE1: for.body.lr.ph:
-; CHECK-INTERLEAVE1-NEXT: [[DIV:%.*]] = sdiv i32 [[NUM_OUT]], 4
-; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = shl nsw i32 [[DIV]], 2
-; CHECK-INTERLEAVE1-NEXT: [[CMP11145:%.*]] = icmp sgt i32 [[NUM_IN]], 0
-; CHECK-INTERLEAVE1-NEXT: [[IDXPROM44:%.*]] = sext i32 [[NUM_IN]] to i64
-; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP11145]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK-INTERLEAVE1: for.body.preheader:
-; CHECK-INTERLEAVE1-NEXT: br label [[FOR_END98]]
-; CHECK-INTERLEAVE1: for.body.us.preheader:
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
-; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY_US:%.*]]
-; CHECK-INTERLEAVE1: for.body.us:
-; CHECK-INTERLEAVE1-NEXT: [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT165:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
-; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[INDVARS_IV164]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = or disjoint i64 [[INDVARS_IV164]], 1
-; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX3_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP2]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3_US]], align 8
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDVARS_IV164]], 2
-; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP4]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX6_US]], align 8
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[INDVARS_IV164]], 3
-; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP6]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX9_US]], align 8
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
-; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVE1: vector.body:
-; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE181:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI172:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE179:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI173:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE177:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI174:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD175:%.*]] = load <vscale x 16 x i8>, ptr [[TMP19]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD175]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP17]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI174]], <vscale x 16 x i32> [[TMP21]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD176:%.*]] = load <vscale x 16 x i8>, ptr [[TMP23]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD176]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP20]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE177]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI173]], <vscale x 16 x i32> [[TMP25]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD178:%.*]] = load <vscale x 16 x i8>, ptr [[TMP27]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD178]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP28]], [[TMP20]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE179]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI172]], <vscale x 16 x i32> [[TMP29]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD180:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD180]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP32]], [[TMP20]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE181]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP33]])
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
-; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE177_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE177]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE179_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE179]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE181_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE181]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE181_LCSSA]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE179_LCSSA]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE177_LCSSA]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_LCSSA]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[SCALAR_PH]]
-; CHECK-INTERLEAVE1: scalar.ph:
-; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX182:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX183:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX184:%.*]] = phi i32 [ [[TMP38]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[BC_RESUME_VAL]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 16
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP39]], [[TMP41]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK7]], label [[SCALAR_PH6:%.*]], label [[VECTOR_PH8:%.*]]
-; CHECK-INTERLEAVE1: vector.ph8:
-; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[TMP39]], [[TMP43]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC10:%.*]] = sub i64 [[TMP39]], [[N_MOD_VF9]]
-; CHECK-INTERLEAVE1-NEXT: [[IND_END:%.*]] = add i64 [[BC_RESUME_VAL]], [[N_VEC10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 16
-; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX182]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX183]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX184]], i32 0
-; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY12:%.*]]
-; CHECK-INTERLEAVE1: vector.body12:
-; CHECK-INTERLEAVE1-NEXT: [[INDEX13:%.*]] = phi i64 [ 0, [[VECTOR_PH8]] ], [ [[INDEX_NEXT27:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI14:%.*]] = phi <vscale x 4 x i32> [ [[TMP46]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE26:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI15:%.*]] = phi <vscale x 4 x i32> [ [[TMP47]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE24:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI16:%.*]] = phi <vscale x 4 x i32> [ [[TMP48]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE22:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI17:%.*]] = phi <vscale x 4 x i32> [ [[TMP49]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE20:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[BC_RESUME_VAL]], [[INDEX13]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP50]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP51]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 16 x i8>, ptr [[TMP52]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD18]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP50]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD19:%.*]] = load <vscale x 16 x i8>, ptr [[TMP55]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD19]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = mul nsw <vscale x 16 x i32> [[TMP56]], [[TMP53]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE20]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI17]], <vscale x 16 x i32> [[TMP57]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP50]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD21:%.*]] = load <vscale x 16 x i8>, ptr [[TMP59]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD21]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = mul nsw <vscale x 16 x i32> [[TMP60]], [[TMP56]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE22]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI16]], <vscale x 16 x i32> [[TMP61]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP50]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[TMP62]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD23:%.*]] = load <vscale x 16 x i8>, ptr [[TMP63]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD23]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = mul nsw <vscale x 16 x i32> [[TMP64]], [[TMP56]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE24]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI15]], <vscale x 16 x i32> [[TMP65]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP50]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr [[TMP66]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD25:%.*]] = load <vscale x 16 x i8>, ptr [[TMP67]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD25]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP69:%.*]] = mul nsw <vscale x 16 x i32> [[TMP68]], [[TMP56]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE26]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI14]], <vscale x 16 x i32> [[TMP69]])
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT27]] = add nuw i64 [[INDEX13]], [[TMP45]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC10]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY12]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-INTERLEAVE1: middle.block5:
-; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE26]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE24]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE22]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP74:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE20]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N28:%.*]] = icmp eq i64 [[TMP39]], [[N_VEC10]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N28]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US_LOOPEXIT:%.*]], label [[SCALAR_PH6]]
-;
-; CHECK-INTERLEAVED-LABEL: define void @dotp_unrolled(
-; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[W:%.*]], ptr [[SCALES:%.*]], ptr [[U:%.*]], ptr [[V:%.*]]) #[[ATTR0]] {
-; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[CMP154:%.*]] = icmp sgt i32 [[NUM_OUT]], 3
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP154]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END98:%.*]]
-; CHECK-INTERLEAVED: for.body.lr.ph:
-; CHECK-INTERLEAVED-NEXT: [[DIV:%.*]] = sdiv i32 [[NUM_OUT]], 4
-; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = shl nsw i32 [[DIV]], 2
-; CHECK-INTERLEAVED-NEXT: [[CMP11145:%.*]] = icmp sgt i32 [[NUM_IN]], 0
-; CHECK-INTERLEAVED-NEXT: [[IDXPROM44:%.*]] = sext i32 [[NUM_IN]] to i64
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP11145]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK-INTERLEAVED: for.body.preheader:
-; CHECK-INTERLEAVED-NEXT: br label [[FOR_END98]]
-; CHECK-INTERLEAVED: for.body.us.preheader:
-; CHECK-INTERLEAVED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
-; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY_US:%.*]]
-; CHECK-INTERLEAVED: for.body.us:
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT165:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[INDVARS_IV164]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = or disjoint i64 [[INDVARS_IV164]], 1
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX3_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3_US]], align 8
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDVARS_IV164]], 2
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP4]]
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX6_US]], align 8
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[INDVARS_IV164]], 3
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX9_US]], align 8
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP9]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
-; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-INTERLEAVED: vector.body:
-; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE181:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI172:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE179:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI173:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE177:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI174:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD175:%.*]] = load <vscale x 16 x i8>, ptr [[TMP19]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD175]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI174]], <vscale x 16 x i32> [[TMP21]])
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD176:%.*]] = load <vscale x 16 x i8>, ptr [[TMP23]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD176]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE177]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI173]], <vscale x 16 x i32> [[TMP25]])
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD178:%.*]] = load <vscale x 16 x i8>, ptr [[TMP27]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD178]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP28]], [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE179]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI172]], <vscale x 16 x i32> [[TMP29]])
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD180:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD180]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP32]], [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE181]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP33]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
-; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE177_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE177]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE179_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE179]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE181_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE181]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE181_LCSSA]])
-; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE179_LCSSA]])
-; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE177_LCSSA]])
-; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_LCSSA]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[SCALAR_PH]]
-; CHECK-INTERLEAVED: scalar.ph:
-; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX182:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX183:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX184:%.*]] = phi i32 [ [[TMP38]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[BC_RESUME_VAL]]
-; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 32
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP39]], [[TMP41]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK7]], label [[SCALAR_PH6:%.*]], label [[VECTOR_PH8:%.*]]
-; CHECK-INTERLEAVED: vector.ph8:
-; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 32
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[TMP39]], [[TMP43]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC10:%.*]] = sub i64 [[TMP39]], [[N_MOD_VF9]]
-; CHECK-INTERLEAVED-NEXT: [[IND_END:%.*]] = add i64 [[BC_RESUME_VAL]], [[N_VEC10]]
-; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 32
-; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX182]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX183]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX184]], i32 0
-; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY12:%.*]]
-; CHECK-INTERLEAVED: vector.body12:
-; CHECK-INTERLEAVED-NEXT: [[INDEX13:%.*]] = phi i64 [ 0, [[VECTOR_PH8]] ], [ [[INDEX_NEXT40:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI14:%.*]] = phi <vscale x 4 x i32> [ [[TMP46]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE38:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI15:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE39:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI16:%.*]] = phi <vscale x 4 x i32> [ [[TMP47]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE34:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI17:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE35:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI18:%.*]] = phi <vscale x 4 x i32> [ [[TMP48]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE30:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI19:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE31:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI20:%.*]] = phi <vscale x 4 x i32> [ [[TMP49]], [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE26:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI21:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH8]] ], [ [[PARTIAL_REDUCE27:%.*]], [[VECTOR_BODY12]] ]
-; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[BC_RESUME_VAL]], [[INDEX13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP50]]
-; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP51]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 16
-; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP51]], i64 [[TMP54]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD22:%.*]] = load <vscale x 16 x i8>, ptr [[TMP52]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD23:%.*]] = load <vscale x 16 x i8>, ptr [[TMP55]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD22]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD23]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP50]]
-; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 16
-; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i64 [[TMP61]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <vscale x 16 x i8>, ptr [[TMP59]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <vscale x 16 x i8>, ptr [[TMP62]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD24]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD25]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = mul nsw <vscale x 16 x i32> [[TMP63]], [[TMP56]]
-; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = mul nsw <vscale x 16 x i32> [[TMP64]], [[TMP57]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE26]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI20]], <vscale x 16 x i32> [[TMP65]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE27]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI21]], <vscale x 16 x i32> [[TMP66]])
-; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP50]]
-; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[TMP67]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16
-; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = getelementptr inbounds i8, ptr [[TMP67]], i64 [[TMP70]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD28:%.*]] = load <vscale x 16 x i8>, ptr [[TMP68]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD29:%.*]] = load <vscale x 16 x i8>, ptr [[TMP71]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD28]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD29]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = mul nsw <vscale x 16 x i32> [[TMP72]], [[TMP63]]
-; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = mul nsw <vscale x 16 x i32> [[TMP73]], [[TMP64]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE30]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI18]], <vscale x 16 x i32> [[TMP74]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE31]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI19]], <vscale x 16 x i32> [[TMP75]])
-; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP50]]
-; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = getelementptr inbounds i8, ptr [[TMP76]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = mul i64 [[TMP78]], 16
-; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = getelementptr inbounds i8, ptr [[TMP76]], i64 [[TMP79]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD32:%.*]] = load <vscale x 16 x i8>, ptr [[TMP77]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD33:%.*]] = load <vscale x 16 x i8>, ptr [[TMP80]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD32]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD33]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = mul nsw <vscale x 16 x i32> [[TMP81]], [[TMP63]]
-; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = mul nsw <vscale x 16 x i32> [[TMP82]], [[TMP64]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE34]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI16]], <vscale x 16 x i32> [[TMP83]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE35]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI17]], <vscale x 16 x i32> [[TMP84]])
-; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP50]]
-; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = getelementptr inbounds i8, ptr [[TMP85]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 16
-; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = getelementptr inbounds i8, ptr [[TMP85]], i64 [[TMP88]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD36:%.*]] = load <vscale x 16 x i8>, ptr [[TMP86]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD37:%.*]] = load <vscale x 16 x i8>, ptr [[TMP89]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD36]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD37]] to <vscale x 16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = mul nsw <vscale x 16 x i32> [[TMP90]], [[TMP63]]
-; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = mul nsw <vscale x 16 x i32> [[TMP91]], [[TMP64]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE38]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI14]], <vscale x 16 x i32> [[TMP92]])
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE39]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI15]], <vscale x 16 x i32> [[TMP93]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT40]] = add nuw i64 [[INDEX13]], [[TMP45]]
-; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = icmp eq i64 [[INDEX_NEXT40]], [[N_VEC10]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP94]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY12]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-INTERLEAVED: middle.block5:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE39]], [[PARTIAL_REDUCE38]]
-; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX41:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE35]], [[PARTIAL_REDUCE34]]
-; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX41]])
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX42:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE31]], [[PARTIAL_REDUCE30]]
-; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX42]])
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX43:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE27]], [[PARTIAL_REDUCE26]]
-; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX43]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N44:%.*]] = icmp eq i64 [[TMP39]], [[N_VEC10]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N44]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US_LOOPEXIT:%.*]], label [[SCALAR_PH6]]
+; CHECK-LABEL: define void @dotp_unrolled(
+; CHECK-SAME: i32 [[NUM_OUT:%.*]], i32 [[NUM_IN:%.*]], ptr [[W:%.*]], ptr [[SCALES:%.*]], ptr [[U:%.*]], ptr [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP154:%.*]] = icmp sgt i32 [[NUM_OUT]], 3
+; CHECK-NEXT: br i1 [[CMP154]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END98:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[NUM_OUT]], 4
+; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[DIV]], 2
+; CHECK-NEXT: [[CMP11145:%.*]] = icmp sgt i32 [[NUM_IN]], 0
+; CHECK-NEXT: [[IDXPROM44:%.*]] = sext i32 [[NUM_IN]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
+; CHECK-NEXT: br i1 [[CMP11145]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br label [[FOR_END98]]
+; CHECK: for.body.us.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[NUM_IN]] to i64
+; CHECK-NEXT: br label [[FOR_BODY_US:%.*]]
+; CHECK: for.body.us:
+; CHECK-NEXT: [[INDVARS_IV164:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT165:%.*]], [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[INDVARS_IV164]]
+; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i64 [[INDVARS_IV164]], 1
+; CHECK-NEXT: [[ARRAYIDX3_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3_US]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDVARS_IV164]], 2
+; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX6_US]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i64 [[INDVARS_IV164]], 3
+; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds ptr, ptr [[W]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX9_US]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP9]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[ITER_CHECK:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP11]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE181:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI172:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE179:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI173:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE177:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI174:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
+; CHECK-NEXT: [[TMP17:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD175:%.*]] = load <vscale x 16 x i8>, ptr [[TMP19]], align 1
+; CHECK-NEXT: [[TMP20:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD175]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP21:%.*]] = mul nsw <vscale x 16 x i32> [[TMP20]], [[TMP17]]
+; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI174]], <vscale x 16 x i32> [[TMP21]])
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD176:%.*]] = load <vscale x 16 x i8>, ptr [[TMP23]], align 1
+; CHECK-NEXT: [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD176]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP25:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP20]]
+; CHECK-NEXT: [[PARTIAL_REDUCE177]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI173]], <vscale x 16 x i32> [[TMP25]])
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD178:%.*]] = load <vscale x 16 x i8>, ptr [[TMP27]], align 1
+; CHECK-NEXT: [[TMP28:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD178]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP29:%.*]] = mul nsw <vscale x 16 x i32> [[TMP28]], [[TMP20]]
+; CHECK-NEXT: [[PARTIAL_REDUCE179]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI172]], <vscale x 16 x i32> [[TMP29]])
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD180:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD180]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP32]], [[TMP20]]
+; CHECK-NEXT: [[PARTIAL_REDUCE181]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP33]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[PARTIAL_REDUCE_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[PARTIAL_REDUCE177_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE177]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[PARTIAL_REDUCE179_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE179]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[PARTIAL_REDUCE181_LCSSA:%.*]] = phi <vscale x 4 x i32> [ [[PARTIAL_REDUCE181]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE181_LCSSA]])
+; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE179_LCSSA]])
+; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE177_LCSSA]])
+; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_LCSSA]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US]], label [[ITER_CHECK]]
+; CHECK: iter.check:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX182:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX183:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX184:%.*]] = phi i32 [ [[TMP38]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-NEXT: [[TMP39:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[BC_RESUME_VAL]]
+; CHECK-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP39]], [[TMP41]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK9:%.*]] = icmp ult i64 [[TMP39]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK9]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH10:%.*]]
+; CHECK: vector.ph10:
+; CHECK-NEXT: [[N_MOD_VF11:%.*]] = urem i64 [[TMP39]], 16
+; CHECK-NEXT: [[N_VEC12:%.*]] = sub i64 [[TMP39]], [[N_MOD_VF11]]
+; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX182]], i32 0
+; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX183]], i32 0
+; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX184]], i32 0
+; CHECK-NEXT: br label [[VECTOR_BODY13:%.*]]
+; CHECK: vector.body13:
+; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ 0, [[VECTOR_PH10]] ], [ [[INDEX_NEXT28:%.*]], [[VECTOR_BODY13]] ]
+; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi <4 x i32> [ [[TMP42]], [[VECTOR_PH10]] ], [ [[PARTIAL_REDUCE27:%.*]], [[VECTOR_BODY13]] ]
+; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <4 x i32> [ [[TMP43]], [[VECTOR_PH10]] ], [ [[PARTIAL_REDUCE25:%.*]], [[VECTOR_BODY13]] ]
+; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <4 x i32> [ [[TMP44]], [[VECTOR_PH10]] ], [ [[PARTIAL_REDUCE23:%.*]], [[VECTOR_BODY13]] ]
+; CHECK-NEXT: [[VEC_PHI18:%.*]] = phi <4 x i32> [ [[TMP45]], [[VECTOR_PH10]] ], [ [[PARTIAL_REDUCE21:%.*]], [[VECTOR_BODY13]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[BC_RESUME_VAL]], [[INDEX14]]
+; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP46]]
+; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[TMP47]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP48]], align 1
+; CHECK-NEXT: [[TMP49:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
+; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 [[TMP46]]
+; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP50]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP51]], align 1
+; CHECK-NEXT: [[TMP52:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
+; CHECK-NEXT: [[TMP53:%.*]] = mul nsw <16 x i32> [[TMP52]], [[TMP49]]
+; CHECK-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI18]], <16 x i32> [[TMP53]])
+; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP46]]
+; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP54]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i8>, ptr [[TMP55]], align 1
+; CHECK-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD22]] to <16 x i32>
+; CHECK-NEXT: [[TMP57:%.*]] = mul nsw <16 x i32> [[TMP56]], [[TMP52]]
+; CHECK-NEXT: [[PARTIAL_REDUCE23]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI17]], <16 x i32> [[TMP57]])
+; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP46]]
+; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP58]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP59]], align 1
+; CHECK-NEXT: [[TMP60:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
+; CHECK-NEXT: [[TMP61:%.*]] = mul nsw <16 x i32> [[TMP60]], [[TMP52]]
+; CHECK-NEXT: [[PARTIAL_REDUCE25]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI16]], <16 x i32> [[TMP61]])
+; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP46]]
+; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[TMP62]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP63]], align 1
+; CHECK-NEXT: [[TMP64:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
+; CHECK-NEXT: [[TMP65:%.*]] = mul nsw <16 x i32> [[TMP64]], [[TMP52]]
+; CHECK-NEXT: [[PARTIAL_REDUCE27]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI15]], <16 x i32> [[TMP65]])
+; CHECK-NEXT: [[INDEX_NEXT28]] = add nuw i64 [[INDEX14]], 16
+; CHECK-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC12]]
+; CHECK-NEXT: br i1 [[TMP66]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY13]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: middle.block5:
+; CHECK-NEXT: [[TMP67:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE27]])
+; CHECK-NEXT: [[TMP68:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE25]])
+; CHECK-NEXT: [[TMP69:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE23]])
+; CHECK-NEXT: [[TMP70:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
+; CHECK-NEXT: [[CMP_N29:%.*]] = icmp eq i64 [[TMP39]], [[N_VEC12]]
+; CHECK-NEXT: br i1 [[CMP_N29]], label [[FOR_COND10_FOR_COND_CLEANUP_CRIT_EDGE_US_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
;
entry:
%cmp154 = icmp sgt i32 %num_out, 3
@@ -1161,7 +1096,7 @@ for.end98: ; preds = %for.end98.loopexit1
define i32 @not_dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
; CHECK-LABEL: define i32 @not_dotp_predicated(
-; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[REM:%.*]] = srem i32 [[N]], 16
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[REM]], 0
@@ -1170,39 +1105,34 @@ define i32 @not_dotp_predicated(i32 %N, ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REM]] to i64
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 15
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP7]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; CHECK-NEXT: [[TMP8:%.*]] = sext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP10]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 8 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 8 x i32> [[TMP12]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP14]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i32> [[TMP13]], <vscale x 8 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 8 x i1> [[TMP15]], i32 0
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP10]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT: [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[TMP17]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP15]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP8]], <16 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP18:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP18]], i32 0
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP14]])
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
;
entry:
@@ -1248,39 +1178,34 @@ define i32 @not_dotp_predicated_pragma(i32 %N, ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 15
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
-; CHECK-NEXT: [[TMP8:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP10]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
-; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 16 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 16 x i32> [[TMP12]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP14]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP13]], <vscale x 16 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[TMP15]], i32 0
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP10]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT: [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[TMP17]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP15]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP8]], <16 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP18:%.*]] = xor <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP18]], i32 0
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 7fcb33b8584f33..48eaa2b280624a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -8,7 +8,7 @@ target triple = "aarch64-none-unknown-elf"
define void @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-LABEL: Checking a loop in 'print_partial_reduction'
-; CHECK: VPlan 'Initial VPlan for VF={2,4,8,16},UF>=1' {
+; CHECK: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: Live-in ir<0> = original trip-count
@@ -30,7 +30,51 @@ define void @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: WIDEN ir<%2> = load vp<%5>
; CHECK-NEXT: WIDEN-CAST ir<%conv3> = zext ir<%2> to i32
; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%conv3>, ir<%conv>
-; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%mul>, ir<%acc.010>
+; CHECK-NEXT: WIDEN ir<%add> = add ir<%mul>, ir<[[ACC]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%8> = compute-reduction-result ir<[[ACC]]>, ir<%add>
+; CHECK-NEXT: EMIT vp<%9> = extract-from-end vp<%8>, ir<1>
+; CHECK-NEXT: EMIT vp<%10> = icmp eq ir<0>, vp<%1>
+; CHECK-NEXT: EMIT branch-on-cond vp<%10>
+; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%9>)
+; CHECK-NEXT: IR %0 = lshr i32 %add.lcssa, 0
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<0> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<%add> (VF scaled by 1/4)
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%1> = load vp<%4>
+; CHECK-NEXT: WIDEN-CAST ir<%conv> = zext ir<%1> to i32
+; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%b>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<%5> = vector-pointer ir<%arrayidx2>
+; CHECK-NEXT: WIDEN ir<%2> = load vp<%5>
+; CHECK-NEXT: WIDEN-CAST ir<%conv3> = zext ir<%2> to i32
+; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%conv3>, ir<%conv>
+; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%mul>, ir<[[ACC]]>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
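
The PARTIAL-REDUCE recipe printed above lowers to the @llvm.experimental.vector.partial.reduce.add calls seen in the dot-product tests, which fold a wide input vector into a narrower accumulator (e.g. <vscale x 16 x i32> into <vscale x 4 x i32>). As a minimal scalar sketch of that semantics — fixed sizes and a consecutive lane grouping chosen purely for illustration, since the intrinsic leaves the exact grouping of input lanes to the target and only the fully reduced sum should be relied upon:

#include <array>
#include <cstdint>

// Scalar model of partial.reduce.add.v4i32.v16i32: 16 input elements are
// folded into the 4 accumulator lanes (scale factor 4). The grouping below
// is one valid choice among many.
std::array<std::int32_t, 4>
partialReduceAdd(std::array<std::int32_t, 4> Acc,
                 const std::array<std::int32_t, 16> &In) {
  for (unsigned I = 0; I < 16; ++I)
    Acc[I / 4] += In[I];
  return Acc;
}
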
>From 3ac2b53e760bd3d581a7341415fcb5c874fe688a Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Fri, 11 Oct 2024 15:57:19 +0100
Subject: [PATCH 3/4] Get recipe-based version working with multiple-use
extends
---
.../Transforms/Vectorize/LoopVectorize.cpp | 116 +++++++-----------
llvm/lib/Transforms/Vectorize/VPlan.h | 70 ++++++++++-
2 files changed, 110 insertions(+), 76 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0e821219770efd..8a69a1cf061337 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1520,45 +1520,7 @@ class LoopVectorizationCostModel {
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
TTI::TargetCostKind CostKind) const;
- /// A chain of instructions that form a partial reduction.
- /// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))),
- /// accumulator)
- struct PartialReductionChain {
- /// The top-level binary operation that forms the reduction to a scalar
- /// after the loop body
- Instruction *Reduction;
- /// The inner binary operation that forms the reduction to a vector value
- /// within the loop body
- Instruction *BinOp;
- /// The extension of each of the inner binary operation's operands
- Instruction *ExtendA;
- Instruction *ExtendB;
-
- /// The accumulator that is reduced to a scalar after the loop body
- Value *Accumulator;
-
- /// The scaling factor between the size of the reduction type and the
- /// (possibly extended) inputs
- unsigned ScaleFactor;
- };
-
- using PartialReductionList = DenseMap<Instruction *, PartialReductionChain>;
-
- PartialReductionList getPartialReductionChains() {
- return PartialReductionChains;
- }
-
- std::optional<PartialReductionChain>
- getInstructionsPartialReduction(Instruction *I) const {
- auto PairIt = PartialReductionChains.find(I);
- if (PairIt == PartialReductionChains.end())
- return std::nullopt;
- return PairIt->second;
- }
-
private:
- PartialReductionList PartialReductionChains;
-
unsigned NumPredStores = 0;
/// \return An upper bound for the vectorization factors for both
@@ -4652,11 +4614,6 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
return false;
}
- // Prevent epilogue vectorization if a partial reduction is involved
- // TODO Is there a cleaner way to check this?
- if (CM.getPartialReductionChains().size() > 0)
- return false;
-
// Epilogue vectorization code has not been audited to ensure it handles
// non-latch exits properly. It may be fine, but it needs to be audited and
// tested.
@@ -7093,7 +7050,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
-
CM.collectValuesToIgnore();
CM.collectElementTypesForWidening();
@@ -8639,26 +8595,24 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
/// Examines reduction operations to see if the target can use a cheaper
/// operation with a wider per-iteration input VF and narrower PHI VF.
-/// Returns the ratio between the two VFs (1 by default).
-static unsigned getReductionScaleFactor(PHINode *PHI,
- const RecurrenceDescriptor &Rdx,
- const TargetTransformInfo *TTI,
- VFRange &Range,
- LoopVectorizationCostModel &CM) {
+/// Returns a struct containing the ratio between the two VFs and other cached
+/// information, or nullopt if no scaled reduction was found.
+static std::optional<PartialReductionChain>
+getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx,
+ const TargetTransformInfo *TTI, VFRange &Range,
+ LoopVectorizationCostModel &CM) {
// FIXME: Should we move this to VPRecipeBuilder and cache the values needed
// for the TTI query?
- unsigned DefaultScaleFactor = 1;
-
// TODO: Allow scaling reductions when predicating. The select at
// the end of the loop chooses between the phi value and most recent
// reduction result, both of which have different VFs to the active lane
// mask when scaling.
if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
- return DefaultScaleFactor;
+ return std::nullopt;
auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
if (!Update)
- return DefaultScaleFactor;
+ return std::nullopt;
Value *Op = Update->getOperand(0);
if (Op == PHI)
@@ -8667,7 +8621,7 @@ static unsigned getReductionScaleFactor(PHINode *PHI,
// Match dot product pattern
auto *BinOp = dyn_cast<BinaryOperator>(Op);
if (!BinOp || !BinOp->hasOneUse())
- return DefaultScaleFactor;
+ return std::nullopt;
auto IsSextOrZext = [](Instruction *I) {
return I && (I->getOpcode() == Instruction::ZExt ||
@@ -8677,13 +8631,13 @@ static unsigned getReductionScaleFactor(PHINode *PHI,
auto *ExtA = dyn_cast<Instruction>(BinOp->getOperand(0));
auto *ExtB = dyn_cast<Instruction>(BinOp->getOperand(1));
if (!IsSextOrZext(ExtA) || !IsSextOrZext(ExtB))
- return DefaultScaleFactor;
+ return std::nullopt;
Value *A = ExtA->getOperand(0);
Value *B = ExtB->getOperand(0);
// Check that the extends extend from the same type
if (A->getType() != B->getType())
- return DefaultScaleFactor;
+ return std::nullopt;
unsigned TargetScaleFactor =
PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
@@ -8694,6 +8648,13 @@ static unsigned getReductionScaleFactor(PHINode *PHI,
TTI::PartialReductionExtendKind OpBExtend =
TargetTransformInfo::getPartialReductionExtendKind(ExtB);
+ PartialReductionChain Chain;
+ Chain.Reduction = Rdx.getLoopExitInstr();
+ Chain.ExtendA = ExtA;
+ Chain.ExtendB = ExtB;
+ Chain.ScaleFactor = TargetScaleFactor;
+ Chain.BinOp = dyn_cast<Instruction>(Op);
+
if (LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) {
InstructionCost Cost = TTI->getPartialReductionCost(
@@ -8702,9 +8663,9 @@ static unsigned getReductionScaleFactor(PHINode *PHI,
return Cost.isValid();
},
Range))
- return TargetScaleFactor;
+ return Chain;
- return DefaultScaleFactor;
+ return std::nullopt;
}
VPRecipeBase *
@@ -8733,11 +8694,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
// If the PHI is used by a partial reduction, set the scale factor
- unsigned ScaleFactor =
- getReductionScaleFactor(Phi, RdxDesc, TTI, Range, CM);
- Instruction *ReductionInstr = RdxDesc.getLoopExitInstr();
- if (ScaleFactor != 1)
- Plan.addScaledReductionExitInstr(RdxDesc.getLoopExitInstr());
+ std::optional<PartialReductionChain> Chain =
+ getScaledReduction(Phi, RdxDesc, TTI, Range, CM);
+ unsigned ScaleFactor = Chain ? Chain->ScaleFactor : 1;
PhiRecipe = new VPReductionPHIRecipe(
Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
CM.useOrderedReductions(RdxDesc), ScaleFactor);
@@ -8772,7 +8731,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);
- if (Plan.isScaledReductionExitInstr(Instr))
+ if (Plan.getScaledReductionForInstr(Instr))
return tryToCreatePartialReduction(Instr, Operands);
if (!shouldWiden(Instr, Range))
@@ -9161,9 +9120,29 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
return Legal->blockNeedsPredication(BB) || NeedsBlends;
});
+
+ // Cache the partial reductions up front so we can remove the invalid ones
+ // before creating the recipes
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+ for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
+ Instruction *Instr = &I;
+ auto *Phi = dyn_cast<PHINode>(Instr);
+ if (!Phi || !Legal->isReductionVariable(Phi))
+ continue;
+ const RecurrenceDescriptor &RdxDesc =
+ Legal->getReductionVars().find(Phi)->second;
+ std::optional<PartialReductionChain> Chain =
+ getScaledReduction(Phi, RdxDesc, &TTI, Range, CM);
+ if (Chain.has_value())
+ Plan->addScaledReductionExitInstr(*Chain);
+ }
+ }
+ Plan->removeInvalidScaledReductionExitInstrs();
+
auto *MiddleVPBB =
cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
+
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
@@ -9206,11 +9185,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
continue;
}
- VPRecipeBase *Recipe = nullptr;
-
- if (!Recipe)
- Recipe =
- RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
+ VPRecipeBase *Recipe =
+ RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
if (!Recipe)
Recipe = RecipeBuilder.handleReplication(Instr, Range);
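As a rough illustration of the TargetScaleFactor computed in getScaledReduction() above: it is the ratio between the accumulator (PHI) element width and the pre-extension input element width, which is what getKnownScalarFactor() yields for fixed widths. The helper below is only a sketch with assumed names, not part of the patch.

// Sketch: scale factor = accumulator bits / pre-extension input bits,
// e.g. an i32 accumulator fed by extended i8 inputs gives 32 / 8 == 4,
// matching the "VF scaled by 1/4" annotation in the VPlan printing test.
unsigned scaleFactorFor(unsigned AccumBits, unsigned InputBits) {
  return AccumBits / InputBits;
}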
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ad4594ea918c59..5f9df54c24d763 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2300,7 +2300,6 @@ class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
/// Generate the reduction in the loop
void execute(VPTransformState &State) override;
- unsigned getOpcode() { return Opcode; }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
@@ -3576,6 +3575,24 @@ class VPRegionBlock : public VPBlockBase {
VPRegionBlock *clone() override;
};
+/// A chain of instructions that form a partial reduction.
+/// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))),
+/// accumulator)
+struct PartialReductionChain {
+ /// The top-level binary operation that forms the reduction to a scalar
+ /// after the loop body
+ Instruction *Reduction;
+ /// The extension of each of the inner binary operation's operands
+ Instruction *ExtendA;
+ Instruction *ExtendB;
+
+ Instruction *BinOp;
+
+ /// The scaling factor between the size of the reduction type and the
+ /// (possibly extended) inputs
+ unsigned ScaleFactor;
+};
+
/// VPlan models a candidate for vectorization, encoding various decisions taken
/// to produce efficient output IR, including which branches, basic-blocks and
/// output IR instructions to generate, and their cost. VPlan holds a
@@ -3641,7 +3658,8 @@ class VPlan {
/// Stores the set of reduction exit instructions that will be scaled to
/// a smaller VF in this plan via partial reductions.
- SmallPtrSet<const Instruction *, 2> ScaledReductionExitInstrs;
+ DenseMap<const Instruction *, PartialReductionChain>
+ ScaledReductionExitInstrs;
public:
/// Construct a VPlan with original preheader \p Preheader, trip count \p TC
@@ -3840,12 +3858,52 @@ class VPlan {
/// recipes to refer to the clones, and return it.
VPlan *duplicate();
- void addScaledReductionExitInstr(const Instruction *ExitInst) {
- ScaledReductionExitInstrs.insert(ExitInst);
+ void addScaledReductionExitInstr(PartialReductionChain Chain) {
+ ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Chain));
}
- bool isScaledReductionExitInstr(const Instruction *ExitInst) {
- return ScaledReductionExitInstrs.contains(ExitInst);
+ std::optional<PartialReductionChain>
+ getScaledReductionForInstr(const Instruction *ExitInst) {
+ auto It = ScaledReductionExitInstrs.find(ExitInst);
+ return It == ScaledReductionExitInstrs.end()
+ ? std::nullopt
+ : std::make_optional(It->second);
+ }
+
+ void removeInvalidScaledReductionExitInstrs() {
+ // A partial reduction is invalid if any of its extends are used by
+ // something that isn't another partial reduction. This is because the
+ // extends are intended to be lowered along with the reduction itself.
+
+ // Build up a set of partial reduction bin ops for efficient use checking
+ SmallSet<Instruction *, 4> PartialReductionBinOps;
+ for (auto It : ScaledReductionExitInstrs) {
+ if (It.second.BinOp)
+ PartialReductionBinOps.insert(It.second.BinOp);
+ }
+
+ auto ExtendIsOnlyUsedByPartialReductions =
+ [PartialReductionBinOps](Instruction *Extend) {
+ for (auto *Use : Extend->users()) {
+ Instruction *UseInstr = dyn_cast<Instruction>(Use);
+ if (!PartialReductionBinOps.contains(UseInstr))
+ return false;
+ }
+ return true;
+ };
+
+ // Check if each use of a chain's two extends is a partial reduction
+ // and remove those that have non-partial reduction users
+ SmallSet<Instruction *, 4> PartialReductionsToRemove;
+ for (auto It : ScaledReductionExitInstrs) {
+ PartialReductionChain Chain = It.second;
+ if (!ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) ||
+ !ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
+ PartialReductionsToRemove.insert(Chain.Reduction);
+ }
+
+ for (auto *Instr : PartialReductionsToRemove)
+ ScaledReductionExitInstrs.erase(Instr);
}
};
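To make the intent of removeInvalidScaledReductionExitInstrs() concrete, here is a minimal C++ sketch (illustrative only, not one of the tests in this series) of a loop it would be expected to reject: the extend of a[i] has a user outside the partial-reduction chain, so it cannot simply be folded into the partial-reduction lowering and the cached chain is dropped.

void reuse_extend(const unsigned char *a, const unsigned char *b,
                  int *widened, int n, int &sum) {
  for (int i = 0; i < n; ++i) {
    int ea = (int)a[i];    // extend with two users
    widened[i] = ea;       // non-partial-reduction user of the extend
    sum += ea * (int)b[i]; // partial-reduction user of the extend
  }
}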
>From 22337d4d333cb375a9690063733db91e6c6146b9 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 15 Oct 2024 11:41:05 +0100
Subject: [PATCH 4/4] Use cached partial reduction when getting the scale
factor for phi
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8a69a1cf061337..cdec370b34a24d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8695,7 +8695,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
// If the PHI is used by a partial reduction, set the scale factor
std::optional<PartialReductionChain> Chain =
- getScaledReduction(Phi, RdxDesc, TTI, Range, CM);
+ Plan.getScaledReductionForInstr(RdxDesc.getLoopExitInstr());
unsigned ScaleFactor = Chain ? Chain->ScaleFactor : 1;
PhiRecipe = new VPReductionPHIRecipe(
Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),