[llvm] [LoopVectorize] Add support for vectorisation of more early exit loops (PR #88385)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 8 05:21:38 PDT 2024
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/88385
From 39185b45a7434993e126d52cacf09ee86475ef98 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 8 Oct 2024 12:18:05 +0000
Subject: [PATCH] [LoopVectorize] Add support for vectorisation of more early
exit loops
This patch follows on from PR #107004 by adding support for
vectorisation of a simple class of loops that typically involves
searching for something, e.g.
  for (int i = 0; i < n; i++) {
    if (p[i] == val)
      return i;
  }
  return n;
or
  for (int i = 0; i < n; i++) {
    if (p1[i] != p2[i])
      return i;
  }
  return n;
In this initial commit we will only consider early exit loops legal
for vectorisation if they follow these criteria:
1. There are no stores in the loop.
2. The loop must have only one early uncountable exit like those shown
in the above examples.
3. The early exit block dominates the latch block.
4. The latch block must have an exact exit count.
5. The loop must not contain reductions or recurrences.
6. We must be able to prove at compile-time that the loop will not
contain faulting loads.
For point 6, once this patch lands I intend to follow up by supporting
some limited cases of faulting loops where we can version the loop based
on pointer alignment. For example, it turns out in the SPEC2017 benchmark
(xalancbmk) there is a std::find loop that we can vectorise provided we
add SCEV checks for the initial pointer being aligned to a multiple of
the VF. In practice, the pointer is regularly aligned to at least 32/64
bytes and, since the VF is a power of 2, any vector load <= 32/64 bytes
in size can only fault on the first lane, matching the behaviour of the
scalar loop. Given we already do such speculative versioning for loops
with unknown strides, alignment-based versioning doesn't seem any worse,
at least for loops with a single load.
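As a rough illustration (not part of this patch; the helper name and
the VF parameter are hypothetical), the property the versioning check
would rely on can be sketched in C as:

  #include <stdbool.h>
  #include <stdint.h>

  /* If p is aligned to VF bytes (VF a power of 2 no larger than the
   * page size), a VF-byte vector load at p cannot straddle a page
   * boundary. So if the load faults at all, the first lane's address
   * faults too - exactly where the scalar loop would have faulted. */
  static bool safe_to_speculate(const char *p, uintptr_t VF) {
    return ((uintptr_t)p % VF) == 0;
  }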
This patch makes use of the existing experimental_cttz_elts intrinsic,
which is needed in the vectorised early exit block to determine the
first lane that triggered the exit. This intrinsic has generic lowering
support, so it's guaranteed to work for all targets.
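For reference, the semantics of this intrinsic on the exit mask can be
modelled with a small C sketch (illustrative only, corresponding to the
variant where an all-false mask returns the element count rather than
poison):

  /* Count trailing zero elements: returns the index of the first set
   * lane in mask, or n if no lane is set. In the vector early exit
   * block this identifies the lane that triggered the exit. */
  static int cttz_elts(const bool *mask, int n) {
    for (int i = 0; i < n; i++)
      if (mask[i])
        return i;
    return n;
  }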
Tests have been updated here:
Transforms/LoopVectorize/simple_early_exit.ll
---
llvm/include/llvm/Support/GenericLoopInfo.h | 4 +
.../llvm/Support/GenericLoopInfoImpl.h | 10 +
.../Vectorize/LoopVectorizationLegality.cpp | 20 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 420 ++++-
llvm/lib/Transforms/Vectorize/VPlan.cpp | 79 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 99 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 51 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 6 +-
.../Transforms/Vectorize/VPlanVerifier.cpp | 55 +-
.../LoopVectorize/simple_early_exit.ll | 1461 ++++++++++++++---
.../Vectorize/VPlanVerifierTest.cpp | 137 ++
11 files changed, 1979 insertions(+), 363 deletions(-)
diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h
index d560ca648132c9..0fa13e2a3d0e15 100644
--- a/llvm/include/llvm/Support/GenericLoopInfo.h
+++ b/llvm/include/llvm/Support/GenericLoopInfo.h
@@ -294,6 +294,10 @@ template <class BlockT, class LoopT> class LoopBase {
/// Otherwise return null.
BlockT *getUniqueExitBlock() const;
+ /// Return the unique exit block for the latch, or null if there are multiple
+ /// different exit blocks.
+ BlockT *getUniqueLatchExitBlock() const;
+
/// Return true if this loop does not have any exit blocks.
bool hasNoExitBlocks() const;
diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h
index d19022729ace32..4945ea30950d23 100644
--- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h
+++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h
@@ -159,6 +159,16 @@ BlockT *LoopBase<BlockT, LoopT>::getUniqueExitBlock() const {
return getExitBlockHelper(this, true).first;
}
+template <class BlockT, class LoopT>
+BlockT *LoopBase<BlockT, LoopT>::getUniqueLatchExitBlock() const {
+ const BlockT *Latch = getLoopLatch();
+ assert(Latch && "Latch block must exist");
+ SmallVector<BlockT *, 4> ExitBlocks;
+ getUniqueExitBlocksHelper(this, ExitBlocks,
+ [Latch](const BlockT *BB) { return BB == Latch; });
+ return ExitBlocks.size() == 1 ? ExitBlocks[0] : nullptr;
+}
+
/// getExitEdges - Return all pairs of (_inside_block_,_outside_block_).
template <class BlockT, class LoopT>
void LoopBase<BlockT, LoopT>::getExitEdges(
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 43be72f0f34d45..f16e32b1f2d9cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -83,6 +83,12 @@ static cl::opt<bool> EnableHistogramVectorization(
"enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
cl::desc("Enables autovectorization of some loops containing histograms"));
+static cl::opt<bool> AssumeNoMemFault(
+ "vectorizer-no-mem-fault", cl::init(false), cl::Hidden,
+ cl::desc("Assume vectorized loops will not have memory faults, which is "
+ "potentially unsafe but can be useful for testing vectorization "
+ "of early exit loops."));
+
/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
@@ -1710,11 +1716,15 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
Predicates.clear();
if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
&Predicates)) {
- reportVectorizationFailure(
- "Loop may fault",
- "Cannot vectorize potentially faulting early exit loop",
- "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
- return false;
+ if (!AssumeNoMemFault) {
+ reportVectorizationFailure(
+ "Loop may fault",
+ "Cannot vectorize potentially faulting early exit loop",
+ "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+ return false;
+ } else
+ LLVM_DEBUG(dbgs() << "LV: Assuming early exit vector loop will not "
+ << "fault\n");
}
[[maybe_unused]] const SCEV *SymbolicMaxBTC =
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 001c8987667df8..1f8b074208bacd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -178,6 +178,10 @@ static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
cl::desc("Enable vectorization of epilogue loops."));
+static cl::opt<bool> EnableEarlyExitVectorization(
+ "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
+ cl::desc("Enable vectorization of early exit loops."));
+
static cl::opt<unsigned> EpilogueVectorizationForceVF(
"epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
cl::desc("When epilogue vectorization is enabled, and a value greater than "
@@ -547,6 +551,10 @@ class InnerLoopVectorizer {
BasicBlock *MiddleBlock, VPlan &Plan,
VPTransformState &State);
+ void fixupEarlyExitIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
+ BasicBlock *VectorEarlyExitBB, VPlan &Plan,
+ VPTransformState &State);
+
/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
@@ -637,10 +645,6 @@ class InnerLoopVectorizer {
/// Middle Block between the vector and the scalar.
BasicBlock *LoopMiddleBlock;
- /// The unique ExitBlock of the scalar loop if one exists. Note that
- /// there can be multiple exiting edges reaching this block.
- BasicBlock *LoopExitBlock;
-
/// The scalar loop body.
BasicBlock *LoopScalarBody;
@@ -1362,11 +1366,28 @@ class LoopVectorizationCostModel {
}
// If we might exit from anywhere but the latch, must run the exiting
// iteration in scalar form.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ if (!Legal->hasUncountableEarlyExit() &&
+ TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
LLVM_DEBUG(
dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
return true;
}
+ // If this is a loop with an uncountable early exit, then we may validly
+ // exit from a non-latch block and not require a scalar epilogue for the
+ // last iteration, since these exits are handled specially. However, since
+ // we could have both countable and uncountable exits we must search all
+ // the exits.
+ if (Legal->hasUncountableEarlyExit()) {
+ const SmallVector<BasicBlock *, 4> &CountableExitingBlocks =
+ Legal->getCountableExitingBlocks();
+ unsigned NumBlocks = CountableExitingBlocks.size();
+ if (NumBlocks > 1 || (NumBlocks == 1 && CountableExitingBlocks[0] !=
+ TheLoop->getLoopLatch())) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+ return true;
+ }
+ }
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
"interleaved group requires scalar epilogue\n");
@@ -2004,8 +2025,7 @@ class GeneratedRTChecks {
/// adjusts the branches to branch to the vector preheader or \p Bypass,
/// depending on the generated condition.
BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
- BasicBlock *LoopVectorPreHeader,
- BasicBlock *LoopExitBlock) {
+ BasicBlock *LoopVectorPreHeader) {
if (!SCEVCheckCond)
return nullptr;
@@ -2478,7 +2498,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
BasicBlock *const SCEVCheckBlock =
- RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
+ RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
if (!SCEVCheckBlock)
return nullptr;
@@ -2533,8 +2553,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopScalarBody = OrigLoop->getHeader();
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
- LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
- assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
+ assert((OrigLoop->getUniqueLatchExitBlock() ||
+ Cost->requiresScalarEpilogue(VF.isVector())) &&
"multiple exit loop without required epilogue?");
LoopMiddleBlock =
@@ -2717,18 +2737,39 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
// value (the value that feeds into the phi from the loop latch).
// We allow both, but they, obviously, have different values.
- assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
+ assert((OrigLoop->getUniqueExitBlock() || Legal->hasUncountableEarlyExit()) &&
+ "Expected a single exit block");
DenseMap<Value *, Value *> MissingVals;
+ BasicBlock *OrigLoopLatch = OrigLoop->getLoopLatch();
+ auto IsUseFromUncountableExit = [&](Value *V, Instruction *UI) -> bool {
+ PHINode *PHI = dyn_cast<PHINode>(UI);
+ assert(PHI && "Expected LCSSA form");
+ if (!Legal->hasUncountableEarlyExit())
+ return false;
+
+ // If this loop has an uncountable early exit then there could be an
+ // LCSSA phi user of OrigPhi with either:
+ // 1. Multiple incoming values, because each exiting block (countable
+ // or uncountable) jumps to the same exit block, or
+ // 2. A single incoming value from an uncountable exit block.
+ // In both cases there is no guarantee this came from a normal, countable
+ // exit. Currently if a loop has an uncountable early exit then it must
+ // have a latch with a countable exit.
+ int Index = PHI->getBasicBlockIndex(OrigLoopLatch);
+ return (Index == -1 || PHI->getIncomingValue(Index) != V);
+ };
+
// An external user of the last iteration's value should see the value that
// the remainder loop uses to initialize its own IV.
- Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
+ Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoopLatch);
for (User *U : PostInc->users()) {
Instruction *UI = cast<Instruction>(U);
if (!OrigLoop->contains(UI)) {
- assert(isa<PHINode>(UI) && "Expected LCSSA form");
- MissingVals[UI] = EndValue;
+ if (!IsUseFromUncountableExit(PostInc, UI))
+ MissingVals[cast<PHINode>(UI)] = EndValue;
}
}
@@ -2738,7 +2779,9 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
for (User *U : OrigPhi->users()) {
auto *UI = cast<Instruction>(U);
if (!OrigLoop->contains(UI)) {
- assert(isa<PHINode>(UI) && "Expected LCSSA form");
+ if (IsUseFromUncountableExit(OrigPhi, UI))
+ continue;
+
IRBuilder<> B(MiddleBlock->getTerminator());
// Fast-math-flags propagate from the original induction instruction.
@@ -2773,6 +2816,95 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
}
}
+void InnerLoopVectorizer::fixupEarlyExitIVUsers(PHINode *OrigPhi,
+ const InductionDescriptor &II,
+ BasicBlock *VectorEarlyExitBB,
+ VPlan &Plan,
+ VPTransformState &State) {
+ // There are two kinds of external IV usages - those that use the value
+ // computed in the last iteration (the PHI) and those that use the penultimate
+ // value (the value that feeds into the phi from the loop latch).
+ // We allow both, but they, obviously, have different values.
+ DenseMap<Value *, Value *> MissingVals;
+ BasicBlock *OrigEarlyExitingBlock = Legal->getUncountableEarlyExitingBlock();
+ BasicBlock *OrigLoopLatch = OrigLoop->getLoopLatch();
+ Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoopLatch);
+
+ auto FixUpPhi = [&](Instruction *UI, bool PostInc) -> Value * {
+ IRBuilder<> B(VectorEarlyExitBB->getTerminator());
+ assert(isa<PHINode>(UI) && "Expected LCSSA form");
+
+ // Fast-math-flags propagate from the original induction instruction.
+ if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
+ B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
+
+ // We need the mask that led us into the early exit block.
+ Value *EarlyExitMask =
+ State.get(Plan.getVectorLoopRegion()->getVectorEarlyExitCond());
+ VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+ Type *CtzType = CanonicalIV->getStartValue()->getLiveInIRValue()->getType();
+ Value *Ctz;
+ if (EarlyExitMask)
+ Ctz = B.CreateCountTrailingZeroElems(CtzType, EarlyExitMask);
+ else
+ Ctz = ConstantInt::get(CtzType, 0);
+ Ctz = B.CreateAdd(Ctz, cast<PHINode>(State.get(
+ CanonicalIV->getVPSingleValue(), VPLane(0))));
+ if (PostInc)
+ Ctz = B.CreateAdd(Ctz, ConstantInt::get(CtzType, 1));
+
+ Value *Escape = nullptr;
+ VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
+ assert(StepVPV && "step must have been expanded during VPlan execution");
+ Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
+ : State.get(StepVPV, VPLane(0));
+ Escape = emitTransformedIndex(B, Ctz, II.getStartValue(), Step,
+ II.getKind(), II.getInductionBinOp());
+ Escape->setName("ind.early.escape");
+
+ return Escape;
+ };
+
+ const Loop *L = this->OrigLoop;
+ auto isUsedInEarlyExitBlock =
+ [&L, &OrigEarlyExitingBlock](Value *V, Instruction *UI) -> bool {
+ if (!L->contains(UI)) {
+ PHINode *PHI = dyn_cast<PHINode>(UI);
+ assert(PHI && "Expected LCSSA form");
+ int Index = PHI->getBasicBlockIndex(OrigEarlyExitingBlock);
+ if (Index != -1 && PHI->getIncomingValue(Index) == V)
+ return true;
+ }
+ return false;
+ };
+
+ for (User *U : PostInc->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (isUsedInEarlyExitBlock(PostInc, UI))
+ MissingVals[UI] = FixUpPhi(UI, true);
+ }
+
+ for (User *U : OrigPhi->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (isUsedInEarlyExitBlock(OrigPhi, UI))
+ MissingVals[UI] = FixUpPhi(UI, false);
+ }
+
+ VPBasicBlock *EarlyExitVPBB = Plan.getVectorLoopRegion()->getEarlyExit();
+ for (auto &I : MissingVals) {
+ PHINode *PHI = cast<PHINode>(I.first);
+ // One corner case we have to handle is two IVs "chasing" each other,
+ // that is %IV2 = phi [...], [ %IV1, %latch ]
+ // In this case, if IV1 has an external use, we need to avoid adding both
+ // "last value of IV1" and "penultimate value of IV2". So, verify that we
+ // don't already have an incoming value for the vector early exit block.
+ if (PHI->getBasicBlockIndex(VectorEarlyExitBB) == -1) {
+ PHI->addIncoming(I.second, VectorEarlyExitBB);
+ Plan.removeLiveOut(PHI, EarlyExitVPBB);
+ }
+ }
+}
+
namespace {
struct CSEDenseMapInfo {
@@ -2902,6 +3034,21 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
for (PHINode &PN : Exit->phis())
PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
+ VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
+ if (VectorRegion->getEarlyExit()) {
+ // Fix-up external users of the induction variables.
+ VPBasicBlock *VectorEarlyExitVPBB =
+ cast<VPBasicBlock>(VectorRegion->getEarlyExit());
+ BasicBlock *VectorEarlyExitBB = State.CFG.VPBB2IRBB[VectorEarlyExitVPBB];
+ for (const auto &Entry : Legal->getInductionVars())
+ fixupEarlyExitIVUsers(Entry.first, Entry.second, VectorEarlyExitBB, Plan,
+ State);
+
+ BasicBlock *OrigEarlyExitBB = Legal->getUncountableEarlyExitBlock();
+ if (Loop *EEL = LI->getLoopFor(OrigEarlyExitBB))
+ EEL->addBasicBlockToLoop(VectorEarlyExitBB, *LI);
+ }
+
if (Cost->requiresScalarEpilogue(VF.isVector())) {
// No edge from the middle block to the unique exit block has been inserted
// and there is nothing to fix from vector loop; phis should have incoming
@@ -2928,7 +3075,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);
- VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
@@ -3546,9 +3692,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
SmallVector<BasicBlock *> Exiting;
TheLoop->getExitingBlocks(Exiting);
for (BasicBlock *E : Exiting) {
- auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
- if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
- AddToWorklistIfAllowed(Cmp);
+ if (!Legal->hasUncountableEarlyExit() ||
+ E != Legal->getUncountableEarlyExitingBlock()) {
+ auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
+ if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+ AddToWorklistIfAllowed(Cmp);
+ }
}
auto PrevVF = VF.divideCoefficientBy(2);
@@ -3998,7 +4147,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// a bottom-test and a single exiting block. We'd have to handle the fact
// that not every instruction executes on the last iteration. This will
// require a lane mask which varies through the vector loop body. (TODO)
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ if (Legal->hasUncountableEarlyExit() ||
+ TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
// If there was a tail-folding hint/switch, but we can't fold the tail by
// masking, fallback to a vectorization with a scalar epilogue.
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
@@ -4603,7 +4753,9 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
// Epilogue vectorization code has not been auditted to ensure it handles
// non-latch exits properly. It may be fine, but it needs auditted and
// tested.
- if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
+ // TODO: Add support for loops with an early exit.
+ if (Legal->hasUncountableEarlyExit() ||
+ OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
return false;
return true;
@@ -4839,6 +4991,10 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
if (!Legal->isSafeForAnyVectorWidth())
return 1;
+ // We don't attempt to perform interleaving for early exit loops.
+ if (Legal->hasUncountableEarlyExit())
+ return 1;
+
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
const bool HasReductions = !Legal->getReductionVars().empty();
@@ -6425,9 +6581,23 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
// The back-edge branch will remain, as will all scalar branches.
return TTI.getCFInstrCost(Instruction::Br, CostKind);
-
- // This branch will be eliminated by if-conversion.
- return 0;
+ else if (Legal->hasUncountableEarlyExit() &&
+ I->getParent() == Legal->getUncountableEarlyExitingBlock()) {
+ // In order to determine whether we take an early exit or not we have to
+ // perform an or reduction of the vector predicate.
+ auto *Vec_i1Ty =
+ VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
+ InstructionCost EECost = TTI.getArithmeticReductionCost(
+ Instruction::Or, Vec_i1Ty, std::nullopt, CostKind);
+ // Add on the cost of the conditional branch, which will remain.
+ EECost += TTI.getCFInstrCost(Instruction::Br, CostKind);
+ // TODO: The vector loop early exit block also needs to do work to
+ // determine the first lane that triggered the exit. We should probably
+ // add that somehow, but the cost will be negligible for long loops.
+ return EECost;
+ } else
+ // This branch will be eliminated by if-conversion.
+ return 0;
// Note: We currently assume zero cost for an unconditional branch inside
// a predicated block since it will become a fall-through, although we
// may decide in the future to call TTI for all branches.
@@ -7531,6 +7701,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
BestVPlan.getPreheader()->execute(&State);
}
+ if (Legal->hasUncountableEarlyExit())
+ State.CFG.EarlyExitBB = Legal->getUncountableEarlyExitBlock();
+
if (!ILV.getTripCount())
ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
else
@@ -7584,7 +7757,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 2.5 Collect reduction resume values.
auto *ExitVPBB =
- cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+ cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSuccessors()[0]);
for (VPRecipeBase &R : *ExitVPBB) {
createAndCollectMergePhiForReduction(
dyn_cast<VPInstruction>(&R), State, OrigLoop,
@@ -7807,7 +7980,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
- DT->changeImmediateDominator(LoopExitBlock,
+ DT->changeImmediateDominator(OrigLoop->getUniqueLatchExitBlock(),
EPI.EpilogueIterationCountCheck);
// Keep track of bypass blocks, as they feed start values to the induction and
@@ -8672,7 +8845,7 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
auto *MiddleVPBB =
- cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSuccessors()[0]);
// No edge from the middle block to the unique exit block has been inserted
// and there is nothing to fix from vector loop; phis should have incoming
// from scalar loop only.
@@ -8680,7 +8853,12 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
return {};
SetVector<VPIRInstruction *> ExitUsersToFix;
VPBasicBlock *ExitVPBB = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
- BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
+ BasicBlock *ExitingBB;
+ if (Plan.getVectorLoopRegion()->getEarlyExit())
+ ExitingBB = OrigLoop->getLoopLatch();
+ else
+ ExitingBB = OrigLoop->getExitingBlock();
+
for (VPRecipeBase &R : *ExitVPBB) {
auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
if (!ExitIRI)
@@ -8719,7 +8897,7 @@ addUsersInExitBlock(VPlan &Plan,
return;
auto *MiddleVPBB =
- cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSuccessors()[0]);
BasicBlock *ExitBB =
cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock();
VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
@@ -8758,7 +8936,7 @@ static void addLiveOutsForFirstOrderRecurrences(
// TODO: Should be replaced by
// Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
// scalar region is modeled as well.
- auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
+ auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSuccessors()[0]);
VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
// Order is strict: first is the exit block, second is the scalar preheader.
@@ -8784,6 +8962,8 @@ static void addLiveOutsForFirstOrderRecurrences(
if (!FOR)
continue;
+ assert(VectorRegion->getNumSuccessors() == 1 &&
+ "Cannot handle multiple successors");
// This is the second phase of vectorizing first-order recurrences, creating
// extract for users outside the loop. An overview of the transformation is
// described below. Suppose we have the following loop with some use after
@@ -8876,6 +9056,59 @@ static void addLiveOutsForFirstOrderRecurrences(
}
}
+// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
+// original exit block.
+static void addUsersInEarlyExitBlock(BasicBlock *EarlyExitingBB,
+ BasicBlock *EarlyExitBB,
+ VPRecipeBuilder &RecipeBuilder,
+ VPlan &Plan) {
+ VPBasicBlock *EarlyExitVPBB = Plan.getVectorLoopRegion()->getEarlyExit();
+ VPBuilder B(EarlyExitVPBB);
+ for (PHINode &ExitPhi : EarlyExitBB->phis()) {
+ Value *IncomingValue = ExitPhi.getIncomingValueForBlock(EarlyExitingBB);
+ VPValue *V = RecipeBuilder.getVPValueOrAddLiveIn(IncomingValue);
+ VPValue *EarlyExitMask =
+ Plan.getVectorLoopRegion()->getVectorEarlyExitCond();
+ VPValue *Ext =
+ B.createNaryOp(VPInstruction::ExtractHighestActive, {V, EarlyExitMask});
+
+ Plan.addLiveOut(&ExitPhi, Ext, EarlyExitVPBB);
+ }
+}
+
+static VPValue *getConditionForVectorEarlyExit(Loop *OrigLoop,
+ BasicBlock *ExitingBB,
+ VPlan &Plan, VPBuilder &Builder,
+ VPRecipeBuilder &RecipeBuilder,
+ VPRecipeBase *VPEarlyExitCond) {
+ // To make things easier we canonicalise the condition so that 'true'
+ // means take the early exit.
+ auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+
+ // If the true destination is in the loop then we want to invert the
+ // condition so that true means early exit.
+ bool NeedsInvert = OrigLoop->contains(BI->getSuccessor(0));
+
+ VPValue *ScalarExitCond;
+ if (!VPEarlyExitCond) {
+ // If we didn't find the exit condition, then this must have been
+ // defined outside the loop and is loop invariant.
+ ScalarExitCond = RecipeBuilder.getVPValueOrAddLiveIn(BI->getCondition());
+ if (NeedsInvert)
+ ScalarExitCond = Builder.createNot(ScalarExitCond);
+ Plan.getVectorLoopRegion()->setVectorEarlyExitCond(ScalarExitCond);
+ } else {
+ VPValue *EarlyExitMask = VPEarlyExitCond->getVPSingleValue();
+ if (NeedsInvert)
+ EarlyExitMask = Builder.createNot(EarlyExitMask);
+ Plan.getVectorLoopRegion()->setVectorEarlyExitCond(EarlyExitMask);
+ // If any lane of EarlyExitMask would be true we should exit the loop.
+ ScalarExitCond =
+ Builder.createNaryOp(VPInstruction::OrReduction, {EarlyExitMask});
+ }
+ return ScalarExitCond;
+}
+
VPlanPtr
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
@@ -8898,9 +9131,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
return !CM.requiresScalarEpilogue(VF.isVector());
},
Range);
- VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(),
- PSE, RequiresScalarEpilogueCheck,
- CM.foldTailByMasking(), OrigLoop);
+ VPlanPtr Plan = VPlan::createInitialVPlan(
+ Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
+ CM.foldTailByMasking(), OrigLoop, Legal->hasUncountableEarlyExit());
// Don't use getDecisionAndClampRange here, because we don't know the UF
// so this function is better to be conservative, rather than to split
@@ -8964,8 +9197,20 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
return Legal->blockNeedsPredication(BB) || NeedsBlends;
});
auto *MiddleVPBB =
- cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
+ cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSuccessors()[0]);
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
+
+ // If we find the recipe for the early exit condition we need to record it
+ // so that we can then generate the new vector exit condition.
+ VPRecipeBase *VPEarlyExitCond = nullptr;
+ Value *EarlyExitCond = nullptr;
+ BasicBlock *EarlyExitingBB = nullptr;
+ if (Legal->hasUncountableEarlyExit()) {
+ EarlyExitingBB = Legal->getUncountableEarlyExitingBlock();
+ BranchInst *BI = cast<BranchInst>(EarlyExitingBB->getTerminator());
+ EarlyExitCond = BI->getCondition();
+ }
+
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
@@ -9013,6 +9258,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
if (!Recipe)
Recipe = RecipeBuilder.handleReplication(Instr, Range);
+ if (&I == EarlyExitCond)
+ VPEarlyExitCond = Recipe;
+
RecipeBuilder.setRecipe(Instr, Recipe);
if (isa<VPHeaderPHIRecipe>(Recipe)) {
// VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
@@ -9031,6 +9279,22 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBB->appendRecipe(Recipe);
}
+ // If this is an early exit block we need to do more work to generate the
+ // actual exit condition. We generate an or reduction of the vector
+ // condition so that we exit the loop if any lane of the vector would cause
+ // us to exit.
+ if (BB == EarlyExitingBB) {
+ VPValue *ScalarExitCond = getConditionForVectorEarlyExit(
+ OrigLoop, BB, *Plan, Builder, RecipeBuilder, VPEarlyExitCond);
+
+ // Branch to early exit BB.
+ auto *NewBR =
+ new VPInstruction(VPInstruction::BranchOnCond, {ScalarExitCond});
+ RecipeBuilder.setRecipe(cast<BranchInst>(BB->getTerminator()), NewBR);
+ VPBB->appendRecipe(NewBR);
+
+ Plan->getVectorLoopRegion()->setEarlyExiting(VPBB);
+ }
VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
@@ -9049,6 +9313,14 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
addUsersInExitBlock(*Plan, ExitUsersToFix);
+ // First-order recurrences are not currently permitted with early exits,
+ // so there is no need to collect a list of early exit users in the same
+ // way as above.
+ if (EarlyExitingBB)
+ addUsersInEarlyExitBlock(EarlyExitingBB,
+ Legal->getUncountableEarlyExitBlock(),
+ RecipeBuilder, *Plan);
+
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
@@ -9176,11 +9448,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
VPBasicBlock *MiddleVPBB =
- cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
+ cast<VPBasicBlock>(VectorLoopRegion->getSuccessors()[0]);
for (VPRecipeBase &R : Header->phis()) {
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
continue;
+ assert(VectorLoopRegion->getNumSuccessors() == 1 &&
+ "Cannot handle multiple succesors!");
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
RecurKind Kind = RdxDesc.getRecurrenceKind();
@@ -9645,15 +9919,71 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
}
}
-static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
- VectorizationFactor &VF,
- std::optional<unsigned> VScale, Loop *L,
- ScalarEvolution &SE,
- ScalarEpilogueLowering SEL) {
+static InstructionCost calculateEarlyExitCost(const TargetTransformInfo *TTI,
+ LoopVectorizationLegality *Legal,
+ Loop *L, ElementCount VF) {
+ unsigned NumCttzElemCalls = 0;
+ BasicBlock *OrigEarlyExitingBlock = Legal->getUncountableEarlyExitingBlock();
+ BasicBlock *OrigLoopLatch = L->getLoopLatch();
+
+ auto isUsedInEarlyExitBlock = [&L, &OrigEarlyExitingBlock](Value *V,
+ User *U) -> bool {
+ auto *UI = cast<Instruction>(U);
+ if (!L->contains(UI)) {
+ PHINode *PHI = dyn_cast<PHINode>(UI);
+ assert(PHI && "Expected LCSSA form");
+ int Index = PHI->getBasicBlockIndex(OrigEarlyExitingBlock);
+ if (Index != -1 && PHI->getIncomingValue(Index) == V)
+ return true;
+ }
+ return false;
+ };
+
+ for (const auto &Entry : Legal->getInductionVars()) {
+ PHINode *OrigPhi = Entry.first;
+ Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoopLatch);
+
+ for (User *U : PostInc->users())
+ if (isUsedInEarlyExitBlock(PostInc, U))
+ NumCttzElemCalls++;
+
+ for (User *U : OrigPhi->users())
+ if (isUsedInEarlyExitBlock(OrigPhi, U))
+ NumCttzElemCalls++;
+ }
+
+ InstructionCost Cost = 0;
+ if (NumCttzElemCalls) {
+ LLVMContext &Context = L->getHeader()->getContext();
+ // Ideally we'd query the vplan for the canonical IV type, but we don't
+ // have a vplan yet so let's assume it's 64-bit.
+ auto CtzType = IntegerType::getIntNTy(Context, 64);
+ auto VecI1Type = VectorType::get(IntegerType::getInt1Ty(Context), VF);
+
+ IntrinsicCostAttributes Attrs(
+ Intrinsic::experimental_cttz_elts, CtzType,
+ {PoisonValue::get(VecI1Type), ConstantInt::getTrue(Context)});
+ Cost = TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_RecipThroughput);
+ Cost *= NumCttzElemCalls;
+ }
+ return Cost;
+}
+
+static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
+ VectorizationFactor &VF,
+ std::optional<unsigned> VScale, Loop *L,
+ ScalarEvolution &SE,
+ ScalarEpilogueLowering SEL,
+ InstructionCost EarlyExitCost) {
InstructionCost CheckCost = Checks.getCost();
if (!CheckCost.isValid())
return false;
+ // Add on the cost of work required in the vector early exit block, if one
+ // exists.
+ if (EarlyExitCost.isValid())
+ CheckCost += EarlyExitCost;
+
// When interleaving only scalar and vector cost will be equal, which in turn
// would lead to a divide by 0. Fall back to hard threshold.
if (VF.Width.isScalar()) {
@@ -9802,7 +10132,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- if (LVL.hasUncountableEarlyExit()) {
+ if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
"early exit is not yet supported",
"Auto-vectorization of loops with uncountable "
@@ -9950,12 +10280,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (VF.Width.isVector() || SelectedIC > 1)
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+ InstructionCost EarlyExitCost = InstructionCost::getInvalid();
+ if (VF.Width.isVector() && LVL.hasUncountableEarlyExit())
+ EarlyExitCost = calculateEarlyExitCost(TTI, &LVL, L, VF.Width);
+
// Check if it is profitable to vectorize with runtime checks.
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (!ForceVectorization &&
- !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
- *PSE.getSE(), SEL)) {
+ !isOutsideLoopWorkProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
+ *PSE.getSE(), SEL, EarlyExitCost)) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 5e3a6388094940..98de6ff28637e3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -419,7 +419,14 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
// Hook up the new basic block to its predecessors.
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
- VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
+ auto *VPRB = dyn_cast<VPRegionBlock>(PredVPBlock);
+
+ // The exiting block that leads to this block might be an early exit from
+ // a loop region.
+ VPBasicBlock *PredVPBB = VPRB && VPRB->getEarlyExit() == this
+ ? cast<VPBasicBlock>(VPRB->getEarlyExiting())
+ : PredVPBlock->getExitingBasicBlock();
+
auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
@@ -441,6 +448,11 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
// Set each forward successor here when it is created, excluding
// backedges. A backward successor is set when the branch is created.
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+ VPRegionBlock *PredParentRegion =
+ dyn_cast_or_null<VPRegionBlock>(PredVPBB->getParent());
+ if (PredParentRegion->getEarlyExiting() == PredVPBB) {
+ idx = 1 - idx;
+ }
assert(!TermBr->getSuccessor(idx) &&
"Trying to reset an existing successor block.");
TermBr->setSuccessor(idx, NewBB);
@@ -497,6 +509,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
!((SingleHPred = getSingleHierarchicalPredecessor()) &&
SingleHPred->getExitingBasicBlock() == PrevVPBB &&
PrevVPBB->getSingleHierarchicalSuccessor() &&
+ PrevVPBB != getEnclosingLoopRegion()->getEarlyExiting() &&
(SingleHPred->getParent() == getEnclosingLoopRegion() &&
!IsLoopRegion(SingleHPred))) && /* B */
!(Replica && getPredecessors().empty())) { /* C */
@@ -515,7 +528,8 @@ void VPBasicBlock::execute(VPTransformState *State) {
UnreachableInst *Terminator = State->Builder.CreateUnreachable();
// Register NewBB in its loop. In innermost loops its the same for all
// BB's.
- if (State->CurrentVectorLoop)
+ if (State->CurrentVectorLoop &&
+ this != getEnclosingLoopRegion()->getEarlyExit())
State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI);
State->Builder.SetInsertPoint(Terminator);
State->CFG.PrevBB = NewBB;
@@ -633,7 +647,11 @@ const VPRecipeBase *VPBasicBlock::getTerminator() const {
}
bool VPBasicBlock::isExiting() const {
- return getParent() && getParent()->getExitingBasicBlock() == this;
+ const VPRegionBlock *VPRB = getParent();
+ if (!VPRB)
+ return false;
+ return VPRB->getExitingBasicBlock() == this ||
+ VPRB->getEarlyExiting() == this;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -874,7 +892,8 @@ VPIRBasicBlock *VPIRBasicBlock::fromBasicBlock(BasicBlock *IRBB) {
VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
PredicatedScalarEvolution &PSE,
bool RequiresScalarEpilogueCheck,
- bool TailFolded, Loop *TheLoop) {
+ bool TailFolded, Loop *TheLoop,
+ bool HasEarlyExit) {
VPIRBasicBlock *Entry =
VPIRBasicBlock::fromBasicBlock(TheLoop->getLoopPreheader());
VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph");
@@ -887,8 +906,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// uncountable exits whilst also ensuring the symbolic maximum and known
// back-edge taken count remain identical for loops with countable exits.
const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
- assert((!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
- BackedgeTakenCountSCEV == PSE.getBackedgeTakenCount()) &&
+ assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
"Invalid loop count");
ScalarEvolution &SE = *PSE.getSE();
const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
@@ -908,6 +926,12 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
+ if (HasEarlyExit) {
+ VPBasicBlock *EarlyExitVPBB = new VPBasicBlock("vector.early.exit");
+ TopRegion->setEarlyExit(EarlyExitVPBB);
+ VPBlockUtils::connectBlocks(TopRegion, EarlyExitVPBB);
+ }
+
VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
if (!RequiresScalarEpilogueCheck) {
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -922,7 +946,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// 2) If we require a scalar epilogue, there is no conditional branch as
// we unconditionally branch to the scalar preheader. Do nothing.
// 3) Otherwise, construct a runtime check.
- BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
+ BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
// The connection order corresponds to the operands of the conditional branch.
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
@@ -998,7 +1022,8 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
-static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
+static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
+ BasicBlock *IRBB) {
VPIRBasicBlock *IRVPBB = VPIRBasicBlock::fromBasicBlock(IRBB);
for (auto &R : make_early_inc_range(*VPBB)) {
assert(!R.isPhi() && "Tried to move phi recipe to end of block");
@@ -1012,6 +1037,7 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
VPBlockUtils::disconnectBlocks(VPBB, Succ);
}
delete VPBB;
+ return IRVPBB;
}
/// Generate the code inside the preheader and body of the vectorized loop.
@@ -1035,7 +1061,7 @@ void VPlan::execute(VPTransformState *State) {
// VPlan execution rather than earlier during VPlan construction.
BasicBlock *MiddleBB = State->CFG.ExitBB;
VPBasicBlock *MiddleVPBB =
- cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[0]);
// Find the VPBB for the scalar preheader, relying on the current structure
// when creating the middle block and its successrs: if there's a single
// predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1049,7 +1075,14 @@ void VPlan::execute(VPTransformState *State) {
assert(!isa<VPIRBasicBlock>(ScalarPhVPBB) &&
"scalar preheader cannot be wrapped already");
replaceVPBBWithIRVPBB(ScalarPhVPBB, ScalarPh);
- replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
+ MiddleVPBB = replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
+
+ // Ensure the middle block is still the first successor.
+ for (auto *Succ : getVectorLoopRegion()->getSuccessors())
+ if (Succ == MiddleVPBB) {
+ getVectorLoopRegion()->moveSuccessorToFront(MiddleVPBB);
+ break;
+ }
// Disconnect the middle block from its single successor (the scalar loop
// header) in both the CFG and DT. The branch will be recreated during VPlan
@@ -1110,6 +1143,20 @@ void VPlan::execute(VPTransformState *State) {
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
}
+ // Patch up early exiting vector block to jump to the original scalar loop's
+ // early exit block.
+ if (getVectorLoopRegion()->getEarlyExit()) {
+ VPBasicBlock *EarlyExitVPBB =
+ cast<VPBasicBlock>(getVectorLoopRegion()->getEarlyExit());
+ BasicBlock *VectorEarlyExitBB = State->CFG.VPBB2IRBB[EarlyExitVPBB];
+ BasicBlock *OrigEarlyExitBB = State->CFG.EarlyExitBB;
+ BranchInst *BI = BranchInst::Create(OrigEarlyExitBB);
+ BI->insertBefore(VectorEarlyExitBB->getTerminator());
+ VectorEarlyExitBB->getTerminator()->eraseFromParent();
+ State->CFG.DTU.applyUpdates(
+ {{DominatorTree::Insert, VectorEarlyExitBB, OrigEarlyExitBB}});
+ }
+
State->CFG.DTU.flush();
assert(State->CFG.DTU.getDomTree().verify(
DominatorTree::VerificationLevel::Fast) &&
@@ -1218,9 +1265,10 @@ LLVM_DUMP_METHOD
void VPlan::dump() const { print(dbgs()); }
#endif
-void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
- assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists");
- LiveOuts.insert({PN, new VPLiveOut(PN, V)});
+void VPlan::addLiveOut(PHINode *PN, VPValue *V, VPBasicBlock *IncomingBlock) {
+ auto Key = std::pair<PHINode *, VPBasicBlock *>(PN, IncomingBlock);
+ assert(LiveOuts.count(Key) == 0 && "an exit value for PN already exists");
+ LiveOuts.insert({Key, new VPLiveOut(PN, V, IncomingBlock)});
}
static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
@@ -1291,8 +1339,9 @@ VPlan *VPlan::duplicate() {
remapOperands(Entry, NewEntry, Old2NewVPValues);
// Clone live-outs.
- for (const auto &[_, LO] : LiveOuts)
- NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
+ for (const auto &[Key, LO] : LiveOuts)
+ NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)],
+ Key.second);
// Initialize remaining fields of cloned VPlan.
NewPlan->VFs = VFs;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8c5246d613c13d..7bf1dac06a8911 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -345,6 +345,11 @@ struct VPTransformState {
/// vector loop.
BasicBlock *ExitBB = nullptr;
+ /// We need to keep track of the early exit block from the original scalar
+ /// loop in order to update the dominator tree correctly, since the vector
+ /// early exit will also jump to the original.
+ BasicBlock *EarlyExitBB = nullptr;
+
/// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
/// of replication, maps the BasicBlock of the last replica created.
SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
@@ -617,6 +622,17 @@ class VPBlockBase {
return true;
}
+ void moveSuccessorToFront(VPBlockBase *Succ) {
+ if (Successors[0] == Succ)
+ return;
+
+ removeSuccessor(Succ);
+
+ VPBlockBase *Old = Successors[0];
+ Successors[0] = Succ;
+ appendSuccessor(Old);
+ }
+
/// Replace all operands of VPUsers in the block with \p NewValue and also
/// replaces all uses of VPValues defined in the block with NewValue.
virtual void dropAllReferences(VPValue *NewValue) = 0;
@@ -662,10 +678,12 @@ class VPBlockBase {
/// used.
class VPLiveOut : public VPUser {
PHINode *Phi;
+ VPBasicBlock *IncomingVPBB;
public:
- VPLiveOut(PHINode *Phi, VPValue *Op)
- : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {}
+ VPLiveOut(PHINode *Phi, VPValue *Op, VPBasicBlock *IncomingVPBB = nullptr)
+ : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi),
+ IncomingVPBB(IncomingVPBB) {}
static inline bool classof(const VPUser *U) {
return U->getVPUserID() == VPUser::VPUserID::LiveOut;
@@ -1244,6 +1262,8 @@ class VPInstruction : public VPRecipeWithIRFlags,
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
PtrAdd,
+ ExtractHighestActive,
+ OrReduction,
};
private:
@@ -3343,7 +3363,12 @@ class VPIRBasicBlock : public VPBasicBlock {
};
/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
-/// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG.
+/// which form a Single-Entry-Single-Exiting or Single-Entry-Multiple-Exiting
+/// subgraph of the output IR CFG. For the multiple-exiting case only a total
+/// of two exits are currently supported and the early exit is tracked
+/// separately. The first successor should always correspond to the normal
+/// exiting block, i.e. vector latch -> middle.block. An optional second
+/// successor corresponds to the early exit.
/// A VPRegionBlock may indicate that its contents are to be replicated several
/// times. This is designed to support predicated scalarization, in which a
/// scalar if-then code structure needs to be generated VF * UF times. Having
@@ -3351,13 +3376,25 @@ class VPIRBasicBlock : public VPBasicBlock {
/// candidate VF's. The actual replication takes place only once the desired VF
/// and UF have been determined.
class VPRegionBlock : public VPBlockBase {
- /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
+ /// Hold the Single Entry of the SESE/SEME region modelled by the
+ /// VPRegionBlock.
VPBlockBase *Entry;
- /// Hold the Single Exiting block of the SESE region modelled by the
+ /// Hold the normal Exiting block of the SESE/SEME region modelled by the
/// VPRegionBlock.
VPBlockBase *Exiting;
+ /// Hold the Early Exiting block of the SEME region. If this is a SESE region
+ /// this value should be nullptr.
+ VPBlockBase *EarlyExiting;
+
+ /// Hold the Early Exit block of the SEME region, if one exists.
+ VPBasicBlock *EarlyExit;
+
+ /// If one exists, this keeps track of the vector early exit mask that
+ /// triggered the early exit.
+ VPValue *VectorEarlyExitCond;
+
/// An indicator whether this region is to generate multiple replicated
/// instances of output IR corresponding to its VPBlockBases.
bool IsReplicator;
@@ -3366,6 +3403,7 @@ class VPRegionBlock : public VPBlockBase {
VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
const std::string &Name = "", bool IsReplicator = false)
: VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
+ EarlyExiting(nullptr), EarlyExit(nullptr), VectorEarlyExitCond(nullptr),
IsReplicator(IsReplicator) {
assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
@@ -3374,6 +3412,7 @@ class VPRegionBlock : public VPBlockBase {
}
VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
: VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
+ EarlyExiting(nullptr), EarlyExit(nullptr), VectorEarlyExitCond(nullptr),
IsReplicator(IsReplicator) {}
~VPRegionBlock() override {
@@ -3392,6 +3431,12 @@ class VPRegionBlock : public VPBlockBase {
const VPBlockBase *getEntry() const { return Entry; }
VPBlockBase *getEntry() { return Entry; }
+ /// Sets the early exit vector mask.
+ void setVectorEarlyExitCond(VPValue *V) { VectorEarlyExitCond = V; }
+
+ /// Gets the early exit vector mask.
+ VPValue *getVectorEarlyExitCond() const { return VectorEarlyExitCond; }
+
/// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
/// EntryBlock must have no predecessors.
void setEntry(VPBlockBase *EntryBlock) {
@@ -3404,8 +3449,8 @@ class VPRegionBlock : public VPBlockBase {
const VPBlockBase *getExiting() const { return Exiting; }
VPBlockBase *getExiting() { return Exiting; }
- /// Set \p ExitingBlock as the exiting VPBlockBase of this VPRegionBlock. \p
- /// ExitingBlock must have no successors.
+ /// Set \p ExitingBlock as the normal exiting VPBlockBase of this
+ /// VPRegionBlock. \p ExitingBlock must have no successors.
void setExiting(VPBlockBase *ExitingBlock) {
assert(ExitingBlock->getSuccessors().empty() &&
"Exit block cannot have successors.");
@@ -3413,6 +3458,29 @@ class VPRegionBlock : public VPBlockBase {
ExitingBlock->setParent(this);
}
+ /// Set \p EarlyExitingBlock as the early exiting VPBlockBase of this
+ /// VPRegionBlock. \p EarlyExitingBlock must have a successor, since
+ /// it cannot be the latch.
+ void setEarlyExiting(VPBlockBase *EarlyExitingBlock) {
+ assert(EarlyExitingBlock->getNumSuccessors() == 1 &&
+ "Early exit block must have a successor.");
+ assert(EarlyExitingBlock->getParent() == this &&
+ "Early exit block should already be in loop region");
+ EarlyExiting = EarlyExitingBlock;
+ }
+
+ const VPBlockBase *getEarlyExiting() const { return EarlyExiting; }
+ VPBlockBase *getEarlyExiting() { return EarlyExiting; }
+
+ void setEarlyExit(VPBasicBlock *ExitBlock) { EarlyExit = ExitBlock; }
+
+ const VPBasicBlock *getEarlyExit() const { return EarlyExit; }
+ VPBasicBlock *getEarlyExit() { return EarlyExit; }
+
+ /// Return the number of exiting blocks from this region. It should match
+ /// the number of successors.
+ unsigned getNumExitingBlocks() const { return EarlyExiting ? 2 : 1; }
+
/// Returns the pre-header VPBasicBlock of the loop region.
VPBasicBlock *getPreheaderVPBB() {
assert(!isReplicator() && "should only get pre-header of loop regions");
@@ -3505,7 +3573,7 @@ class VPlan {
/// Values used outside the plan. It contains live-outs that need fixing. Any
/// live-out that is fixed outside VPlan needs to be removed. The remaining
/// live-outs are fixed via VPLiveOut::fixPhi.
- MapVector<PHINode *, VPLiveOut *> LiveOuts;
+ MapVector<std::pair<PHINode *, VPBasicBlock *>, VPLiveOut *> LiveOuts;
/// Mapping from SCEVs to the VPValues representing their expansions.
/// NOTE: This mapping is temporary and will be removed once all users have
@@ -3549,7 +3617,8 @@ class VPlan {
static VPlanPtr createInitialVPlan(Type *InductionTy,
PredicatedScalarEvolution &PSE,
bool RequiresScalarEpilogueCheck,
- bool TailFolded, Loop *TheLoop);
+ bool TailFolded, Loop *TheLoop,
+ bool HasEarlyExit = false);
/// Prepare the plan for execution, setting up the required live-in values.
void prepareToExecute(Value *TripCount, Value *VectorTripCount,
@@ -3686,12 +3755,20 @@ class VPlan {
return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
}
- void addLiveOut(PHINode *PN, VPValue *V);
+ void addLiveOut(PHINode *PN, VPValue *V,
+ VPBasicBlock *IncomingBlock = nullptr);
- const MapVector<PHINode *, VPLiveOut *> &getLiveOuts() const {
+ const MapVector<std::pair<PHINode *, VPBasicBlock *>, VPLiveOut *> &
+ getLiveOuts() const {
return LiveOuts;
}
+ void removeLiveOut(PHINode *PN, VPBasicBlock *IncomingBlock = nullptr) {
+ auto Key = std::pair<PHINode *, VPBasicBlock *>(PN, IncomingBlock);
+ delete LiveOuts[Key];
+ LiveOuts.erase(Key);
+ }
+
VPValue *getSCEVExpansion(const SCEV *S) const {
return SCEVToExpansion.lookup(S);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9b1294f2c42822..1a9e8f41f79913 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -60,8 +60,10 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractFromEnd:
+ case VPInstruction::ExtractHighestActive:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
+ case VPInstruction::OrReduction:
case VPInstruction::PtrAdd:
return false;
default:
@@ -204,15 +206,20 @@ bool VPRecipeBase::mayHaveSideEffects() const {
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
VPValue *ExitValue = getOperand(0);
- VPBasicBlock *MiddleVPBB =
- cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
- VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
- auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
- // Values leaving the vector loop reach live out phi's in the exiting block
- // via middle block.
- auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
- ? MiddleVPBB
- : ExitingVPBB;
+ VPBasicBlock *PredVPBB = nullptr;
+ if (IncomingVPBB)
+ PredVPBB = IncomingVPBB;
+ else {
+ VPRecipeBase *ExitRecipe = ExitValue->getDefiningRecipe();
+ auto *ExitVPBB = ExitRecipe ? ExitRecipe->getParent() : nullptr;
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSuccessors()[0]);
+ // Values leaving the vector loop reach live out phi's in the exiting block
+ // via middle block.
+ PredVPBB =
+ !ExitVPBB || ExitVPBB->getEnclosingLoopRegion() ? MiddleVPBB : ExitVPBB;
+ }
+
BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
Value *V = State.get(ExitValue, VPLane(0));
if (Phi->getBasicBlockIndex(PredBB) != -1)
@@ -386,6 +393,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::BranchOnCount:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
+ case VPInstruction::OrReduction:
case VPInstruction::PtrAdd:
case VPInstruction::ExplicitVectorLength:
return true;
@@ -521,6 +529,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
return CondBr;
VPRegionBlock *ParentRegion = getParent()->getParent();
+ if (ParentRegion->getEarlyExiting() == getParent())
+ return CondBr;
+
VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
return CondBr;
@@ -613,6 +624,13 @@ Value *VPInstruction::generate(VPTransformState &State) {
return ReducedPartRdx;
}
+ case VPInstruction::ExtractHighestActive: {
+ Value *Vec = State.get(getOperand(0));
+ Value *Mask = State.get(getOperand(1));
+ Value *Ctz =
+ Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask);
+ return Builder.CreateExtractElement(Vec, Ctz);
+ }
case VPInstruction::ExtractFromEnd: {
auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
unsigned Offset = CI->getZExtValue();
@@ -662,7 +680,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
return NewPhi;
}
-
+ case VPInstruction::OrReduction: {
+ Value *Val = State.get(getOperand(0));
+ return Builder.CreateOrReduce(Val);
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -670,7 +691,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractFromEnd ||
- getOpcode() == VPInstruction::ComputeReductionResult;
+ getOpcode() == VPInstruction::ExtractHighestActive ||
+ getOpcode() == VPInstruction::ComputeReductionResult ||
+ getOpcode() == VPInstruction::OrReduction;
}
bool VPInstruction::isSingleScalar() const {
@@ -818,6 +841,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractFromEnd:
O << "extract-from-end";
break;
+ case VPInstruction::ExtractHighestActive:
+ O << "extract-highest-active";
+ break;
case VPInstruction::ComputeReductionResult:
O << "compute-reduction-result";
break;
@@ -827,6 +853,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::PtrAdd:
O << "ptradd";
break;
+ case VPInstruction::OrReduction:
+ O << "or reduction";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9ca14fc7812138..aadb9942bab4c4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -382,7 +382,8 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
continue;
auto *PredVPBB =
dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
- if (!PredVPBB || PredVPBB->getNumSuccessors() != 1)
+ if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
+ PredVPBB == Plan.getVectorLoopRegion()->getEarlyExiting())
continue;
WorkList.push_back(VPBB);
}
@@ -399,6 +400,8 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
VPBlockUtils::disconnectBlocks(VPBB, Succ);
VPBlockUtils::connectBlocks(PredVPBB, Succ);
}
+ if (ParentRegion && ParentRegion->getEarlyExiting() == VPBB)
+ ParentRegion->setEarlyExiting(PredVPBB);
delete VPBB;
}
return !WorkList.empty();
@@ -851,7 +854,6 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
/// Try to simplify recipe \p R.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
using namespace llvm::VPlanPatternMatch;
-
if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
// Try to remove redundant blend recipes.
SmallPtrSet<VPValue *, 4> UniqueValues;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 99bc4c38a3c3cd..5d5e2828da53d4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -244,13 +244,28 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
return false;
}
- VPBlockBase *MiddleBB =
- IRBB->getPlan()->getVectorLoopRegion()->getSingleSuccessor();
- if (IRBB != IRBB->getPlan()->getPreheader() &&
- IRBB->getSinglePredecessor() != MiddleBB) {
- errs() << "VPIRBasicBlock can only be used as pre-header or a successor of "
- "middle-block at the moment!\n";
- return false;
+ if (IRBB != IRBB->getPlan()->getPreheader()) {
+ const SmallVectorImpl<VPBlockBase *> &Succs =
+ IRBB->getPlan()->getVectorLoopRegion()->getSuccessors();
+
+ // First successor is always the middle block, and the middle block's first
+ // successor is always the exit block.
+    unsigned NumMatches = 0;
+    if (Succs[0]->getSuccessors()[0] == IRBB)
+      NumMatches++;
+
+ // The remaining successors should be vector early exits.
+ for (unsigned I = 1; I < Succs.size(); I++) {
+ if (Succs[I]->getSingleSuccessor() == IRBB)
+ NumMatches++;
+ }
+ if (!NumMatches) {
+ errs() << "VPIRBasicBlock can only be used as pre-header or an indirect "
+ "successor of "
+ "VPRegionBlock at the moment!\n";
+ return false;
+ }
}
return true;
}
@@ -269,7 +284,9 @@ static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) {
bool VPlanVerifier::verifyBlock(const VPBlockBase *VPB) {
auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
// Check block's condition bit.
- if (VPB->getNumSuccessors() > 1 ||
+ // NOTE: A VPRegionBlock can legally have multiple successors due to
+ // early exits from the loop.
+ if ((VPB->getNumSuccessors() > 1 && !isa<VPRegionBlock>(VPB)) ||
(VPBB && VPBB->getParent() && VPBB->isExiting() &&
!VPBB->getParent()->isReplicator())) {
if (!VPBB || !VPBB->getTerminator()) {
@@ -277,7 +294,7 @@ bool VPlanVerifier::verifyBlock(const VPBlockBase *VPB) {
"have a proper branch recipe!\n";
return false;
}
- } else {
+ } else if (!isa<VPRegionBlock>(VPB)) {
if (VPBB && VPBB->getTerminator()) {
errs() << "Unexpected branch recipe!\n";
return false;
@@ -293,6 +310,26 @@ bool VPlanVerifier::verifyBlock(const VPBlockBase *VPB) {
return false;
}
+  // If this is a loop region with multiple successors, it must have as many
+ // exiting blocks as successors, even if the original scalar loop only had a
+ // single exit block. That's because in the vector loop we always create a
+ // middle block for the vector latch exit, which is distinct from the early
+ // exit.
+ auto *VPRB = dyn_cast<VPRegionBlock>(VPB);
+ if (VPRB && VPRB->getNumExitingBlocks() != VPB->getNumSuccessors()) {
+ errs() << "Number of exiting blocks (" << VPRB->getNumExitingBlocks()
+ << ") does not match number of successors ("
+ << VPB->getNumSuccessors() << ")!\n";
+ return false;
+ }
+
for (const VPBlockBase *Succ : Successors) {
// There must be a bi-directional link between block and successor.
const auto &SuccPreds = Succ->getPredecessors();
diff --git a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll
index 49454ae18db79d..4657fa32415752 100644
--- a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll
@@ -1,7 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; REQUIRES: asserts
-; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize 2>%t | FileCheck %s --check-prefixes=CHECK
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -debug-only=loop-vectorize \
+; RUN: 2>%t | FileCheck %s --check-prefixes=CHECK,MAY_FAULT
; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -vectorizer-no-mem-fault \
+; RUN: | FileCheck %s --check-prefixes=CHECK,NO_FAULT
declare void @init_mem(ptr, i64);
@@ -9,29 +12,62 @@ define i64 @same_exit_block_pre_inc_use1() {
; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1'
; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63
; DEBUG-NEXT: LV: We can vectorize this loop!
-; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported.
; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
-; CHECK-NEXT: br label [[LAND_RHS:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[INDEX1]]
+; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP12]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
; CHECK: loop.inc:
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ]
-; CHECK-NEXT: ret i64 [[START_0_LCSSA]]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
%p1 = alloca [1024 x i8]
@@ -67,21 +103,55 @@ define i64 @same_exit_block_pre_inc_use1_gep_two_indices() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP8]], label [[LOOP_INC:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX1]]
+; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[LOOP_INC]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ], [ 67, [[LOOP_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -118,21 +188,55 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() {
; CHECK-NEXT: [[P2:%.*]] = alloca [40 x i32], align 4
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LAND_RHS:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_INC:%.*]], label [[LAND_RHS]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX1]]
+; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[FOR_INC]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ]
+; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX2]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[START_0_LCSSA]]
;
entry:
@@ -169,21 +273,50 @@ define i64 @same_exit_block_pre_inc_use2() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
+; CHECK-NEXT: br i1 [[TMP13]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> <i64 67, i64 67, i64 67, i64 67>, i64 [[TMP14]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP1]] ], [ [[INDEX2]], [[LOOP_INC1]] ], [ 66, [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -219,21 +352,58 @@ define i64 @same_exit_block_pre_inc_use3() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX1]]
+; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], [[INDEX1]]
+; CHECK-NEXT: [[IND_EARLY_ESCAPE5:%.*]] = add i64 3, [[TMP27]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[INDEX]], [[LOOP]] ]
+; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX2]], [[LOOP_INC1]] ], [ [[INDEX2]], [[LOOP1]] ], [ [[IND_EARLY_ESCAPE5]], [[VECTOR_EARLY_EXIT]] ], [ 66, [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[INDEX_LCSSA]]
;
entry:
@@ -271,19 +441,19 @@ define i64 @same_exit_block_pre_inc_use4() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i64], align 8
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
-; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX2]], [[LD1]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END:%.*]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -319,21 +489,55 @@ define i64 @same_exit_block_post_inc_use() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX1]]
+; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[INDEX_NEXT]], [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ [[INDEX_NEXT1]], [[LOOP_INC1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 67, [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -369,21 +573,57 @@ define i64 @same_exit_block_post_inc_use2() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP7]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
+; CHECK-NEXT: br i1 [[TMP15]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], [[INDEX1]]
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 1
+; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP21]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX_NEXT1]], [[LOOP1]] ], [ [[INDEX2]], [[LOOP_INC1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 66, [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -419,21 +659,50 @@ define i64 @same_exit_block_phi_of_consts() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
+; CHECK-NEXT: br i1 [[TMP13]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP12]], label [[LOOP_INC:%.*]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> zeroinitializer, i64 [[TMP14]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[LOOP_INC]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP1]] ], [ 1, [[LOOP_INC1]] ], [ 1, [[LOOP_INC]] ], [ [[TMP16]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -470,24 +739,58 @@ define i64 @diff_exit_block_pre_inc_use1() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP8]], label [[LOOP_INC:%.*]], label [[LOOP]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX1]]
+; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]]
+; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[LOOP_INC]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_EARLY_EXIT]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: loop.early.exit:
-; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ]
+; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL1]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ 67, [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ 67, [[LOOP_INC1]] ], [ 67, [[LOOP_INC]] ]
; CHECK-NEXT: ret i64 [[RETVAL2]]
;
entry:
@@ -527,24 +830,53 @@ define i64 @diff_exit_block_pre_inc_use2() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
+; CHECK-NEXT: br i1 [[TMP13]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> <i64 67, i64 67, i64 67, i64 67>, i64 [[TMP14]]
+; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_EARLY_EXIT]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: loop.early.exit:
-; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP]] ]
+; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP1]] ], [ [[TMP16]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL1]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX2]], [[LOOP_INC1]] ], [ 66, [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL2]]
;
entry:
@@ -584,24 +916,58 @@ define i64 @diff_exit_block_pre_inc_use3() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT5:%.*]], [[LOOP_INC4:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC4]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX2]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC4]]
+; CHECK: loop.inc4:
+; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX2]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT5]], 64
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX2]]
+; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]]
+; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX1]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX1]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_EARLY_EXIT]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX1]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: loop.early.exit:
-; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ]
+; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX1]], [[LOOP1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[INDEX_LCSSA]]
; CHECK: loop.end:
-; CHECK-NEXT: [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ]
+; CHECK-NEXT: [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX1]], [[LOOP_INC1]] ], [ 66, [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[INDEX_LCSSA1]]
;
entry:
@@ -640,19 +1006,46 @@ define i64 @diff_exit_block_phi_of_consts() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
+; CHECK-NEXT: br i1 [[TMP13]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_EARLY_EXIT]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK: loop.early.exit:
; CHECK-NEXT: ret i64 0
; CHECK: loop.end:
@@ -694,24 +1087,58 @@ define i64 @diff_exit_block_post_inc_use1() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX1]]
+; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]]
+; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_EARLY_EXIT]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK: loop.early.exit:
-; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ]
+; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL1]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT1]], [[LOOP_INC1]] ], [ 67, [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL2]]
;
entry:
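
When the exiting induction value is live outside the loop (the
IND_EARLY_ESCAPE values above), the vector.early.exit block recovers the first
exiting lane with llvm.experimental.cttz.elts and rebuilds the scalar IV from
it. A minimal sketch with illustrative names, assuming VF=4 and an IV starting
at 3 as in these tests:

  ; %mask is the per-lane exit condition computed in the vector body
  %first.lane = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %mask, i1 true)
  ; scalar IV of the exiting lane = start + (scalar iteration count + lane)
  %lane.off = add i64 %first.lane, %index
  %early.escape = add i64 3, %lane.off
  br label %loop.early.exit
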
@@ -752,24 +1179,60 @@ define i64 @diff_exit_block_post_inc_use2() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
+; CHECK-NEXT: br i1 [[TMP15]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], [[INDEX1]]
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 1
+; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP21]]
+; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_EARLY_EXIT]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP29:![0-9]+]]
; CHECK: loop.early.exit:
-; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ]
+; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT1]], [[LOOP1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL1]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX2]], [[LOOP_INC1]] ], [ 66, [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL2]]
;
entry:
@@ -867,24 +1330,57 @@ define i64 @multiple_exits_one_early() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP21:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: br i1 [[TMP21]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 60
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
+; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP25]], [[INDEX1]]
+; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP26]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 63, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64
-; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX2]], 64
+; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END]]
; CHECK: search:
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC1]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 128
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP31:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP]] ], [ [[INDEX]], [[SEARCH]] ], [ 128, [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP1]] ], [ [[INDEX2]], [[SEARCH]] ], [ 128, [[LOOP_INC1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -1040,22 +1536,57 @@ define i64 @same_exit_block_pre_inc_use_inv_cond(i1 %cond) {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[COND]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP21:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
+; CHECK-NEXT: br i1 [[TMP21]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK: loop.inc3:
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; CHECK-NEXT: br i1 [[TMP9]], label [[LOOP_INC:%.*]], label [[LOOP]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP7]], i1 true)
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP7]], i1 true)
+; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP25]], [[INDEX1]]
+; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP26]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[LOOP_INC]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
; CHECK-NEXT: [[CMP4:%.*]] = select i1 [[COND]], i1 [[CMP3]], i1 false
-; CHECK-NEXT: br i1 [[CMP4]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: br i1 [[CMP4]], label [[LOOP_INC1]], label [[LOOP_END]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP33:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ], [ 67, [[LOOP_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
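
For a loop-invariant early-exit condition (same_exit_block_pre_inc_use_inv_cond
above), the scalar select feeding the exit branch is widened into a select
between the per-lane compare and an all-false vector; when the invariant
condition is false, every negated lane is set and the early exit fires on the
first vector iteration, matching the scalar loop. Roughly, with illustrative
names:

  ; scalar: %cmp4 = select i1 %cond, i1 %cmp3, i1 false
  %vec.cmp = icmp eq <4 x i8> %ld1.v, %ld2.v
  %vec.cond = select i1 %cond, <4 x i1> %vec.cmp, <4 x i1> zeroinitializer
  %neg = xor <4 x i1> %vec.cond, <i1 true, i1 true, i1 true, i1 true>
  %any.exit = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %neg)
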
@@ -1096,20 +1627,20 @@ define i64 @loop_contains_safe_call() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
-; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX2]]
; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]])
; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ult float [[SQRT]], 3.000000e+00
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC1]], label [[LOOP_END:%.*]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -1152,7 +1683,7 @@ define i64 @loop_contains_unsafe_call() {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]]
; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[BAD_CALL:%.*]] = call i32 @foo(i32 [[LD1]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[BAD_CALL:%.*]] = call i32 @foo(i32 [[LD1]]) #[[ATTR3:[0-9]+]]
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[BAD_CALL]], 34
; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
; CHECK: loop.inc:
@@ -1199,20 +1730,20 @@ define i64 @loop_contains_safe_div() {
; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
-; CHECK-NEXT: br label [[LOOP1:%.*]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX2]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]]
; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[LD1]], 20000
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC1]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -1346,7 +1877,6 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit'
; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63
; DEBUG-NEXT: LV: We can vectorize this loop!
-; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported.
; CHECK-LABEL: define i64 @loop_contains_load_after_early_exit(
; CHECK-SAME: ptr align 8 dereferenceable(1024) [[P2:%.*]]) {
; CHECK-NEXT: entry:
@@ -1460,28 +1990,89 @@ loop.end:
define i64 @same_exit_block_pre_inc_use1_reverse() {
-; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
-; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
-; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
-; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 1023, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], -1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]]
-; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ]
-; CHECK-NEXT: ret i64 [[RETVAL]]
+; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() {
+; MAY_FAULT-NEXT: entry:
+; MAY_FAULT-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
+; MAY_FAULT-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
+; MAY_FAULT-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; MAY_FAULT-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; MAY_FAULT-NEXT: br label [[LOOP:%.*]]
+; MAY_FAULT: loop:
+; MAY_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 1023, [[ENTRY:%.*]] ]
+; MAY_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; MAY_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MAY_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; MAY_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; MAY_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; MAY_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; MAY_FAULT: loop.inc:
+; MAY_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], -1
+; MAY_FAULT-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; MAY_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]]
+; MAY_FAULT: loop.end:
+; MAY_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ]
+; MAY_FAULT-NEXT: ret i64 [[RETVAL]]
+;
+; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() {
+; NO_FAULT-NEXT: entry:
+; NO_FAULT-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
+; NO_FAULT-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
+; NO_FAULT-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; NO_FAULT-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; NO_FAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO_FAULT: vector.ph:
+; NO_FAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO_FAULT: vector.body:
+; NO_FAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT5:%.*]], [[LOOP_INC4:%.*]] ]
+; NO_FAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC4]] ]
+; NO_FAULT-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX1]]
+; NO_FAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; NO_FAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
+; NO_FAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; NO_FAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 -3
+; NO_FAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; NO_FAULT-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; NO_FAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]]
+; NO_FAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; NO_FAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 -3
+; NO_FAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
+; NO_FAULT-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD2]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; NO_FAULT-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[REVERSE]], [[REVERSE3]]
+; NO_FAULT-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
+; NO_FAULT-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]])
+; NO_FAULT-NEXT: br i1 [[TMP9]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC4]]
+; NO_FAULT: loop.inc4:
+; NO_FAULT-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX1]], 4
+; NO_FAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; NO_FAULT-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT5]], 1020
+; NO_FAULT-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
+; NO_FAULT: vector.early.exit:
+; NO_FAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP8]], i1 true)
+; NO_FAULT-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP11]]
+; NO_FAULT-NEXT: [[TMP13:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP8]], i1 true)
+; NO_FAULT-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], [[INDEX1]]
+; NO_FAULT-NEXT: [[IND_EARLY_ESCAPE:%.*]] = sub i64 1023, [[TMP14]]
+; NO_FAULT-NEXT: br label [[LOOP_END:%.*]]
+; NO_FAULT: middle.block:
+; NO_FAULT-NEXT: br i1 false, label [[LOOP_END]], label [[SCALAR_PH]]
+; NO_FAULT: scalar.ph:
+; NO_FAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; NO_FAULT-NEXT: br label [[LOOP:%.*]]
+; NO_FAULT: loop:
+; NO_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; NO_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; NO_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; NO_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; NO_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; NO_FAULT: loop.inc:
+; NO_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], -1
+; NO_FAULT-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; NO_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]], !llvm.loop [[LOOP35:![0-9]+]]
+; NO_FAULT: loop.end:
+; NO_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ], [ 1024, [[MIDDLE_BLOCK]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; NO_FAULT-NEXT: ret i64 [[RETVAL]]
;
entry:
%p1 = alloca [1024 x i8]
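
The reverse-iteration test above follows the same recipe, except that the wide
loads are reversed with a shufflevector and the escape index counts down from
the start value. Sketched with illustrative names (IV starting at 1023, as in
the NO_FAULT output above):

  %rev = shufflevector <4 x i8> %wide.load, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %first.lane = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> %mask, i1 true)
  %lane.off = add i64 %first.lane, %index
  ; decreasing IV: subtract the scalar offset from the start value
  %early.escape = sub i64 1023, %lane.off
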
@@ -1573,25 +2164,113 @@ loop.end:
define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p1, ptr dereferenceable(1024) %p2) {
-; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs(
-; CHECK-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
-; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
-; CHECK-NEXT: ret i64 [[RETVAL]]
+; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs(
+; MAY_FAULT-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) {
+; MAY_FAULT-NEXT: entry:
+; MAY_FAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; MAY_FAULT: vector.ph:
+; MAY_FAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; MAY_FAULT: vector.body:
+; MAY_FAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; MAY_FAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; MAY_FAULT-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; MAY_FAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; MAY_FAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
+; MAY_FAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; MAY_FAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; MAY_FAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]]
+; MAY_FAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; MAY_FAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; MAY_FAULT-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; MAY_FAULT-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; MAY_FAULT-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; MAY_FAULT-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; MAY_FAULT: loop.inc3:
+; MAY_FAULT-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; MAY_FAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; MAY_FAULT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; MAY_FAULT-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
+; MAY_FAULT: vector.early.exit:
+; MAY_FAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; MAY_FAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]]
+; MAY_FAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; MAY_FAULT-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[INDEX1]]
+; MAY_FAULT-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP12]]
+; MAY_FAULT-NEXT: br label [[LOOP_END:%.*]]
+; MAY_FAULT: middle.block:
+; MAY_FAULT-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; MAY_FAULT: scalar.ph:
+; MAY_FAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; MAY_FAULT-NEXT: br label [[LOOP:%.*]]
+; MAY_FAULT: loop:
+; MAY_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; MAY_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; MAY_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MAY_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; MAY_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; MAY_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; MAY_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; MAY_FAULT: loop.inc:
+; MAY_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; MAY_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; MAY_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP35:![0-9]+]]
+; MAY_FAULT: loop.end:
+; MAY_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; MAY_FAULT-NEXT: ret i64 [[RETVAL]]
+;
+; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs(
+; NO_FAULT-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) {
+; NO_FAULT-NEXT: entry:
+; NO_FAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO_FAULT: vector.ph:
+; NO_FAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO_FAULT: vector.body:
+; NO_FAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; NO_FAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; NO_FAULT-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; NO_FAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; NO_FAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
+; NO_FAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; NO_FAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; NO_FAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]]
+; NO_FAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; NO_FAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; NO_FAULT-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; NO_FAULT-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; NO_FAULT-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; NO_FAULT-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; NO_FAULT: loop.inc3:
+; NO_FAULT-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; NO_FAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; NO_FAULT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; NO_FAULT-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; NO_FAULT: vector.early.exit:
+; NO_FAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; NO_FAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]]
+; NO_FAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; NO_FAULT-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[INDEX1]]
+; NO_FAULT-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP12]]
+; NO_FAULT-NEXT: br label [[LOOP_END:%.*]]
+; NO_FAULT: middle.block:
+; NO_FAULT-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; NO_FAULT: scalar.ph:
+; NO_FAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; NO_FAULT-NEXT: br label [[LOOP:%.*]]
+; NO_FAULT: loop:
+; NO_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; NO_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; NO_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; NO_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; NO_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; NO_FAULT: loop.inc:
+; NO_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; NO_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; NO_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP37:![0-9]+]]
+; NO_FAULT: loop.end:
+; NO_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; NO_FAULT-NEXT: ret i64 [[RETVAL]]
;
entry:
br label %loop
@@ -1621,7 +2300,6 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) {
; DEBUG-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check'
; DEBUG: Found an early exit loop with symbolic max backedge taken count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32)))<nsw>
; DEBUG-NEXT: LV: We can vectorize this loop!
-; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported.
; CHECK-LABEL: define i32 @diff_exit_block_needs_scev_check(
; CHECK-SAME: i32 [[END:%.*]]) {
; CHECK-NEXT: entry:
@@ -1691,7 +2369,7 @@ declare void @abort()
define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) {
; DEBUG-LABEL: LV: Checking a loop in 'diff_blocks_invariant_early_exit_cond'
; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 275
-; DEBUG: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported.
+; DEBUG-NEXT: LV: We can vectorize this loop!
; CHECK-LABEL: define i32 @diff_blocks_invariant_early_exit_cond(
; CHECK-SAME: ptr [[S:%.*]]) {
; CHECK-NEXT: entry:
@@ -1791,28 +2469,85 @@ loop.end:
define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_allocas'
; DEBUG: LV: Not vectorizing: Loop may fault.
-; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[P1:%.*]] = alloca [42 x i8], align 1
-; CHECK-NEXT: [[P2:%.*]] = alloca [42 x i8], align 1
-; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
-; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
-; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
-; CHECK-NEXT: ret i64 [[RETVAL]]
+; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
+; MAY_FAULT-NEXT: entry:
+; MAY_FAULT-NEXT: [[P1:%.*]] = alloca [42 x i8], align 1
+; MAY_FAULT-NEXT: [[P2:%.*]] = alloca [42 x i8], align 1
+; MAY_FAULT-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; MAY_FAULT-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; MAY_FAULT-NEXT: br label [[LOOP:%.*]]
+; MAY_FAULT: loop:
+; MAY_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; MAY_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; MAY_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MAY_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; MAY_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; MAY_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; MAY_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; MAY_FAULT: loop.inc:
+; MAY_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; MAY_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; MAY_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; MAY_FAULT: loop.end:
+; MAY_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
+; MAY_FAULT-NEXT: ret i64 [[RETVAL]]
+;
+; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
+; NO_FAULT-NEXT: entry:
+; NO_FAULT-NEXT: [[P1:%.*]] = alloca [42 x i8], align 1
+; NO_FAULT-NEXT: [[P2:%.*]] = alloca [42 x i8], align 1
+; NO_FAULT-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; NO_FAULT-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; NO_FAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO_FAULT: vector.ph:
+; NO_FAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO_FAULT: vector.body:
+; NO_FAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; NO_FAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; NO_FAULT-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; NO_FAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; NO_FAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
+; NO_FAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; NO_FAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; NO_FAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]]
+; NO_FAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; NO_FAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; NO_FAULT-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; NO_FAULT-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; NO_FAULT-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; NO_FAULT-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; NO_FAULT: loop.inc3:
+; NO_FAULT-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; NO_FAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; NO_FAULT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; NO_FAULT-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; NO_FAULT: vector.early.exit:
+; NO_FAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; NO_FAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]]
+; NO_FAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; NO_FAULT-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[INDEX1]]
+; NO_FAULT-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP12]]
+; NO_FAULT-NEXT: br label [[LOOP_END:%.*]]
+; NO_FAULT: middle.block:
+; NO_FAULT-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; NO_FAULT: scalar.ph:
+; NO_FAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; NO_FAULT-NEXT: br label [[LOOP:%.*]]
+; NO_FAULT: loop:
+; NO_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; NO_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; NO_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; NO_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; NO_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; NO_FAULT: loop.inc:
+; NO_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; NO_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; NO_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP39:![0-9]+]]
+; NO_FAULT: loop.end:
+; NO_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; NO_FAULT-NEXT: ret i64 [[RETVAL]]
;
entry:
%p1 = alloca [42 x i8]
@@ -1842,25 +2577,79 @@ loop.end:
define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) {
-; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(
-; CHECK-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
-; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
-; CHECK-NEXT: ret i64 [[RETVAL]]
+; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(
+; MAY_FAULT-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) {
+; MAY_FAULT-NEXT: entry:
+; MAY_FAULT-NEXT: br label [[LOOP:%.*]]
+; MAY_FAULT: loop:
+; MAY_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; MAY_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; MAY_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MAY_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; MAY_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; MAY_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; MAY_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; MAY_FAULT: loop.inc:
+; MAY_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; MAY_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; MAY_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; MAY_FAULT: loop.end:
+; MAY_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
+; MAY_FAULT-NEXT: ret i64 [[RETVAL]]
+;
+; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(
+; NO_FAULT-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) {
+; NO_FAULT-NEXT: entry:
+; NO_FAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO_FAULT: vector.ph:
+; NO_FAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO_FAULT: vector.body:
+; NO_FAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; NO_FAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; NO_FAULT-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; NO_FAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; NO_FAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
+; NO_FAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; NO_FAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; NO_FAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]]
+; NO_FAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; NO_FAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; NO_FAULT-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; NO_FAULT-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; NO_FAULT-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; NO_FAULT-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; NO_FAULT: loop.inc3:
+; NO_FAULT-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; NO_FAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; NO_FAULT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; NO_FAULT-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; NO_FAULT: vector.early.exit:
+; NO_FAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; NO_FAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]]
+; NO_FAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; NO_FAULT-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[INDEX1]]
+; NO_FAULT-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP12]]
+; NO_FAULT-NEXT: br label [[LOOP_END:%.*]]
+; NO_FAULT: middle.block:
+; NO_FAULT-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; NO_FAULT: scalar.ph:
+; NO_FAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; NO_FAULT-NEXT: br label [[LOOP:%.*]]
+; NO_FAULT: loop:
+; NO_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; NO_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; NO_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; NO_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; NO_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; NO_FAULT: loop.inc:
+; NO_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; NO_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; NO_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP41:![0-9]+]]
+; NO_FAULT: loop.end:
+; NO_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; NO_FAULT-NEXT: ret i64 [[RETVAL]]
;
entry:
br label %loop
@@ -1886,25 +2675,79 @@ loop.end:
define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) {
-; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(
-; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
-; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
-; CHECK-NEXT: ret i64 [[RETVAL]]
+; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(
+; MAY_FAULT-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) {
+; MAY_FAULT-NEXT: entry:
+; MAY_FAULT-NEXT: br label [[LOOP:%.*]]
+; MAY_FAULT: loop:
+; MAY_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; MAY_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; MAY_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MAY_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; MAY_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; MAY_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; MAY_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; MAY_FAULT: loop.inc:
+; MAY_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; MAY_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; MAY_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; MAY_FAULT: loop.end:
+; MAY_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
+; MAY_FAULT-NEXT: ret i64 [[RETVAL]]
+;
+; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(
+; NO_FAULT-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) {
+; NO_FAULT-NEXT: entry:
+; NO_FAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO_FAULT: vector.ph:
+; NO_FAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO_FAULT: vector.body:
+; NO_FAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; NO_FAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ]
+; NO_FAULT-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; NO_FAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; NO_FAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
+; NO_FAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; NO_FAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; NO_FAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]]
+; NO_FAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; NO_FAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; NO_FAULT-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; NO_FAULT-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; NO_FAULT-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; NO_FAULT-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; NO_FAULT: loop.inc3:
+; NO_FAULT-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; NO_FAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; NO_FAULT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64
+; NO_FAULT-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; NO_FAULT: vector.early.exit:
+; NO_FAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; NO_FAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]]
+; NO_FAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; NO_FAULT-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[INDEX1]]
+; NO_FAULT-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP12]]
+; NO_FAULT-NEXT: br label [[LOOP_END:%.*]]
+; NO_FAULT: middle.block:
+; NO_FAULT-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; NO_FAULT: scalar.ph:
+; NO_FAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; NO_FAULT-NEXT: br label [[LOOP:%.*]]
+; NO_FAULT: loop:
+; NO_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; NO_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; NO_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; NO_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; NO_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; NO_FAULT: loop.inc:
+; NO_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; NO_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; NO_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP43:![0-9]+]]
+; NO_FAULT: loop.end:
+; NO_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; NO_FAULT-NEXT: ret i64 [[RETVAL]]
;
entry:
br label %loop
@@ -1934,3 +2777,87 @@ declare i32 @foo(i32) readonly
declare <vscale x 4 x i32> @foo_vec(<vscale x 4 x i32>)
attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" }
+
+;.
+; MAY_FAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; MAY_FAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; MAY_FAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; MAY_FAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]}
+;.
+; NO_FAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO_FAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO_FAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO_FAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP36]] = distinct !{[[LOOP36]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP37]] = distinct !{[[LOOP37]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP38]] = distinct !{[[LOOP38]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP39]] = distinct !{[[LOOP39]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP40]] = distinct !{[[LOOP40]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP41]] = distinct !{[[LOOP41]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP42]] = distinct !{[[LOOP42]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP43]] = distinct !{[[LOOP43]], [[META2]], [[META1]]}
+;.
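
For readers following the NO_FAULT checks above: the vector.early.exit block recovers the scalar loop's result by adding the index of the first set lane in the exit mask (computed with llvm.experimental.cttz.elts) to the base induction value. The C++ sketch below models those semantics for same_exit_block_pre_inc_use1_unknown_ptrs. It is an illustrative model only, not code from the patch: firstMismatch is a made-up name, and __builtin_ctz stands in for the intrinsic.

#include <cstdint>

static int64_t firstMismatch(const uint8_t *p1, const uint8_t *p2) {
  const int64_t VF = 4;
  // The vector loop covers indices [3, 67) in groups of VF; the scalar
  // epilogue is dead here because the 64 iterations divide evenly by VF.
  for (int64_t base = 3; base + VF <= 67; base += VF) {
    unsigned mask = 0;
    for (int64_t lane = 0; lane < VF; ++lane)  // icmp eq + xor in the IR
      if (p1[base + lane] != p2[base + lane])
        mask |= 1u << lane;
    if (mask)                              // vector.reduce.or taken branch
      return base + __builtin_ctz(mask);   // cttz.elts -> IND_EARLY_ESCAPE
  }
  return 67;                               // middle.block: no mismatch found
}
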
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
index 9958d6ea124f81..28ae078d5cb987 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
@@ -8,6 +8,7 @@
#include "../lib/Transforms/Vectorize/VPlanVerifier.h"
#include "../lib/Transforms/Vectorize/VPlan.h"
+#include "../lib/Transforms/Vectorize/VPlanTransforms.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "gtest/gtest.h"
@@ -28,6 +29,10 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
VPBasicBlock *VPBB2 = new VPBasicBlock();
VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
VPBlockUtils::connectBlocks(VPBB1, R1);
+
+ VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block");
+ VPBlockUtils::connectBlocks(R1, VPMIDDLE);
+
VPlan Plan(VPPH, &*TC, VPBB1);
#if GTEST_HAS_STREAM_REDIRECTION
@@ -57,6 +62,10 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
VPBB2->appendRecipe(BranchOnCond);
VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
+
+ VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block");
+ VPBlockUtils::connectBlocks(R1, VPMIDDLE);
+
VPBlockUtils::connectBlocks(VPBB1, R1);
auto TC = std::make_unique<VPValue>();
@@ -102,6 +111,9 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
VPBlockUtils::connectBlocks(VPBB1, R1);
VPBB3->setParent(R1);
+ VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block");
+ VPBlockUtils::connectBlocks(R1, VPMIDDLE);
+
auto TC = std::make_unique<VPValue>();
VPlan Plan(VPPH, &*TC, VPBB1);
@@ -138,6 +150,9 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
VPBlockUtils::connectBlocks(VPBB1, R1);
VPBlockUtils::connectBlocks(VPBB1, R1);
+ VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block");
+ VPBlockUtils::connectBlocks(R1, VPMIDDLE);
+
auto TC = std::make_unique<VPValue>();
VPlan Plan(VPPH, &*TC, VPBB1);
@@ -175,6 +190,9 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
VPBlockUtils::connectBlocks(VPBB1, R1);
VPBB3->setParent(R1);
+ VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block");
+ VPBlockUtils::connectBlocks(R1, VPMIDDLE);
+
auto TC = std::make_unique<VPValue>();
VPlan Plan(VPPH, &*TC, VPBB1);
@@ -204,6 +222,9 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) {
VPBlockUtils::connectBlocks(VPBB1, R1);
VPBB1->setParent(R1);
+ VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block");
+ VPBlockUtils::connectBlocks(R1, VPMIDDLE);
+
auto TC = std::make_unique<VPValue>();
VPlan Plan(VPPH, &*TC, VPBB1);
@@ -217,4 +238,120 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) {
#endif
}
+TEST(VPVerifierTest, LoopRegionMultipleSuccessors1) {
+ VPInstruction *TC = new VPInstruction(Instruction::Add, {});
+ VPBasicBlock *VPBBPH = new VPBasicBlock("preheader");
+ VPBBPH->appendRecipe(TC);
+
+ VPInstruction *TC2 = new VPInstruction(Instruction::Add, {});
+ VPBasicBlock *VPBBENTRY = new VPBasicBlock("entry");
+ VPBBENTRY->appendRecipe(TC2);
+
+ // Create a VPCanonicalIVPHIRecipe to place in the region's header block.
+ auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(TC2, {});
+ VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
+ VPInstruction *I2 = new VPInstruction(Instruction::Sub, {I1});
+ VPInstruction *I3 = new VPInstruction(VPInstruction::BranchOnCond, {I1});
+
+ VPBasicBlock *RBB1 = new VPBasicBlock();
+ RBB1->appendRecipe(CanonicalIVPHI);
+ RBB1->appendRecipe(I1);
+ RBB1->appendRecipe(I2);
+ RBB1->appendRecipe(I3);
+ RBB1->setName("bb1");
+
+ VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1});
+ VPInstruction *I5 = new VPInstruction(VPInstruction::BranchOnCond, {I4});
+ VPBasicBlock *RBB2 = new VPBasicBlock();
+ RBB2->appendRecipe(I4);
+ RBB2->appendRecipe(I5);
+ RBB2->setName("bb2");
+
+ VPRegionBlock *R1 = new VPRegionBlock(RBB1, RBB2, "R1");
+ VPBlockUtils::connectBlocks(RBB1, RBB2);
+ VPBlockUtils::connectBlocks(VPBBENTRY, R1);
+
+ VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block");
+ VPBasicBlock *VPEARLYEXIT = new VPBasicBlock("early.exit");
+ VPBlockUtils::connectBlocks(R1, VPMIDDLE);
+ VPBlockUtils::connectBlocks(R1, VPEARLYEXIT);
+
+ VPlan Plan(VPBBPH, TC, VPBBENTRY);
+ Plan.setName("TestPlan");
+ Plan.addVF(ElementCount::getFixed(4));
+ Plan.getVectorLoopRegion()->setExiting(RBB2);
+ Plan.getVectorLoopRegion()->setEarlyExiting(RBB1);
+ Plan.getVectorLoopRegion()->setEarlyExit(VPEARLYEXIT);
+
+ EXPECT_TRUE(verifyVPlanIsValid(Plan));
+}
+
+TEST(VPVerifierTest, LoopRegionMultipleSuccessors2) {
+ VPInstruction *TC = new VPInstruction(Instruction::Add, {});
+ VPBasicBlock *VPBBPH = new VPBasicBlock("preheader");
+ VPBBPH->appendRecipe(TC);
+
+ VPInstruction *TC2 = new VPInstruction(Instruction::Add, {});
+ VPBasicBlock *VPBBENTRY = new VPBasicBlock("entry");
+ VPBBENTRY->appendRecipe(TC2);
+
+ // We can't create a live-in without a VPlan, and we can't create a
+ // VPlan without the blocks, so initialize the start value to a
+ // placeholder here and fix it up once the VPlan exists (see below).
+ auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(TC2, {});
+ VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
+ VPInstruction *I2 = new VPInstruction(Instruction::Sub, {I1});
+ VPInstruction *I3 = new VPInstruction(VPInstruction::BranchOnCond, {I1});
+
+ VPBasicBlock *RBB1 = new VPBasicBlock();
+ RBB1->appendRecipe(CanonicalIVPHI);
+ RBB1->appendRecipe(I1);
+ RBB1->appendRecipe(I2);
+ RBB1->appendRecipe(I3);
+ RBB1->setName("vector.body");
+
+ // This really is what the VPlan CFG looks like before optimising!
+ // A block that inherits the latch block's name from the original scalar
+ // loop.
+ VPBasicBlock *RBB2 = new VPBasicBlock();
+ RBB2->setName("loop.inc");
+
+ // A block that is deliberately left unnamed.
+ VPBasicBlock *RBB3 = new VPBasicBlock();
+
+ VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1});
+ VPInstruction *I5 = new VPInstruction(VPInstruction::BranchOnCond, {I4});
+ VPBasicBlock *RBB4 = new VPBasicBlock();
+ RBB4->appendRecipe(I4);
+ RBB4->appendRecipe(I5);
+ RBB4->setName("vector.latch");
+
+ VPRegionBlock *R1 = new VPRegionBlock(RBB1, RBB4, "R1");
+ VPBlockUtils::insertBlockAfter(RBB2, RBB1);
+ VPBlockUtils::insertBlockAfter(RBB3, RBB2);
+ VPBlockUtils::insertBlockAfter(RBB4, RBB3);
+ VPBlockUtils::connectBlocks(VPBBENTRY, R1);
+
+ VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block");
+ VPBasicBlock *VPEARLYEXIT = new VPBasicBlock("early.exit");
+ VPBlockUtils::connectBlocks(R1, VPMIDDLE);
+ VPBlockUtils::connectBlocks(R1, VPEARLYEXIT);
+
+ VPlan Plan(VPBBPH, TC, VPBBENTRY);
+ Plan.setName("TestPlan");
+ Plan.addVF(ElementCount::getFixed(4));
+ Plan.getVectorLoopRegion()->setExiting(RBB4);
+ Plan.getVectorLoopRegion()->setEarlyExiting(RBB1);
+ Plan.getVectorLoopRegion()->setEarlyExit(VPEARLYEXIT);
+
+ // Update the VPCanonicalIVPHIRecipe to have a live-in IR value.
+ LLVMContext C;
+ IntegerType *Int32 = IntegerType::get(C, 32);
+ Value *StartIV = PoisonValue::get(Int32);
+ CanonicalIVPHI->setStartValue(Plan.getOrAddLiveIn(StartIV));
+
+ EXPECT_TRUE(verifyVPlanIsValid(Plan));
+
+ VPlanTransforms::optimize(Plan);
+}
+
} // namespace
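
Both new tests wire the loop region to two successors and then record the early exit on the region. A minimal sketch of that shared wiring, assuming the setExiting/setEarlyExiting/setEarlyExit accessors behave as used in the tests above; the helper name wireEarlyExit is hypothetical and the declarations come from VPlan.h:

static void wireEarlyExit(VPRegionBlock *R, VPBasicBlock *Exiting,
                          VPBasicBlock *EarlyExiting) {
  // Give the region two successors: the usual middle block plus a
  // dedicated block for the early exit.
  VPBasicBlock *Middle = new VPBasicBlock("middle.block");
  VPBasicBlock *EarlyExit = new VPBasicBlock("early.exit");
  VPBlockUtils::connectBlocks(R, Middle);
  VPBlockUtils::connectBlocks(R, EarlyExit);
  // Record which block exits via the latch and which exits early.
  R->setExiting(Exiting);
  R->setEarlyExiting(EarlyExiting);
  R->setEarlyExit(EarlyExit);
}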