[llvm-branch-commits] [llvm] 638a188 - [LV] Generate RT checks up-front and remove them if required.
Florian Hahn via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Sep 13 10:48:56 PDT 2020
Author: Florian Hahn
Date: 2020-09-13T18:47:56+01:00
New Revision: 638a188b6a0262fe26ad62353d71cdd384c40bd9
URL: https://github.com/llvm/llvm-project/commit/638a188b6a0262fe26ad62353d71cdd384c40bd9
DIFF: https://github.com/llvm/llvm-project/commit/638a188b6a0262fe26ad62353d71cdd384c40bd9.diff
LOG: [LV] Generate RT checks up-front and remove them if required.

This patch creates the runtime checks needed for vectorization (the SCEV
predicate checks and the memory overlap checks) up-front, in a temporary
block that is un-linked from the IR, so their cost is visible when deciding
whether to vectorize. If vectorization goes ahead, the block is spliced
back into the CFG; otherwise the block and all expanded instructions are
removed again.

Differential Revision: https://reviews.llvm.org/D75980
Added:
llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll
Modified:
llvm/include/llvm/Transforms/Utils/LoopUtils.h
llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
llvm/lib/Transforms/Utils/LoopUtils.cpp
llvm/lib/Transforms/Utils/LoopVersioning.cpp
llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll
llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
Removed:
################################################################################
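At a glance, the new lifecycle, condensed from the LoopVectorize.cpp hunks
below (an illustrative sketch reusing the names and signatures from the
diff, not extra code landed by the patch):

    // Runtime checks are expanded eagerly into a temporary block that is
    // un-linked from the IR, so their cost is known before committing to
    // vectorization.
    GeneratedRTChecks Checks(L->getLoopPreheader(), *PSE.getSE(), DT);
    Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate(), LI);

    // If a plan is executed, emitSCEVChecks/emitMemRuntimeChecks splice the
    // temporary block back into the CFG in front of the vector preheader.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI, Checks);
    LVP.executePlan(LB, DT);

    // Otherwise, ~GeneratedRTChecks() drops the unused checks: it replaces
    // their results with 'false', erases the expanded instructions via
    // SCEVExpanderCleaner and deletes the temporary block.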
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 70c8c84c857b..bb72c19f8532 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -35,6 +35,7 @@ class MemorySSAUpdater;
class OptimizationRemarkEmitter;
class PredIteratorCache;
class ScalarEvolution;
+class ScalarEvolutionExpander;
class SCEV;
class SCEVExpander;
class TargetLibraryInfo;
@@ -446,7 +447,7 @@ Loop *cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
std::pair<Instruction *, Instruction *>
addRuntimeChecks(Instruction *Loc, Loop *TheLoop,
const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
- ScalarEvolution *SE);
+ SCEVExpander &Expander);
} // end namespace llvm
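With this signature change, callers construct and own the SCEVExpander, so
the expander's instruction cache and cleanup outlive a single
addRuntimeChecks call. The call-site pattern, condensed from the
LoopVersioning.cpp hunk further down:

    SCEVExpander Exp(*RtPtrChecking.getSE(),
                     VersionedLoop->getHeader()->getModule()->getDataLayout(),
                     "induction");
    std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks(
        RuntimeCheckBB->getTerminator(), VersionedLoop, AliasChecks, Exp);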
diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
index 77360cb2671d..829bdcbb2588 100644
--- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
+++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
@@ -199,6 +200,8 @@ class SCEVExpander : public SCEVVisitor<SCEVExpander, Value *> {
ChainedPhis.clear();
}
+ ScalarEvolution *getSE() { return &SE; }
+
/// Return a vector containing all instructions inserted during expansion.
SmallVector<Instruction *, 32> getAllInsertedInstructions() const {
SmallVector<Instruction *, 32> Result;
@@ -509,10 +512,12 @@ class SCEVExpanderCleaner {
SCEVExpanderCleaner(SCEVExpander &Expander, DominatorTree &DT)
: Expander(Expander), DT(DT), ResultUsed(false) {}
- ~SCEVExpanderCleaner();
+ ~SCEVExpanderCleaner() { cleanup(); }
/// Indicate that the result of the expansion is used.
void markResultUsed() { ResultUsed = true; }
+
+ void cleanup();
};
} // namespace llvm
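Exposing cleanup() keeps the destructor's RAII behaviour while letting
owners such as GeneratedRTChecks (added below in LoopVectorize.cpp) trigger
the cleanup at a point of their choosing, before the block holding the
expanded code is erased. A minimal usage sketch, assuming SE, DL, DT, Pred
and InsertPt are in scope and CheckIsNeeded stands for the caller's
decision:

    SCEVExpander Exp(SE, DL, "scev.check");
    SCEVExpanderCleaner Cleaner(Exp, DT);
    Value *Check = Exp.expandCodeForPredicate(&Pred, InsertPt);
    if (CheckIsNeeded)
      Cleaner.markResultUsed(); // keep the expanded instructions
    // Otherwise ~SCEVExpanderCleaner() calls cleanup(), which erases all
    // instructions the expander inserted.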
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index d7cd9b19b8d5..dd808619f67a 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1574,7 +1574,8 @@ struct PointerBounds {
/// in \p TheLoop. \return the values for the bounds.
static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG,
Loop *TheLoop, Instruction *Loc,
- SCEVExpander &Exp, ScalarEvolution *SE) {
+ SCEVExpander &Exp) {
+ ScalarEvolution *SE = Exp.getSE();
// TODO: Add helper to retrieve pointers to CG.
Value *Ptr = CG->RtCheck.Pointers[CG->Members[0]].PointerValue;
const SCEV *Sc = SE->getSCEV(Ptr);
@@ -1613,16 +1614,15 @@ static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG,
/// lower bounds for both pointers in the check.
static SmallVector<std::pair<PointerBounds, PointerBounds>, 4>
expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L,
- Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp) {
+ Instruction *Loc, SCEVExpander &Exp) {
SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds;
// Here we're relying on the SCEV Expander's cache to only emit code for the
// same bounds once.
transform(PointerChecks, std::back_inserter(ChecksWithBounds),
[&](const RuntimePointerCheck &Check) {
- PointerBounds First = expandBounds(Check.first, L, Loc, Exp, SE),
- Second =
- expandBounds(Check.second, L, Loc, Exp, SE);
+ PointerBounds First = expandBounds(Check.first, L, Loc, Exp),
+ Second = expandBounds(Check.second, L, Loc, Exp);
return std::make_pair(First, Second);
});
@@ -1632,12 +1632,10 @@ expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L,
std::pair<Instruction *, Instruction *> llvm::addRuntimeChecks(
Instruction *Loc, Loop *TheLoop,
const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
- ScalarEvolution *SE) {
+ SCEVExpander &Exp) {
// TODO: Move noalias annotation code from LoopVersioning here and share with LV if possible.
// TODO: Pass RtPtrChecking instead of PointerChecks and SE separately, if possible
- const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
- SCEVExpander Exp(*SE, DL, "induction");
- auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, SE, Exp);
+ auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, Exp);
LLVMContext &Ctx = Loc->getContext();
Instruction *FirstInst = nullptr;
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index b4925064bc6b..d2c9def03785 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -68,9 +68,12 @@ void LoopVersioning::versionLoop(
// Add the memcheck in the original preheader (this is empty initially).
BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader();
const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
- std::tie(FirstCheckInst, MemRuntimeCheck) =
- addRuntimeChecks(RuntimeCheckBB->getTerminator(), VersionedLoop,
- AliasChecks, RtPtrChecking.getSE());
+
+ SCEVExpander Exp2(*RtPtrChecking.getSE(),
+ VersionedLoop->getHeader()->getModule()->getDataLayout(),
+ "induction");
+ std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks(
+ RuntimeCheckBB->getTerminator(), VersionedLoop, AliasChecks, Exp2);
const SCEVUnionPredicate &Pred = LAI.getPSE().getUnionPredicate();
SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(),
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 165030c6d2f1..1d7e7882d7ea 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -2663,7 +2663,7 @@ bool isSafeToExpandAt(const SCEV *S, const Instruction *InsertionPoint,
return false;
}
-SCEVExpanderCleaner::~SCEVExpanderCleaner() {
+void SCEVExpanderCleaner::cleanup() {
// Result is used, nothing to remove.
if (ResultUsed)
return;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 545540efc284..f9a0e6f35f50 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -411,9 +411,8 @@ static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
return None;
}
-
+struct GeneratedRTChecks;
namespace llvm {
-
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -437,12 +436,12 @@ class InnerLoopVectorizer {
OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI)
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()),
VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
- BFI(BFI), PSI(PSI) {
+ BFI(BFI), PSI(PSI), Check(Check) {
// Query this against the original loop and save it here because the profile
// of the original loop header may change as the transformation happens.
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
@@ -855,6 +854,10 @@ class InnerLoopVectorizer {
// Whether this loop should be optimized for size based on profile guided size
// optimizations.
bool OptForSizeBasedOnProfile;
+
+ /// Structure to hold information about generated runtime checks, responsible
+ /// for removing the checks if they turn out to be unprofitable.
+ GeneratedRTChecks &Check;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -866,10 +869,9 @@ class InnerLoopUnroller : public InnerLoopVectorizer {
OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI)
- : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- ElementCount::getFixed(1), UnrollFactor, LVL, CM,
- BFI, PSI) {}
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
+ : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
+ UnrollFactor, LVL, CM, BFI, PSI, Check) {}
private:
Value *getBroadcastInstrs(Value *V) override;
@@ -1645,9 +1647,99 @@ class LoopVectorizationCostModel {
/// Values to ignore in the cost model when VF > 1.
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};
-
} // end namespace llvm
+/// Helper struct to manage generating runtime checks for vectorization.
+///
+/// The runtime checks are created up-front in a temporary block, un-linked from
+/// the existing IR, to allow better estimation of their cost. After deciding to
+/// vectorize, the checks are moved back into the IR. If deciding not to
+/// vectorize, the temporary block is removed completely.
+struct GeneratedRTChecks {
+ BasicBlock *TmpBlock = nullptr;
+ BasicBlock *Preheader;
+ Value *SCEVCheck;
+ Instruction *FirstCheckInst = nullptr;
+ Instruction *MemRuntimeCheck = nullptr;
+
+ ScalarEvolution &SE;
+ DominatorTree *DT;
+
+ SCEVExpander Exp;
+ SCEVExpanderCleaner Cleaner;
+
+ GeneratedRTChecks(BasicBlock *Preheader, ScalarEvolution &SE,
+ DominatorTree *DT)
+ : Preheader(Preheader), SE(SE), DT(DT),
+ Exp(SE, Preheader->getModule()->getDataLayout(), "scev.check"),
+ Cleaner(Exp, *DT) {}
+
+ /// Generate runtime checks in a temporary block (TmpBlock), so we can
+ /// accurately estimate the cost of the runtime checks. The block is un-linked
+ /// from the IR and is added back during vector code generation. If there is
+ /// no vector code generation, the check block is removed completely.
+ void Create(Loop *L, const LoopAccessInfo &LAI,
+ const SCEVUnionPredicate &UnionPred, LoopInfo *LI) {
+ BasicBlock *LoopHeader = Preheader->getSingleSuccessor();
+ TmpBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
+ nullptr, "tmp.rtchecks");
+
+ SCEVCheck =
+ Exp.expandCodeForPredicate(&UnionPred, TmpBlock->getTerminator());
+
+ const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
+ if (RtPtrChecking.Need) {
+ std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks(
+ TmpBlock->getTerminator(), L, RtPtrChecking.getChecks(), Exp);
+ assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
+ "claimed checks are required");
+ }
+
+ // Unhook the temporary block containing the checks, and update the
+ // dominator tree and loop info accordingly.
+ TmpBlock->replaceAllUsesWith(Preheader);
+ TmpBlock->getTerminator()->moveBefore(Preheader->getTerminator());
+ Preheader->getTerminator()->eraseFromParent();
+ DT->changeImmediateDominator(LoopHeader, Preheader);
+ DT->eraseNode(TmpBlock);
+ LI->removeBlock(TmpBlock);
+ }
+
+ ~GeneratedRTChecks() {
+ if (!TmpBlock) {
+ Cleaner.markResultUsed();
+ return;
+ }
+
+ if (!SCEVCheck && TmpBlock->empty()) {
+ Cleaner.markResultUsed();
+ TmpBlock->eraseFromParent();
+ return;
+ }
+
+ if (MemRuntimeCheck && !isa<Constant>(MemRuntimeCheck))
+ MemRuntimeCheck->replaceAllUsesWith(
+ ConstantInt::getFalse(MemRuntimeCheck->getType()->getContext()));
+ if (SCEVCheck && !isa<Constant>(SCEVCheck))
+ SCEVCheck->replaceAllUsesWith(
+ ConstantInt::getFalse(SCEVCheck->getType()->getContext()));
+
+ SmallPtrSet<Value *, 8> Removed;
+ // Completely remove the block.
+ for (auto &I : make_early_inc_range(reverse(*TmpBlock))) {
+ if (Exp.isInsertedInstruction(&I))
+ continue;
+ SE.forgetValue(&I);
+ SE.eraseValueFromMap(&I);
+ Removed.insert(&I);
+ I.eraseFromParent();
+ }
+
+ Cleaner.cleanup();
+ TmpBlock->eraseFromParent();
+ }
+};
+
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -2887,17 +2979,11 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
// Reuse existing vector loop preheader for SCEV checks.
// Note that new preheader block is generated for vector loop.
- BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
+ BasicBlock *const SCEVCheckBlock = Check.TmpBlock;
- // Generate the code to check that the SCEV assumptions that we made.
- // We want the new basic block to start at the first instruction in a
- // sequence of instructions that form a check.
- SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
- "scev.check");
- Value *SCEVCheck = Exp.expandCodeForPredicate(
- &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
-
- if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
+ if (!Check.TmpBlock)
+ return;
+ if (auto *C = dyn_cast<ConstantInt>(Check.SCEVCheck))
if (C->isZero())
return;
@@ -2907,10 +2993,30 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
"Cannot SCEV check stride or overflow when optimizing for size");
SCEVCheckBlock->setName("vector.scevcheck");
+
+ auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
+
+ BranchInst::Create(LoopVectorPreHeader, Check.TmpBlock);
// Create new preheader for vector loop.
- LoopVectorPreHeader =
- SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
- nullptr, "vector.ph");
+ Check.TmpBlock = SplitBlock(SCEVCheckBlock,
+ cast<Instruction>(Check.SCEVCheck)->getNextNode(),
+ nullptr, nullptr, nullptr, "");
+
+ if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
+ PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
+
+ Check.TmpBlock->replaceAllUsesWith(SCEVCheckBlock);
+ Check.TmpBlock->getTerminator()->moveBefore(SCEVCheckBlock->getTerminator());
+ SCEVCheckBlock->getTerminator()->eraseFromParent();
+ SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
+
+ auto *PHTerm = Pred->getTerminator();
+ for (unsigned i = 0; i < PHTerm->getNumSuccessors(); i++)
+ if (PHTerm->getSuccessor(i) == LoopVectorPreHeader)
+ PHTerm->setSuccessor(i, SCEVCheckBlock);
+
+ DT->addNewBlock(SCEVCheckBlock, Pred);
+ DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
// Update dominator only if this is first RT check.
if (LoopBypassBlocks.empty()) {
@@ -2920,9 +3026,10 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
ReplaceInstWithInst(
SCEVCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
+ BranchInst::Create(Bypass, LoopVectorPreHeader, Check.SCEVCheck));
LoopBypassBlocks.push_back(SCEVCheckBlock);
AddedSafetyChecks = true;
+ Check.SCEVCheck = nullptr;
}
void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
@@ -2932,14 +3039,12 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
// Reuse existing vector loop preheader for runtime memory checks.
// Note that new preheader block is generated for vector loop.
- BasicBlock *const MemCheckBlock = L->getLoopPreheader();
+ BasicBlock *const MemCheckBlock = Check.TmpBlock;
- // Generate the code that checks in runtime if arrays overlap. We put the
- // checks into a separate block to make the more common case of few elements
- // faster.
- auto *LAI = Legal->getLAI();
- const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
- if (!RtPtrChecking.Need)
+ // Check if we generated code that checks at runtime whether arrays overlap.
+ // We put the checks into a separate block to make the more common case of few
+ // elements faster.
+ if (!Check.MemRuntimeCheck)
return;
if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
@@ -2956,11 +3061,22 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
});
}
+ auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
+ auto *PHTerm = Pred->getTerminator();
+ for (unsigned i = 0; i < PHTerm->getNumSuccessors(); i++)
+ if (PHTerm->getSuccessor(i) == LoopVectorPreHeader)
+ PHTerm->setSuccessor(i, Check.TmpBlock);
+ auto *BI = BranchInst::Create(LoopVectorPreHeader, Check.TmpBlock);
+ BI->setDebugLoc(PHTerm->getDebugLoc());
+
+ DT->addNewBlock(Check.TmpBlock, Pred);
+ DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
+ Check.TmpBlock->moveBefore(LoopVectorPreHeader);
+
+ if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
+ PL->addBasicBlockToLoop(Check.TmpBlock, *LI);
+
MemCheckBlock->setName("vector.memcheck");
- // Create new preheader for vector loop.
- LoopVectorPreHeader =
- SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
- "vector.ph");
auto *CondBranch = cast<BranchInst>(
Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
@@ -2973,15 +3089,13 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
DT->changeImmediateDominator(Bypass, MemCheckBlock);
DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
}
+ Check.TmpBlock = nullptr;
- Instruction *FirstCheckInst;
- Instruction *MemRuntimeCheck;
- std::tie(FirstCheckInst, MemRuntimeCheck) =
- addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
- RtPtrChecking.getChecks(), RtPtrChecking.getSE());
- assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
- "claimed checks are required");
- CondBranch->setCondition(MemRuntimeCheck);
+ ReplaceInstWithInst(
+ MemCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, Check.MemRuntimeCheck));
+ LoopBypassBlocks.push_back(MemCheckBlock);
+ AddedSafetyChecks = true;
// We currently don't use LoopVersioning for the actual loop cloning but we
// still use it to add the noalias metadata.
@@ -8220,8 +8334,9 @@ static bool processLoopInVPlanNativePath(
LVP.setBestPlan(VF.Width, 1);
+ GeneratedRTChecks Checks(L->getLoopPreheader(), *PSE.getSE(), DT);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
- &CM, BFI, PSI);
+ &CM, BFI, PSI, Checks);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
LVP.executePlan(LB, DT);
@@ -8229,7 +8344,6 @@ static bool processLoopInVPlanNativePath(
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();
- assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
return true;
}
@@ -8387,6 +8501,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
}
+ // Optimistically generate runtime checks. Drop them if they turn out not to
+ // be profitable.
+ GeneratedRTChecks Checks(L->getLoopPreheader(), *PSE.getSE(), DT);
+ Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate(), LI);
+
// Identify the diagnostic messages that should be produced.
std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
bool VectorizeLoop = true, InterleaveLoop = true;
@@ -8487,7 +8606,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
- BFI, PSI);
+ BFI, PSI, Checks);
LVP.executePlan(Unroller, DT);
ORE->emit([&]() {
@@ -8499,7 +8618,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
} else {
// If we decided that it is *legal* to vectorize the loop, then do it.
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
- &LVL, &CM, BFI, PSI);
+ &LVL, &CM, BFI, PSI, Checks);
LVP.executePlan(LB, DT);
++LoopsVectorized;
@@ -8532,7 +8651,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Hints.setAlreadyVectorized();
}
- assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
return true;
}
@@ -8598,6 +8716,8 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
Changed |= CFGChanged |= processLoop(L);
}
+ assert(!Changed || !verifyFunction(F, &dbgs()));
+
// Process each loop nest in the function.
return LoopVectorizeResult(Changed, CFGChanged);
}
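The splice-back that emitSCEVChecks and emitMemRuntimeChecks now perform
reduces to redirecting the preheader's predecessor at the temporary block
and updating the dominator tree, condensed from the hunks above:

    BasicBlock *Pred = LoopVectorPreHeader->getSinglePredecessor();
    Instruction *PHTerm = Pred->getTerminator();
    for (unsigned i = 0; i < PHTerm->getNumSuccessors(); i++)
      if (PHTerm->getSuccessor(i) == LoopVectorPreHeader)
        PHTerm->setSuccessor(i, Check.TmpBlock);       // Pred -> checks
    // The temporary block was left without a terminator when it was
    // unhooked in Create(), so append the fall-through branch here.
    BranchInst::Create(LoopVectorPreHeader, Check.TmpBlock);
    DT->addNewBlock(Check.TmpBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, Check.TmpBlock);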
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll
index 6b7e809046ec..25911c61fa43 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll
@@ -47,11 +47,10 @@ define void @_Z1dv() local_unnamed_addr #0 {
; CHECK-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[CALL]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[G_0]], [[CONV]]
-; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP15]], [[TMP0]]
-; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @c, i64 0, i64 4), i64 [[TMP16]]
+; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP14]], [[TMP0]]
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @c, i64 0, i64 4), i64 [[TMP15]]
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP]], [[SCEVGEP3]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP2]], [[SCEVGEP1]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
@@ -65,23 +64,23 @@ define void @_Z1dv() local_unnamed_addr #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]]
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = add i64 [[TMP0]], [[INDEX]]
-; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[OFFSET_IDX4]] to i32
-; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 0
-; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[CONV]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP20]] to i64
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1, !alias.scope !0
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[TMP25]], i32 0
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP27]], align 1, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[OFFSET_IDX4]] to i32
+; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[CONV]], [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[TMP21]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP22]] to <4 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP23]], align 1, !alias.scope !0
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8* [[TMP25]] to <4 x i8>*
+; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP26]], align 1, !alias.scope !3, !noalias !0
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -96,13 +95,13 @@ define void @_Z1dv() local_unnamed_addr #0 {
; CHECK-NEXT: br label [[FOR_COND]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP29:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[CONV]], [[TMP29]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[CONV]], [[TMP28]]
; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[ADD]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT: [[TMP30:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i8 [[TMP30]], i8* [[ARRAYIDX3]], align 1
+; CHECK-NEXT: store i8 [[TMP29]], i8* [[ARRAYIDX3]], align 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !7
diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index 3bb73e97a536..cad0df5daac2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -26,9 +26,9 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]]
; CHECK: for.body3.lr.ph.us.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[M]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[K:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[K:%.*]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]]
; CHECK: for.end.us:
; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
@@ -54,12 +54,12 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !3
; CHECK: for.body3.lr.ph.us:
; CHECK-NEXT: [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP3]], [[INDVARS_IV33]]
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP1]], [[INDVARS_IV33]]
; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP9]], [[K]]
; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
@@ -74,8 +74,8 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: [[TMP16:%.*]] = or i1 false, [[TMP15]]
; CHECK-NEXT: br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -100,7 +100,7 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll
new file mode 100644
index 000000000000..1ae3a9b87715
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll
@@ -0,0 +1,32 @@
+; RUN: opt -loop-vectorize -force-vector-width=4 %s | FileCheck %s
+
+%struct.foo = type { [400 x double] }
+
+; Make sure we do not crash when dropping runtime checks.
+
+; CHECK-NOT: vector.body
+
+define void @barney(%struct.foo* %ptr) {
+entry:
+ br label %loop
+
+loop:
+ %tmp3 = phi i64 [ 0, %entry ], [ %tmp18, %loop ]
+ %tmp4 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 undef
+ %tmp5 = bitcast %struct.foo* %tmp4 to i64*
+ store i64 0, i64* %tmp5, align 8
+ %tmp8 = add i64 1, %tmp3
+ %tmp10 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 %tmp8
+ %tmp11 = bitcast %struct.foo* %tmp10 to i64*
+ store i64 1, i64* %tmp11, align 8
+ %tmp14 = add i64 undef, %tmp3
+ %tmp16 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 %tmp14
+ %tmp17 = bitcast %struct.foo* %tmp16 to i64*
+ store i64 2, i64* %tmp17, align 8
+ %tmp18 = add nuw nsw i64 %tmp3, 4
+ %c = icmp ult i64 %tmp18, 400
+ br i1 %c, label %exit, label %loop
+
+exit:
+ ret void
+}