[llvm] r330739 - [LV][VPlan] Detect outer loops for explicit vectorization.
Diego Caballero via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 24 10:04:18 PDT 2018
Author: dcaballe
Date: Tue Apr 24 10:04:17 2018
New Revision: 330739
URL: http://llvm.org/viewvc/llvm-project?rev=330739&view=rev
Log:
[LV][VPlan] Detect outer loops for explicit vectorization.
Patch #2 from VPlan Outer Loop Vectorization Patch Series #1
(RFC: http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html).
This patch introduces the basic infrastructure to detect, legality check
and process outer loops annotated with hints for explicit vectorization.
All these changes are protected under the feature flag
-enable-vplan-native-path. This should make this patch NFC for the existing
inner loop vectorizer.
Reviewers: hfinkel, mkuper, rengolin, fhahn, aemerson, mssimpso.
Differential Revision: https://reviews.llvm.org/D42447
Added:
llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll
llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll
llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll
Modified:
llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorize.h
llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
Modified: llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorize.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorize.h?rev=330739&r1=330738&r2=330739&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorize.h (original)
+++ llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorize.h Tue Apr 24 10:04:17 2018
@@ -26,6 +26,14 @@
// of vectorization. It decides on the optimal vector width, which
// can be one, if vectorization is not profitable.
//
+// There is a development effort going on to migrate the loop vectorizer to the
+// VPlan infrastructure and to introduce outer loop vectorization support (see
+// docs/Proposal/VectorizationPlan.rst and
+// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
+// purpose, we temporarily introduced the VPlan-native vectorization path: an
+// alternative vectorization path that is natively implemented on top of the
+// VPlan infrastructure. See EnableVPlanNativePath for enabling.
+//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationPlanner.h?rev=330739&r1=330738&r2=330739&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationPlanner.h (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationPlanner.h Tue Apr 24 10:04:17 2018
@@ -144,6 +144,10 @@ public:
/// Plan how to best vectorize, return the best VF and its cost.
VectorizationFactor plan(bool OptForSize, unsigned UserVF);
+ /// Use the VPlan-native path to plan how to best vectorize, return the best
+ /// VF and its cost.
+ VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF);
+
/// Finalize the best decision and dispose of all other VPlans.
void setBestPlan(unsigned VF, unsigned UF);
Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=330739&r1=330738&r2=330739&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Tue Apr 24 10:04:17 2018
@@ -26,6 +26,14 @@
// of vectorization. It decides on the optimal vector width, which
// can be one, if vectorization is not profitable.
//
+// There is a development effort going on to migrate the loop vectorizer to the
+// VPlan infrastructure and to introduce outer loop vectorization support (see
+// docs/Proposal/VectorizationPlan.rst and
+// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
+// purpose, we temporarily introduced the VPlan-native vectorization path: an
+// alternative vectorization path that is natively implemented on top of the
+// VPlan infrastructure. See EnableVPlanNativePath for enabling.
+//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
@@ -251,6 +259,11 @@ static cl::opt<unsigned> PragmaVectorize
cl::desc("The maximum number of SCEV checks allowed with a "
"vectorize(enable) pragma"));
+static cl::opt<bool> EnableVPlanNativePath(
+ "enable-vplan-native-path", cl::init(false), cl::Hidden,
+ cl::desc("Enable VPlan-native vectorization path with "
+ "support for outer loop vectorization."));
+
/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
@@ -1519,7 +1532,7 @@ public:
std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC)
- : TheLoop(L), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA),
+ : TheLoop(L), LI(LI), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA),
ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
/// ReductionList contains the reduction descriptors for all
@@ -1621,6 +1634,15 @@ public:
bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
private:
+ /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
+ /// its nested loops are considered legal for vectorization. These legal
+ /// checks are common for inner and outer loop vectorization.
+ bool canVectorizeLoopNestCFG(Loop *Lp);
+
+ /// Return true if the pre-header, exiting and latch blocks of \p Lp
+ /// (non-recursive) are considered legal for vectorization.
+ bool canVectorizeLoopCFG(Loop *Lp);
+
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
/// and we only need to check individual instructions.
@@ -1636,6 +1658,10 @@ private:
/// transformation.
bool canVectorizeWithIfConvert();
+ /// Return true if we can vectorize this outer loop. The method performs
+ /// specific checks for outer loop vectorization.
+ bool canVectorizeOuterLoop();
+
/// Return true if all of the instructions in the block can be speculatively
/// executed. \p SafePtrs is a list of addresses that are known to be legal
/// and we know that we can read from them without segfault.
@@ -1672,6 +1698,9 @@ private:
/// The loop that we evaluate.
Loop *TheLoop;
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
/// A wrapper around ScalarEvolution used to add runtime SCEV checks.
/// Applies dynamic knowledge to simplify SCEV expressions in the context
/// of existing SCEV assumptions. The analysis will also add a minimal set
@@ -2275,17 +2304,73 @@ private:
} // end anonymous namespace
-static void addAcyclicInnerLoop(Loop &L, LoopInfo &LI,
- SmallVectorImpl<Loop *> &V) {
- if (L.empty()) {
+// Return true if \p OuterLp is an outer loop annotated with hints for explicit
+// vectorization. The loop needs to be annotated with #pragma omp simd
+// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
+// vector length information is not provided, vectorization is not considered
+// explicit. Interleave hints are not allowed either. These limitations will be
+// relaxed in the future.
+// Please note that we are currently forced to abuse the pragma 'clang
+// vectorize' semantics. This pragma provides *auto-vectorization hints*
+// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
+// provides *explicit vectorization hints* (LV can bypass legal checks and
+// assume that vectorization is legal). However, both hints are implemented
+// using the same metadata (llvm.loop.vectorize, processed by
+// LoopVectorizeHints). This will be fixed in the future when the native IR
+// representation for pragma 'omp simd' is introduced.
+static bool isExplicitVecOuterLoop(Loop *OuterLp,
+ OptimizationRemarkEmitter *ORE) {
+ assert(!OuterLp->empty() && "This is not an outer loop");
+ LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
+
+ // Only outer loops with an explicit vectorization hint are supported.
+ // Unannotated outer loops are ignored.
+ if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
+ return false;
+
+ Function *Fn = OuterLp->getHeader()->getParent();
+ if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) {
+ DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
+ return false;
+ }
+
+ if (!Hints.getWidth()) {
+ DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
+ emitMissedWarning(Fn, OuterLp, Hints, ORE);
+ return false;
+ }
+
+ if (Hints.getInterleave() > 1) {
+ // TODO: Interleave support is future work.
+ DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
+ "outer loops.\n");
+ emitMissedWarning(Fn, OuterLp, Hints, ORE);
+ return false;
+ }
+
+ return true;
+}
+
+static void collectSupportedLoops(Loop &L, LoopInfo *LI,
+ OptimizationRemarkEmitter *ORE,
+ SmallVectorImpl<Loop *> &V) {
+ // Collect inner loops and outer loops without irreducible control flow. For
+ // now, only collect outer loops that have explicit vectorization hints.
+ if (L.empty() || (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
LoopBlocksRPO RPOT(&L);
- RPOT.perform(&LI);
- if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
+ RPOT.perform(LI);
+ if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
V.push_back(&L);
- return;
+ // TODO: Collect inner loops inside marked outer loops in case
+ // vectorization fails for the outer loop. Do not invoke
+ // 'containsIrreducibleCFG' again for inner loops when the outer loop is
+ // already known to be reducible. We can use an inherited attribute for
+ // that.
+ return;
+ }
}
for (Loop *InnerL : L)
- addAcyclicInnerLoop(*InnerL, LI, V);
+ collectSupportedLoops(*InnerL, LI, ORE, V);
}
namespace {
@@ -4832,15 +4917,24 @@ bool LoopVectorizationLegality::canVecto
return true;
}
-bool LoopVectorizationLegality::canVectorize() {
+// Helper function to canVectorizeLoopNestCFG.
+bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp) {
+ assert((EnableVPlanNativePath || Lp->empty()) &&
+ "VPlan-native path is not enabled.");
+
+ // TODO: ORE should be improved to show more accurate information when an
+ // outer loop can't be vectorized because a nested loop is not understood or
+ // legal. Something like: "outer_loop_location: loop not vectorized:
+ // (inner_loop_location) loop control flow is not understood by vectorizer".
+
// Store the result and return it at the end instead of exiting early, in case
// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
bool Result = true;
-
bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
// We must have a loop in canonical form. Loops with indirectbr in them cannot
// be canonicalized.
- if (!TheLoop->getLoopPreheader()) {
+ if (!Lp->getLoopPreheader()) {
DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
@@ -4850,21 +4944,8 @@ bool LoopVectorizationLegality::canVecto
return false;
}
- // FIXME: The code is currently dead, since the loop gets sent to
- // LoopVectorizationLegality is already an innermost loop.
- //
- // We can only vectorize innermost loops.
- if (!TheLoop->empty()) {
- ORE->emit(createMissedAnalysis("NotInnermostLoop")
- << "loop is not the innermost loop");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
// We must have a single backedge.
- if (TheLoop->getNumBackEdges() != 1) {
+ if (Lp->getNumBackEdges() != 1) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
if (DoExtraAnalysis)
@@ -4874,7 +4955,7 @@ bool LoopVectorizationLegality::canVecto
}
// We must have a single exiting block.
- if (!TheLoop->getExitingBlock()) {
+ if (!Lp->getExitingBlock()) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
if (DoExtraAnalysis)
@@ -4886,7 +4967,7 @@ bool LoopVectorizationLegality::canVecto
// We only handle bottom-tested loops, i.e. loop in which the condition is
// checked at the end of each iteration. With that we can assume that all
// instructions in the loop are executed the same number of times.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ if (Lp->getExitingBlock() != Lp->getLoopLatch()) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
if (DoExtraAnalysis)
@@ -4895,10 +4976,70 @@ bool LoopVectorizationLegality::canVecto
return false;
}
+ return Result;
+}
+
+bool LoopVectorizationLegality::canVectorizeLoopNestCFG(Loop *Lp) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ if (!canVectorizeLoopCFG(Lp)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Recursively check whether the loop control flow of nested loops is
+ // understood.
+ for (Loop *SubLp : *Lp)
+ if (!canVectorizeLoopNestCFG(SubLp)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
+bool LoopVectorizationLegality::canVectorize() {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ // Check whether the loop-related control flow in the loop nest is expected by
+ // vectorizer.
+ if (!canVectorizeLoopNestCFG(TheLoop)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
// We need to have a loop header.
DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
<< '\n');
+ // Specific checks for outer loops. We skip the remaining legal checks at this
+ // point because they don't support outer loops.
+ if (!TheLoop->empty()) {
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+
+ if (!canVectorizeOuterLoop()) {
+ DEBUG(dbgs() << "LV: Not vectorizing: Unsupported outer loop.\n");
+ // TODO: Implement DoExtraAnalysis when subsequent legal checks support
+ // outer loops.
+ return false;
+ }
+
+ DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
+ return Result;
+ }
+
+ assert(TheLoop->empty() && "Inner loop expected.");
// Check if we can if-convert non-single-bb loops.
unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
@@ -4955,6 +5096,140 @@ bool LoopVectorizationLegality::canVecto
return Result;
}
+// Return true if the inner loop \p Lp is uniform with regard to the outer loop
+// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
+// executing the inner loop will execute the same iterations). This check is
+// very constrained for now but it will be relaxed in the future. \p Lp is
+// considered uniform if it meets all the following conditions:
+// 1) it has a canonical IV (starting from 0 and with stride 1),
+// 2) its latch terminator is a conditional branch and,
+// 3) its latch condition is a compare instruction whose operands are the
+// canonical IV and an OuterLp invariant.
+// This check doesn't take into account the uniformity of other conditions not
+// related to the loop latch because they don't affect the loop uniformity.
+//
+// NOTE: We decided to keep all these checks and their associated documentation
+// together so that we can easily have a picture of the current supported loop
+// nests. However, some of the current checks don't depend on \p OuterLp and
+// would be redundantly executed for each \p Lp if we invoked this function for
+// different candidate outer loops. This is not the case for now because we
+// don't currently have the infrastructure to evaluate multiple candidate outer
+// loops and \p OuterLp will be a fixed parameter while we only support explicit
+// outer loop vectorization. It's also very likely that these checks go away
+// before introducing the aforementioned infrastructure. However, if this is not
+// the case, we should move the \p OuterLp independent checks to a separate
+// function that is only executed once for each \p Lp.
+static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
+ assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
+
+ // If Lp is the outer loop, it's uniform by definition.
+ if (Lp == OuterLp)
+ return true;
+ assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
+
+ // 1.
+ PHINode *IV = Lp->getCanonicalInductionVariable();
+ if (!IV) {
+ DEBUG(dbgs() << "LV: Canonical IV not found.\n");
+ return false;
+ }
+
+ // 2.
+ BasicBlock *Latch = Lp->getLoopLatch();
+ auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
+ return false;
+ }
+
+ // 3.
+ auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
+ if (!LatchCmp) {
+ DEBUG(dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
+ return false;
+ }
+
+ Value *CondOp0 = LatchCmp->getOperand(0);
+ Value *CondOp1 = LatchCmp->getOperand(1);
+ Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
+ if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
+ !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
+ DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
+ return false;
+ }
+
+ return true;
+}
+
+// Return true if \p Lp and all its nested loops are uniform with regard to \p
+// OuterLp.
+static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
+ if (!isUniformLoop(Lp, OuterLp))
+ return false;
+
+ // Check if nested loops are uniform.
+ for (Loop *SubLp : *Lp)
+ if (!isUniformLoopNest(SubLp, OuterLp))
+ return false;
+
+ return true;
+}
+
+bool LoopVectorizationLegality::canVectorizeOuterLoop() {
+ assert(!TheLoop->empty() && "We are not vectorizing an outer loop.");
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Check whether the BB terminator is a BranchInst. Any other terminator is
+ // not supported yet.
+ auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br) {
+ DEBUG(dbgs() << "LV: Unsupported basic block terminator.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check whether the BranchInst is a supported one. Only unconditional
+ // branches, conditional branches with an outer loop invariant condition or
+ // backedges are supported.
+ if (Br && Br->isConditional() &&
+ !TheLoop->isLoopInvariant(Br->getCondition()) &&
+ !LI->isLoopHeader(Br->getSuccessor(0)) &&
+ !LI->isLoopHeader(Br->getSuccessor(1))) {
+ DEBUG(dbgs() << "LV: Unsupported conditional branch.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+ }
+
+ // Check whether inner loops are uniform. At this point, we only support
+ // simple outer loops scenarios with uniform nested loops.
+ if (!isUniformLoopNest(TheLoop /*loop nest*/,
+ TheLoop /*context outer loop*/)) {
+ DEBUG(dbgs()
+ << "LV: Not vectorizing: Outer loop contains divergent loops.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
if (Ty->isPointerTy())
return DL.getIntPtrType(Ty);
@@ -7406,7 +7681,33 @@ void LoopVectorizationCostModel::collect
}
VectorizationFactor
+LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
+ unsigned UserVF) {
+ // Width 1 means no vectorize, cost 0 means uncomputed cost.
+ const VectorizationFactor NoVectorization = {1U, 0U};
+
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ if (!OrigLoop->empty()) {
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+ assert(UserVF && "Expected UserVF for outer loop vectorization.");
+ assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ buildVPlans(UserVF, UserVF);
+
+ return {UserVF, 0};
+ }
+
+ DEBUG(dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
+ "VPlan-native path.\n");
+ return NoVectorization;
+}
+
+VectorizationFactor
LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
+ assert(OrigLoop->empty() && "Inner loop expected.");
// Width 1 means no vectorize, cost 0 means uncomputed cost.
const VectorizationFactor NoVectorization = {1U, 0U};
Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
@@ -7969,6 +8270,19 @@ LoopVectorizationPlanner::createReplicat
LoopVectorizationPlanner::VPlanPtr
LoopVectorizationPlanner::buildVPlan(VFRange &Range,
const SmallPtrSetImpl<Value *> &NeedDef) {
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ if (!OrigLoop->empty()) {
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+
+ // Create new empty VPlan
+ auto Plan = llvm::make_unique<VPlan>();
+ return Plan;
+ }
+
+ assert(OrigLoop->empty() && "Inner loop expected.");
EdgeMaskCache.clear();
BlockMaskCache.clear();
DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
@@ -8298,8 +8612,45 @@ void VPWidenMemoryInstructionRecipe::exe
State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}
+// Process the loop in the VPlan-native vectorization path. This path builds
+// VPlan upfront in the vectorization pipeline, which allows to apply
+// VPlan-to-VPlan transformations from the very beginning without modifying the
+// input LLVM IR.
+static bool processLoopInVPlanNativePath(
+ Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
+ LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
+ TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) {
+
+ assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
+ Function *F = L->getHeader()->getParent();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
+ LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints, IAI);
+ // Use the planner for outer loop vectorization.
+ // TODO: CM is not used at this point inside the planner. Turn CM into an
+ // optional argument if we don't need it in the future.
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
+
+ // Get user vectorization factor.
+ unsigned UserVF = Hints.getWidth();
+
+ // Check the function attributes to find out if this function should be
+ // optimized for size.
+ bool OptForSize =
+ Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ LVP.planInVPlanNativePath(OptForSize, UserVF);
+
+ // Returning false. We are currently not generating vector code in the VPlan
+ // native path.
+ return false;
+}
+
bool LoopVectorizePass::processLoop(Loop *L) {
- assert(L->empty() && "Only process inner loops.");
+ assert((EnableVPlanNativePath || L->empty()) &&
+ "VPlan-native path is not enabled. Only process inner loops.");
#ifndef NDEBUG
const std::string DebugLocStr = getDebugLocString(L);
@@ -8354,6 +8705,16 @@ bool LoopVectorizePass::processLoop(Loop
bool OptForSize =
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+ // Entrance to the VPlan-native vectorization path. Outer loops are processed
+ // here. They may require CFG and instruction level transformations before
+ // even evaluating whether vectorization is profitable. Since we cannot modify
+ // the incoming IR, we need to build VPlan upfront in the vectorization
+ // pipeline.
+ if (!L->empty())
+ return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
+ ORE, Hints);
+
+ assert(L->empty() && "Inner loop expected.");
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
// Prefer constant trip counts over profile data, over upper bound estimate.
@@ -8630,7 +8991,7 @@ bool LoopVectorizePass::runImpl(
SmallVector<Loop *, 8> Worklist;
for (Loop *L : *LI)
- addAcyclicInnerLoop(*L, *LI, Worklist);
+ collectSupportedLoops(*L, LI, ORE, Worklist);
LoopsAnalyzed += Worklist.size();
Added: llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll?rev=330739&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll Tue Apr 24 10:04:17 2018
@@ -0,0 +1,238 @@
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Verify that outer loops annotated only with the expected explicit
+; vectorization hints are collected for vectorization instead of inner loops.
+
+; Root C/C++ source code for all the test cases
+; void foo(int *a, int *b, int N, int M)
+; {
+; int i, j;
+; #pragma clang loop vectorize(enable)
+; for (i = 0; i < N; i++) {
+; for (j = 0; j < M; j++) {
+; a[i*M+j] = b[i*M+j] * b[i*M+j];
+; }
+; }
+; }
+
+; Case 1: Annotated outer loop WITH vector width information must be collected.
+
+; CHECK-LABEL: vector_width
+; CHECK: LV: Loop hints: force=enabled width=4 unroll=0
+; CHECK: LV: We can vectorize this outer loop!
+; CHECK: LV: Using user VF 4.
+; CHECK-NOT: LV: Loop hints: force=?
+; CHECK-NOT: LV: Found a loop: inner.body
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @vector_width(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp230 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count38 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+ br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv35, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+ br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+; Case 2: Annotated outer loop WITHOUT vector width information doesn't have to
+; be collected.
+
+; CHECK-LABEL: case2
+; CHECK-NOT: LV: Loop hints: force=enabled
+; CHECK-NOT: LV: We can vectorize this outer loop!
+; CHECK: LV: Loop hints: force=?
+; CHECK: LV: Found a loop: inner.body
+
+define void @case2(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp230 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count38 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+ br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv35, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+ br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !9
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+; Case 3: Annotated outer loop WITH vector width and interleave information
+; doesn't have to be collected.
+
+; CHECK-LABEL: case3
+; CHECK-NOT: LV: Loop hints: force=enabled
+; CHECK-NOT: LV: We can vectorize this outer loop!
+; CHECK: LV: Loop hints: force=?
+; CHECK: LV: Found a loop: inner.body
+
+define void @case3(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp230 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count38 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+ br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv35, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+ br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !11
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+; Case 4: Outer loop without any explicit vectorization annotation doesn't have
+; to be collected.
+
+; CHECK-LABEL: case4
+; CHECK-NOT: LV: Loop hints: force=enabled
+; CHECK-NOT: LV: We can vectorize this outer loop!
+; CHECK: LV: Loop hints: force=?
+; CHECK: LV: Found a loop: inner.body
+
+define void @case4(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp230 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count38 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+ br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv35, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+ br i1 %exitcond39, label %for.end15, label %outer.body
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+; Case 1
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 4}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}
+; Case 2
+!9 = distinct !{!9, !8}
+; Case 3
+!10 = !{!"llvm.loop.interleave.count", i32 2}
+!11 = distinct !{!11, !7, !10, !8}
Added: llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll?rev=330739&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll Tue Apr 24 10:04:17 2018
@@ -0,0 +1,177 @@
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Verify that LV bails out on explicit vectorization outer loops that contain
+; divergent inner loops.
+
+; Root C/C++ source code for all the test cases
+; void foo(int *a, int *b, int N, int M)
+; {
+; int i, j;
+; #pragma clang loop vectorize(enable) vectorize_width(8)
+; for (i = 0; i < N; i++) {
+; // Tested inner loop. It will be replaced per test.
+; for (j = 0; j < M; j++) {
+; a[i*M+j] = b[i*M+j] * b[i*M+j];
+; }
+; }
+; }
+
+; Case 1 (for (j = i; j < M; j++)): Inner loop with divergent IV start.
+
+; CHECK-LABEL: iv_start
+; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @iv_start(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp33 = icmp sgt i32 %N, 0
+ br i1 %cmp33, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count41 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv38 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next39, %outer.inc ]
+ %cmp231 = icmp slt i64 %indvars.iv38, %0
+ br i1 %cmp231, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv38, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv35 = phi i64 [ %indvars.iv38, %inner.ph ], [ %indvars.iv.next36, %inner.body ]
+ %2 = add nsw i64 %indvars.iv35, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond = icmp eq i64 %indvars.iv.next36, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+ %exitcond42 = icmp eq i64 %indvars.iv.next39, %wide.trip.count41
+ br i1 %exitcond42, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+
+; Case 2 (for (j = 0; j < i; j++)): Inner loop with divergent upper-bound.
+
+; CHECK-LABEL: loop_ub
+; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @loop_ub(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %0 = sext i32 %M to i64
+ %wide.trip.count41 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv38 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next39, %outer.inc ]
+ %cmp230 = icmp eq i64 %indvars.iv38, 0
+ br i1 %cmp230, label %outer.inc, label %inner.ph
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv38, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %indvars.iv38
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+ %exitcond42 = icmp eq i64 %indvars.iv.next39, %wide.trip.count41
+ br i1 %exitcond42, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+; Case 3 (for (j = 0; j < M; j+=i)): Inner loop with divergent step.
+
+; CHECK-LABEL: iv_step
+; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @iv_step(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp33 = icmp sgt i32 %N, 0
+ br i1 %cmp33, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp231 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %for.inc14, %outer.ph
+ %indvars.iv39 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next40, %for.inc14 ]
+ br i1 %cmp231, label %inner.ph, label %for.inc14
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv39, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.ph, %inner.body
+ %indvars.iv36 = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next37, %inner.body ]
+ %2 = add nsw i64 %indvars.iv36, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, %indvars.iv39
+ %cmp2 = icmp slt i64 %indvars.iv.next37, %0
+ br i1 %cmp2, label %inner.body, label %for.inc14
+
+for.inc14: ; preds = %inner.body, %outer.body
+ %indvars.iv.next40 = add nuw nsw i64 %indvars.iv39, 1
+ %exitcond = icmp eq i64 %indvars.iv.next40, %wide.trip.count
+ br i1 %exitcond, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15: ; preds = %for.inc14, %entry
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}
Added: llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll?rev=330739&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll Tue Apr 24 10:04:17 2018
@@ -0,0 +1,138 @@
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Verify that LV can handle explicit vectorization outer loops with uniform branches
+; but bails out on outer loops with divergent branches.
+
+; Root C/C++ source code for the test cases
+; void foo(int *a, int *b, int N, int M)
+; {
+; int i, j;
+; #pragma clang loop vectorize(enable) vectorize_width(8)
+; for (i = 0; i < N; i++) {
+; // Tested conditional branch. COND will be replaced per test.
+; if (COND)
+; for (j = 0; j < M; j++) {
+; a[i*M+j] = b[i*M+j] * b[i*M+j];
+; }
+; }
+; }
+
+; Case 1 (COND => M == N): Outer loop with uniform conditional branch.
+
+; CHECK-LABEL: uniform_branch
+; CHECK: LV: We can vectorize this outer loop!
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @uniform_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp39 = icmp sgt i32 %N, 0
+ br i1 %cmp39, label %outer.ph, label %for.end19
+
+outer.ph: ; preds = %entry
+ %cmp337 = icmp slt i32 %M, 1
+ %0 = sext i32 %M to i64
+ %N64 = zext i32 %N to i64
+ %M64 = zext i32 %M to i64
+ %cmp1 = icmp ne i32 %M, %N ; Uniform condition
+ %brmerge = or i1 %cmp1, %cmp337 ; Uniform condition
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ]
+ %1 = mul nsw i64 %indvars.iv42, %0
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1
+ %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ br i1 %brmerge, label %outer.inc, label %inner.ph ; Supported uniform branch
+
+inner.ph: ; preds = %outer.body
+ br label %inner.body
+
+inner.body: ; preds = %inner.ph, %inner.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ]
+ %3 = add nsw i64 %indvars.iv, %1
+ %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3
+ %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2
+ %mul12 = mul nsw i32 %4, %4
+ %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3
+ store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %M64
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
+ %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64
+ br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6
+
+for.end19: ; preds = %outer.inc, %entry
+ ret void
+}
+
+
+; Case 2 (COND => B[i * M] == 0): Outer loop with divergent conditional branch.
+
+; CHECK-LABEL: divergent_branch
+; CHECK: Unsupported conditional branch.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @divergent_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp39 = icmp sgt i32 %N, 0
+ br i1 %cmp39, label %outer.ph, label %for.end19
+
+outer.ph: ; preds = %entry
+ %cmp337 = icmp slt i32 %M, 1
+ %0 = sext i32 %M to i64
+ %N64 = zext i32 %N to i64
+ %M64 = zext i32 %M to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ]
+ %1 = mul nsw i64 %indvars.iv42, %0
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1
+ %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %cmp1 = icmp ne i32 %2, 0 ; Divergent condition
+ %brmerge = or i1 %cmp1, %cmp337 ; Divergent condition
+ br i1 %brmerge, label %outer.inc, label %inner.ph ; Unsupported divergent branch.
+
+inner.ph: ; preds = %outer.body
+ br label %inner.body
+
+inner.body: ; preds = %inner.ph, %inner.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ]
+ %3 = add nsw i64 %indvars.iv, %1
+ %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3
+ %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2
+ %mul12 = mul nsw i32 %4, %4
+ %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3
+ store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %M64
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
+ %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64
+ br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6
+
+for.end19: ; preds = %outer.inc, %entry
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}
More information about the llvm-commits
mailing list