[llvm] r330739 - [LV][VPlan] Detect outer loops for explicit vectorization.
Diego Caballero via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 24 10:04:18 PDT 2018
Author: dcaballe
Date: Tue Apr 24 10:04:17 2018
New Revision: 330739
URL: http://llvm.org/viewvc/llvm-project?rev=330739&view=rev
Log:
[LV][VPlan] Detect outer loops for explicit vectorization.
Patch #2 from VPlan Outer Loop Vectorization Patch Series #1
(RFC: http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html).
This patch introduces the basic infrastructure to detect, legality check
and process outer loops annotated with hints for explicit vectorization.
All these changes are protected under the feature flag
-enable-vplan-native-path. This should make this patch NFC for the existing
inner loop vectorizer.
Reviewers: hfinkel, mkuper, rengolin, fhahn, aemerson, mssimpso.
Differential Revision: https://reviews.llvm.org/D42447
Added:
llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll
llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll
llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll
Modified:
llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorize.h
llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
Modified: llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorize.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorize.h?rev=330739&r1=330738&r2=330739&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorize.h (original)
+++ llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorize.h Tue Apr 24 10:04:17 2018
@@ -26,6 +26,14 @@
// of vectorization. It decides on the optimal vector width, which
// can be one, if vectorization is not profitable.
//
+// There is a development effort going on to migrate the loop vectorizer to the
+// VPlan infrastructure and to introduce outer loop vectorization support (see
+// docs/Proposal/VectorizationPlan.rst and
+// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
+// purpose, we temporarily introduced the VPlan-native vectorization path: an
+// alternative vectorization path that is natively implemented on top of the
+// VPlan infrastructure. See EnableVPlanNativePath for enabling.
+//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationPlanner.h?rev=330739&r1=330738&r2=330739&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationPlanner.h (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationPlanner.h Tue Apr 24 10:04:17 2018
@@ -144,6 +144,10 @@ public:
/// Plan how to best vectorize, return the best VF and its cost.
VectorizationFactor plan(bool OptForSize, unsigned UserVF);
+ /// Use the VPlan-native path to plan how to best vectorize, return the best
+ /// VF and its cost.
+ VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF);
+
/// Finalize the best decision and dispose of all other VPlans.
void setBestPlan(unsigned VF, unsigned UF);
Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=330739&r1=330738&r2=330739&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Tue Apr 24 10:04:17 2018
@@ -26,6 +26,14 @@
// of vectorization. It decides on the optimal vector width, which
// can be one, if vectorization is not profitable.
//
+// There is a development effort going on to migrate the loop vectorizer to the
+// VPlan infrastructure and to introduce outer loop vectorization support (see
+// docs/Proposal/VectorizationPlan.rst and
+// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
+// purpose, we temporarily introduced the VPlan-native vectorization path: an
+// alternative vectorization path that is natively implemented on top of the
+// VPlan infrastructure. See EnableVPlanNativePath for enabling.
+//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
@@ -251,6 +259,11 @@ static cl::opt<unsigned> PragmaVectorize
cl::desc("The maximum number of SCEV checks allowed with a "
"vectorize(enable) pragma"));
+static cl::opt<bool> EnableVPlanNativePath(
+ "enable-vplan-native-path", cl::init(false), cl::Hidden,
+ cl::desc("Enable VPlan-native vectorization path with "
+ "support for outer loop vectorization."));
+
/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
@@ -1519,7 +1532,7 @@ public:
std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC)
- : TheLoop(L), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA),
+ : TheLoop(L), LI(LI), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA),
ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
/// ReductionList contains the reduction descriptors for all
@@ -1621,6 +1634,15 @@ public:
bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
private:
+ /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
+ /// its nested loops are considered legal for vectorization. These legal
+ /// checks are common for inner and outer loop vectorization.
+ bool canVectorizeLoopNestCFG(Loop *Lp);
+
+ /// Return true if the pre-header, exiting and latch blocks of \p Lp
+ /// (non-recursive) are considered legal for vectorization.
+ bool canVectorizeLoopCFG(Loop *Lp);
+
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
/// and we only need to check individual instructions.
@@ -1636,6 +1658,10 @@ private:
/// transformation.
bool canVectorizeWithIfConvert();
+ /// Return true if we can vectorize this outer loop. The method performs
+ /// specific checks for outer loop vectorization.
+ bool canVectorizeOuterLoop();
+
/// Return true if all of the instructions in the block can be speculatively
/// executed. \p SafePtrs is a list of addresses that are known to be legal
/// and we know that we can read from them without segfault.
@@ -1672,6 +1698,9 @@ private:
/// The loop that we evaluate.
Loop *TheLoop;
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
/// A wrapper around ScalarEvolution used to add runtime SCEV checks.
/// Applies dynamic knowledge to simplify SCEV expressions in the context
/// of existing SCEV assumptions. The analysis will also add a minimal set
@@ -2275,17 +2304,73 @@ private:
} // end anonymous namespace
-static void addAcyclicInnerLoop(Loop &L, LoopInfo &LI,
- SmallVectorImpl<Loop *> &V) {
- if (L.empty()) {
+// Return true if \p OuterLp is an outer loop annotated with hints for explicit
+// vectorization. The loop needs to be annotated with #pragma omp simd
+// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
+// vector length information is not provided, vectorization is not considered
+// explicit. Interleave hints are not allowed either. These limitations will be
+// relaxed in the future.
+// Please note that we are currently forced to abuse the pragma 'clang
+// vectorize' semantics. This pragma provides *auto-vectorization hints*
+// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
+// provides *explicit vectorization hints* (LV can bypass legal checks and
+// assume that vectorization is legal). However, both hints are implemented
+// using the same metadata (llvm.loop.vectorize, processed by
+// LoopVectorizeHints). This will be fixed in the future when the native IR
+// representation for pragma 'omp simd' is introduced.
+static bool isExplicitVecOuterLoop(Loop *OuterLp,
+ OptimizationRemarkEmitter *ORE) {
+ assert(!OuterLp->empty() && "This is not an outer loop");
+ LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
+
+ // Only outer loops with an explicit vectorization hint are supported.
+ // Unannotated outer loops are ignored.
+ if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
+ return false;
+
+ Function *Fn = OuterLp->getHeader()->getParent();
+ if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) {
+ DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
+ return false;
+ }
+
+ if (!Hints.getWidth()) {
+ DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
+ emitMissedWarning(Fn, OuterLp, Hints, ORE);
+ return false;
+ }
+
+ if (Hints.getInterleave() > 1) {
+ // TODO: Interleave support is future work.
+ DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
+ "outer loops.\n");
+ emitMissedWarning(Fn, OuterLp, Hints, ORE);
+ return false;
+ }
+
+ return true;
+}
+
+static void collectSupportedLoops(Loop &L, LoopInfo *LI,
+ OptimizationRemarkEmitter *ORE,
+ SmallVectorImpl<Loop *> &V) {
+ // Collect inner loops and outer loops without irreducible control flow. For
+ // now, only collect outer loops that have explicit vectorization hints.
+ if (L.empty() || (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
LoopBlocksRPO RPOT(&L);
- RPOT.perform(&LI);
- if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
+ RPOT.perform(LI);
+ if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
V.push_back(&L);
- return;
+ // TODO: Collect inner loops inside marked outer loops in case
+ // vectorization fails for the outer loop. Do not invoke
+ // 'containsIrreducibleCFG' again for inner loops when the outer loop is
+ // already known to be reducible. We can use an inherited attribute for
+ // that.
+ return;
+ }
}
for (Loop *InnerL : L)
- addAcyclicInnerLoop(*InnerL, LI, V);
+ collectSupportedLoops(*InnerL, LI, ORE, V);
}
namespace {
@@ -4832,15 +4917,24 @@ bool LoopVectorizationLegality::canVecto
return true;
}
-bool LoopVectorizationLegality::canVectorize() {
+// Helper function to canVectorizeLoopNestCFG.
+bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp) {
+ assert((EnableVPlanNativePath || Lp->empty()) &&
+ "VPlan-native path is not enabled.");
+
+ // TODO: ORE should be improved to show more accurate information when an
+ // outer loop can't be vectorized because a nested loop is not understood or
+ // legal. Something like: "outer_loop_location: loop not vectorized:
+ // (inner_loop_location) loop control flow is not understood by vectorizer".
+
// Store the result and return it at the end instead of exiting early, in case
// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
bool Result = true;
-
bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
// We must have a loop in canonical form. Loops with indirectbr in them cannot
// be canonicalized.
- if (!TheLoop->getLoopPreheader()) {
+ if (!Lp->getLoopPreheader()) {
DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
@@ -4850,21 +4944,8 @@ bool LoopVectorizationLegality::canVecto
return false;
}
- // FIXME: The code is currently dead, since the loop gets sent to
- // LoopVectorizationLegality is already an innermost loop.
- //
- // We can only vectorize innermost loops.
- if (!TheLoop->empty()) {
- ORE->emit(createMissedAnalysis("NotInnermostLoop")
- << "loop is not the innermost loop");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
// We must have a single backedge.
- if (TheLoop->getNumBackEdges() != 1) {
+ if (Lp->getNumBackEdges() != 1) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
if (DoExtraAnalysis)
@@ -4874,7 +4955,7 @@ bool LoopVectorizationLegality::canVecto
}
// We must have a single exiting block.
- if (!TheLoop->getExitingBlock()) {
+ if (!Lp->getExitingBlock()) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
if (DoExtraAnalysis)
@@ -4886,7 +4967,7 @@ bool LoopVectorizationLegality::canVecto
// We only handle bottom-tested loops, i.e. loop in which the condition is
// checked at the end of each iteration. With that we can assume that all
// instructions in the loop are executed the same number of times.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ if (Lp->getExitingBlock() != Lp->getLoopLatch()) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
if (DoExtraAnalysis)
@@ -4895,10 +4976,70 @@ bool LoopVectorizationLegality::canVecto
return false;
}
+ return Result;
+}
+
+bool LoopVectorizationLegality::canVectorizeLoopNestCFG(Loop *Lp) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ if (!canVectorizeLoopCFG(Lp)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Recursively check whether the loop control flow of nested loops is
+ // understood.
+ for (Loop *SubLp : *Lp)
+ if (!canVectorizeLoopNestCFG(SubLp)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
+bool LoopVectorizationLegality::canVectorize() {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ // Check whether the loop-related control flow in the loop nest is expected by
+ // vectorizer.
+ if (!canVectorizeLoopNestCFG(TheLoop)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
// We need to have a loop header.
DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
<< '\n');
+ // Specific checks for outer loops. We skip the remaining legal checks at this
+ // point because they don't support outer loops.
+ if (!TheLoop->empty()) {
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+
+ if (!canVectorizeOuterLoop()) {
+ DEBUG(dbgs() << "LV: Not vectorizing: Unsupported outer loop.\n");
+ // TODO: Implement DoExtraAnalysis when subsequent legal checks support
+ // outer loops.
+ return false;
+ }
+
+ DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
+ return Result;
+ }
+
+ assert(TheLoop->empty() && "Inner loop expected.");
// Check if we can if-convert non-single-bb loops.
unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
@@ -4955,6 +5096,140 @@ bool LoopVectorizationLegality::canVecto
return Result;
}
+// Return true if the inner loop \p Lp is uniform with regard to the outer loop
+// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
+// executing the inner loop will execute the same iterations). This check is
+// very constrained for now but it will be relaxed in the future. \p Lp is
+// considered uniform if it meets all the following conditions:
+// 1) it has a canonical IV (starting from 0 and with stride 1),
+// 2) its latch terminator is a conditional branch and,
+// 3) its latch condition is a compare instruction whose operands are the
+// canonical IV and an OuterLp invariant.
+// This check doesn't take into account the uniformity of other conditions not
+// related to the loop latch because they don't affect the loop uniformity.
+//
+// NOTE: We decided to keep all these checks and their associated documentation
+// together so that we can easily have a picture of the current supported loop
+// nests. However, some of the current checks don't depend on \p OuterLp and
+// would be redundantly executed for each \p Lp if we invoked this function for
+// different candidate outer loops. This is not the case for now because we
+// don't currently have the infrastructure to evaluate multiple candidate outer
+// loops and \p OuterLp will be a fixed parameter while we only support explicit
+// outer loop vectorization. It's also very likely that these checks go away
+// before introducing the aforementioned infrastructure. However, if this is not
+// the case, we should move the \p OuterLp independent checks to a separate
+// function that is only executed once for each \p Lp.
+static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
+ assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
+
+ // If Lp is the outer loop, it's uniform by definition.
+ if (Lp == OuterLp)
+ return true;
+ assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
+
+ // 1.
+ PHINode *IV = Lp->getCanonicalInductionVariable();
+ if (!IV) {
+ DEBUG(dbgs() << "LV: Canonical IV not found.\n");
+ return false;
+ }
+
+ // 2.
+ BasicBlock *Latch = Lp->getLoopLatch();
+ auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
+ return false;
+ }
+
+ // 3.
+ auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
+ if (!LatchCmp) {
+ DEBUG(dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
+ return false;
+ }
+
+ Value *CondOp0 = LatchCmp->getOperand(0);
+ Value *CondOp1 = LatchCmp->getOperand(1);
+ Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
+ if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
+ !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
+ DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
+ return false;
+ }
+
+ return true;
+}
+
+// Return true if \p Lp and all its nested loops are uniform with regard to \p
+// OuterLp.
+static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
+ if (!isUniformLoop(Lp, OuterLp))
+ return false;
+
+ // Check if nested loops are uniform.
+ for (Loop *SubLp : *Lp)
+ if (!isUniformLoopNest(SubLp, OuterLp))
+ return false;
+
+ return true;
+}
+
+bool LoopVectorizationLegality::canVectorizeOuterLoop() {
+ assert(!TheLoop->empty() && "We are not vectorizing an outer loop.");
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Check whether the BB terminator is a BranchInst. Any other terminator is
+ // not supported yet.
+ auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br) {
+ DEBUG(dbgs() << "LV: Unsupported basic block terminator.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check whether the BranchInst is a supported one. Only unconditional
+ // branches, conditional branches with an outer loop invariant condition or
+ // backedges are supported.
+ if (Br && Br->isConditional() &&
+ !TheLoop->isLoopInvariant(Br->getCondition()) &&
+ !LI->isLoopHeader(Br->getSuccessor(0)) &&
+ !LI->isLoopHeader(Br->getSuccessor(1))) {
+ DEBUG(dbgs() << "LV: Unsupported conditional branch.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+ }
+
+ // Check whether inner loops are uniform. At this point, we only support
+ // simple outer loops scenarios with uniform nested loops.
+ if (!isUniformLoopNest(TheLoop /*loop nest*/,
+ TheLoop /*context outer loop*/)) {
+ DEBUG(dbgs()
+ << "LV: Not vectorizing: Outer loop contains divergent loops.\n");
+ ORE->emit(createMissedAnalysis("CFGNotUnderstood")
+ << "loop control flow is not understood by vectorizer");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
if (Ty->isPointerTy())
return DL.getIntPtrType(Ty);
@@ -7406,7 +7681,33 @@ void LoopVectorizationCostModel::collect
}
VectorizationFactor
+LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
+ unsigned UserVF) {
+ // Width 1 means no vectorize, cost 0 means uncomputed cost.
+ const VectorizationFactor NoVectorization = {1U, 0U};
+
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ if (!OrigLoop->empty()) {
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+ assert(UserVF && "Expected UserVF for outer loop vectorization.");
+ assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ buildVPlans(UserVF, UserVF);
+
+ return {UserVF, 0};
+ }
+
+ DEBUG(dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
+ "VPlan-native path.\n");
+ return NoVectorization;
+}
+
+VectorizationFactor
LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
+ assert(OrigLoop->empty() && "Inner loop expected.");
// Width 1 means no vectorize, cost 0 means uncomputed cost.
const VectorizationFactor NoVectorization = {1U, 0U};
Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
@@ -7969,6 +8270,19 @@ LoopVectorizationPlanner::createReplicat
LoopVectorizationPlanner::VPlanPtr
LoopVectorizationPlanner::buildVPlan(VFRange &Range,
const SmallPtrSetImpl<Value *> &NeedDef) {
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ if (!OrigLoop->empty()) {
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+
+ // Create new empty VPlan
+ auto Plan = llvm::make_unique<VPlan>();
+ return Plan;
+ }
+
+ assert(OrigLoop->empty() && "Inner loop expected.");
EdgeMaskCache.clear();
BlockMaskCache.clear();
DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
@@ -8298,8 +8612,45 @@ void VPWidenMemoryInstructionRecipe::exe
State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}
+// Process the loop in the VPlan-native vectorization path. This path builds
+// VPlan upfront in the vectorization pipeline, which allows to apply
+// VPlan-to-VPlan transformations from the very beginning without modifying the
+// input LLVM IR.
+static bool processLoopInVPlanNativePath(
+ Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
+ LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
+ TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) {
+
+ assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
+ Function *F = L->getHeader()->getParent();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
+ LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints, IAI);
+ // Use the planner for outer loop vectorization.
+ // TODO: CM is not used at this point inside the planner. Turn CM into an
+ // optional argument if we don't need it in the future.
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
+
+ // Get user vectorization factor.
+ unsigned UserVF = Hints.getWidth();
+
+ // Check the function attributes to find out if this function should be
+ // optimized for size.
+ bool OptForSize =
+ Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ LVP.planInVPlanNativePath(OptForSize, UserVF);
+
+ // Returning false. We are currently not generating vector code in the VPlan
+ // native path.
+ return false;
+}
+
bool LoopVectorizePass::processLoop(Loop *L) {
- assert(L->empty() && "Only process inner loops.");
+ assert((EnableVPlanNativePath || L->empty()) &&
+ "VPlan-native path is not enabled. Only process inner loops.");
#ifndef NDEBUG
const std::string DebugLocStr = getDebugLocString(L);
@@ -8354,6 +8705,16 @@ bool LoopVectorizePass::processLoop(Loop
bool OptForSize =
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+ // Entrance to the VPlan-native vectorization path. Outer loops are processed
+ // here. They may require CFG and instruction level transformations before
+ // even evaluating whether vectorization is profitable. Since we cannot modify
+ // the incoming IR, we need to build VPlan upfront in the vectorization
+ // pipeline.
+ if (!L->empty())
+ return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
+ ORE, Hints);
+
+ assert(L->empty() && "Inner loop expected.");
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
// Prefer constant trip counts over profile data, over upper bound estimate.
@@ -8630,7 +8991,7 @@ bool LoopVectorizePass::runImpl(
SmallVector<Loop *, 8> Worklist;
for (Loop *L : *LI)
- addAcyclicInnerLoop(*L, *LI, Worklist);
+ collectSupportedLoops(*L, LI, ORE, Worklist);
LoopsAnalyzed += Worklist.size();
Added: llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll?rev=330739&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll Tue Apr 24 10:04:17 2018
@@ -0,0 +1,238 @@
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Verify that outer loops annotated only with the expected explicit
+; vectorization hints are collected for vectorization instead of inner loops.
+
+; Root C/C++ source code for all the test cases
+; void foo(int *a, int *b, int N, int M)
+; {
+; int i, j;
+; #pragma clang loop vectorize(enable)
+; for (i = 0; i < N; i++) {
+; for (j = 0; j < M; j++) {
+; a[i*M+j] = b[i*M+j] * b[i*M+j];
+; }
+; }
+; }
+
+; Case 1: Annotated outer loop WITH vector width information must be collected.
+
+; CHECK-LABEL: vector_width
+; CHECK: LV: Loop hints: force=enabled width=4 unroll=0
+; CHECK: LV: We can vectorize this outer loop!
+; CHECK: LV: Using user VF 4.
+; CHECK-NOT: LV: Loop hints: force=?
+; CHECK-NOT: LV: Found a loop: inner.body
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @vector_width(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp230 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count38 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+ br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv35, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+ br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+; Case 2: Annotated outer loop WITHOUT vector width information doesn't have to
+; be collected.
+
+; CHECK-LABEL: case2
+; CHECK-NOT: LV: Loop hints: force=enabled
+; CHECK-NOT: LV: We can vectorize this outer loop!
+; CHECK: LV: Loop hints: force=?
+; CHECK: LV: Found a loop: inner.body
+
+define void @case2(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp230 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count38 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+ br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv35, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+ br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !9
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+; Case 3: Annotated outer loop WITH vector width and interleave information
+; doesn't have to be collected.
+
+; CHECK-LABEL: case3
+; CHECK-NOT: LV: Loop hints: force=enabled
+; CHECK-NOT: LV: We can vectorize this outer loop!
+; CHECK: LV: Loop hints: force=?
+; CHECK: LV: Found a loop: inner.body
+
+define void @case3(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp230 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count38 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+ br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv35, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+ br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !11
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+; Case 4: Outer loop without any explicit vectorization annotation doesn't have
+; to be collected.
+
+; CHECK-LABEL: case4
+; CHECK-NOT: LV: Loop hints: force=enabled
+; CHECK-NOT: LV: We can vectorize this outer loop!
+; CHECK: LV: Loop hints: force=?
+; CHECK: LV: Found a loop: inner.body
+
+define void @case4(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp230 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count38 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+ br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv35, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+ br i1 %exitcond39, label %for.end15, label %outer.body
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+; Case 1
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 4}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}
+; Case 2
+!9 = distinct !{!9, !8}
+; Case 3
+!10 = !{!"llvm.loop.interleave.count", i32 2}
+!11 = distinct !{!11, !7, !10, !8}
Added: llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll?rev=330739&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll Tue Apr 24 10:04:17 2018
@@ -0,0 +1,177 @@
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Verify that LV bails out on explicit vectorization outer loops that contain
+; divergent inner loops.
+
+; Root C/C++ source code for all the test cases
+; void foo(int *a, int *b, int N, int M)
+; {
+; int i, j;
+; #pragma clang loop vectorize(enable) vectorize_width(8)
+; for (i = 0; i < N; i++) {
+; // Tested inner loop. It will be replaced per test.
+; for (j = 0; j < M; j++) {
+; a[i*M+j] = b[i*M+j] * b[i*M+j];
+; }
+; }
+; }
+
+; Case 1 (for (j = i; j < M; j++)): Inner loop with divergent IV start.
+
+; CHECK-LABEL: iv_start
+; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @iv_start(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp33 = icmp sgt i32 %N, 0
+ br i1 %cmp33, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count41 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv38 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next39, %outer.inc ]
+ %cmp231 = icmp slt i64 %indvars.iv38, %0
+ br i1 %cmp231, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv38, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv35 = phi i64 [ %indvars.iv38, %inner.ph ], [ %indvars.iv.next36, %inner.body ]
+ %2 = add nsw i64 %indvars.iv35, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond = icmp eq i64 %indvars.iv.next36, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+ %exitcond42 = icmp eq i64 %indvars.iv.next39, %wide.trip.count41
+ br i1 %exitcond42, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+
+; Case 2 (for (j = 0; j < i; j++)): Inner loop with divergent upper-bound.
+
+; CHECK-LABEL: loop_ub
+; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @loop_ub(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %0 = sext i32 %M to i64
+ %wide.trip.count41 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv38 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next39, %outer.inc ]
+ %cmp230 = icmp eq i64 %indvars.iv38, 0
+ br i1 %cmp230, label %outer.inc, label %inner.ph
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv38, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %indvars.iv38
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+ %exitcond42 = icmp eq i64 %indvars.iv.next39, %wide.trip.count41
+ br i1 %exitcond42, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+; Case 3 (for (j = 0; j < M; j+=i)): Inner loop with divergent step.
+
+; CHECK-LABEL: iv_step
+; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @iv_step(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp33 = icmp sgt i32 %N, 0
+ br i1 %cmp33, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp231 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %for.inc14, %outer.ph
+ %indvars.iv39 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next40, %for.inc14 ]
+ br i1 %cmp231, label %inner.ph, label %for.inc14
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv39, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.ph, %inner.body
+ %indvars.iv36 = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next37, %inner.body ]
+ %2 = add nsw i64 %indvars.iv36, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, %indvars.iv39
+ %cmp2 = icmp slt i64 %indvars.iv.next37, %0
+ br i1 %cmp2, label %inner.body, label %for.inc14
+
+for.inc14: ; preds = %inner.body, %outer.body
+ %indvars.iv.next40 = add nuw nsw i64 %indvars.iv39, 1
+ %exitcond = icmp eq i64 %indvars.iv.next40, %wide.trip.count
+ br i1 %exitcond, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15: ; preds = %for.inc14, %entry
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}
Added: llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll?rev=330739&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll Tue Apr 24 10:04:17 2018
@@ -0,0 +1,138 @@
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Verify that LV can handle explicit vectorization outer loops with uniform branches
+; but bails out on outer loops with divergent branches.
+
+; Root C/C++ source code for the test cases
+; void foo(int *a, int *b, int N, int M)
+; {
+; int i, j;
+; #pragma clang loop vectorize(enable) vectorize_width(8)
+; for (i = 0; i < N; i++) {
+; // Tested conditional branch. COND will be replaced per test.
+; if (COND)
+; for (j = 0; j < M; j++) {
+; a[i*M+j] = b[i*M+j] * b[i*M+j];
+; }
+; }
+; }
+
+; Case 1 (COND => M == N): Outer loop with uniform conditional branch.
+
+; CHECK-LABEL: uniform_branch
+; CHECK: LV: We can vectorize this outer loop!
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @uniform_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp39 = icmp sgt i32 %N, 0
+ br i1 %cmp39, label %outer.ph, label %for.end19
+
+outer.ph: ; preds = %entry
+ %cmp337 = icmp slt i32 %M, 1
+ %0 = sext i32 %M to i64
+ %N64 = zext i32 %N to i64
+ %M64 = zext i32 %M to i64
+ %cmp1 = icmp ne i32 %M, %N ; Uniform condition
+ %brmerge = or i1 %cmp1, %cmp337 ; Uniform condition
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ]
+ %1 = mul nsw i64 %indvars.iv42, %0
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1
+ %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ br i1 %brmerge, label %outer.inc, label %inner.ph ; Supported uniform branch
+
+inner.ph: ; preds = %outer.body
+ br label %inner.body
+
+inner.body: ; preds = %inner.ph, %inner.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ]
+ %3 = add nsw i64 %indvars.iv, %1
+ %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3
+ %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2
+ %mul12 = mul nsw i32 %4, %4
+ %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3
+ store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %M64
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
+ %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64
+ br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6
+
+for.end19: ; preds = %outer.inc, %entry
+ ret void
+}
+
+
+; Case 2 (COND => B[i * M] == 0): Outer loop with divergent conditional branch.
+
+; CHECK-LABEL: divergent_branch
+; CHECK: Unsupported conditional branch.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @divergent_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp39 = icmp sgt i32 %N, 0
+ br i1 %cmp39, label %outer.ph, label %for.end19
+
+outer.ph: ; preds = %entry
+ %cmp337 = icmp slt i32 %M, 1
+ %0 = sext i32 %M to i64
+ %N64 = zext i32 %N to i64
+ %M64 = zext i32 %M to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ]
+ %1 = mul nsw i64 %indvars.iv42, %0
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1
+ %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %cmp1 = icmp ne i32 %2, 0 ; Divergent condition
+ %brmerge = or i1 %cmp1, %cmp337 ; Divergent condition
+ br i1 %brmerge, label %outer.inc, label %inner.ph ; Unsupported divergent branch.
+
+inner.ph: ; preds = %outer.body
+ br label %inner.body
+
+inner.body: ; preds = %inner.ph, %inner.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ]
+ %3 = add nsw i64 %indvars.iv, %1
+ %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3
+ %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2
+ %mul12 = mul nsw i32 %4, %4
+ %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3
+ store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %M64
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
+ %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64
+ br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6
+
+for.end19: ; preds = %outer.inc, %entry
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}
More information about the llvm-commits
mailing list