[llvm] r342197 - [VPlan] Implement initial vector code generation support for simple outer loops.

Hideki Saito via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 13 17:36:00 PDT 2018


Author: hsaito
Date: Thu Sep 13 17:36:00 2018
New Revision: 342197

URL: http://llvm.org/viewvc/llvm-project?rev=342197&view=rev
Log:
[VPlan] Implement initial vector code generation support for simple outer loops.

Summary:
[VPlan] Implement vector code generation support for simple outer loops.

Context: Patch Series #1 for outer loop vectorization support in LV using VPlan. (RFC: http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html).

This patch introduces vector code generation support for simple outer loops that are currently supported in the VPlan-native path. The changes here essentially do the following:

  - force vector code generation using explicit vectorize_width

  - add conservative early returns in cost model and other places for VPlanNativePath

  - add code for setting up outer loop inductions 

  - support for widening non-induction PHIs that can result from inner loops and uniform conditional branches

  - support for generating uniform inner branches

We plan to add a handful of C outer loop executable tests once the initial code generation support is committed. This patch is expected to be NFC for the inner loop vectorizer path. Since we are moving in the direction of supporting outer loop vectorization in LV, it may also be time to rename classes such as InnerLoopVectorizer.
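For reference, this is the shape of loop nest the new path handles, mirroring the C source quoted in the added outer_loop_test1.ll (a sketch for illustration, not part of the patch):

    // Outer loop forced to VF=4; requires -enable-vplan-native-path.
    // The inner loop's trip count is invariant, so its branch is uniform.
    extern int arr[8][8];
    extern int arr2[8];

    void foo(int n) {
    #pragma clang loop vectorize(enable) vectorize_width(4)
      for (int i1 = 0; i1 < 8; i1++) {
        arr2[i1] = i1;                // widened to a masked scatter
        for (int i2 = 0; i2 < 8; i2++)
          arr[i2][i1] = i1 + n;       // widened to a masked scatter
      }
    }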

Reviewers: fhahn, rengolin, hsaito, dcaballe, mkuper, hfinkel, Ayal

Reviewed By: fhahn, hsaito

Subscribers: dmgreen, bollu, tschuett, rkruppe, rogfer01, llvm-commits

Differential Revision: https://reviews.llvm.org/D50820

Added:
    llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test1.ll
    llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test2.ll
Modified:
    llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp
    llvm/trunk/lib/Transforms/Vectorize/VPlan.h
    llvm/trunk/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
    llvm/trunk/lib/Transforms/Vectorize/VPlanValue.h

Modified: llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h?rev=342197&r1=342196&r2=342197&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h (original)
+++ llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h Thu Sep 13 17:36:00 2018
@@ -332,6 +332,11 @@ private:
   /// If false, good old LV code.
   bool canVectorizeLoopNestCFG(Loop *Lp, bool UseVPlanNativePath);
 
+  /// Set up outer loop inductions by checking Phis in outer loop header for
+  /// supported inductions (int inductions). Return false if any of these Phis
+  /// is not a supported induction or if we fail to find an induction.
+  bool setupOuterLoopInductions();
+
   /// Return true if the pre-header, exiting and latch blocks of \p Lp
   /// (non-recursive) are considered legal for vectorization.
   /// Temporarily taking UseVPlanNativePath parameter. If true, take

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp?rev=342197&r1=342196&r2=342197&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp Thu Sep 13 17:36:00 2018
@@ -516,6 +516,18 @@ bool LoopVectorizationLegality::canVecto
       return false;
   }
 
+  // Check whether we are able to set up outer loop induction.
+  if (!setupOuterLoopInductions()) {
+    LLVM_DEBUG(
+        dbgs() << "LV: Not vectorizing: Unsupported outer loop Phi(s).\n");
+    ORE->emit(createMissedAnalysis("UnsupportedPhi")
+              << "Unsupported outer loop Phi(s)");
+    if (DoExtraAnalysis)
+      Result = false;
+    else
+      return false;
+  }
+
   return Result;
 }
 
@@ -571,6 +583,32 @@ void LoopVectorizationLegality::addInduc
   LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
 }
 
+bool LoopVectorizationLegality::setupOuterLoopInductions() {
+  BasicBlock *Header = TheLoop->getHeader();
+
+  // Returns true if a given Phi is a supported induction.
+  auto isSupportedPhi = [&](PHINode &Phi) -> bool {
+    InductionDescriptor ID;
+    if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
+        ID.getKind() == InductionDescriptor::IK_IntInduction) {
+      addInductionPhi(&Phi, ID, AllowedExit);
+      return true;
+    } else {
+      // Bail out for any Phi in the outer loop header that is not a supported
+      // induction.
+      LLVM_DEBUG(
+          dbgs()
+          << "LV: Found unsupported PHI for outer loop vectorization.\n");
+      return false;
+    }
+  };
+
+  if (llvm::all_of(Header->phis(), isSupportedPhi))
+    return true;
+  else
+    return false;
+}
+
 bool LoopVectorizationLegality::canVectorizeInstrs() {
   BasicBlock *Header = TheLoop->getHeader();
 

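To make the new legality check concrete, here is a hypothetical loop nest that setupOuterLoopInductions() rejects: besides the integer induction i, the outer loop header carries a PHI for the floating-point recurrence sum, which is not an IK_IntInduction, so the whole nest is left scalar and the "UnsupportedPhi" remark is emitted (names are illustrative):

    extern float A[1024];

    void bar(int n) {
      float sum = 0.0f;
    #pragma clang loop vectorize(enable) vectorize_width(4)
      for (int i = 0; i < n; i++) {   // i: supported integer induction
        sum += A[i];                  // sum: unsupported outer-header PHI
        for (int j = 0; j < n; j++)
          A[j] += sum;
      }
    }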
Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=342197&r1=342196&r2=342197&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Thu Sep 13 17:36:00 2018
@@ -58,6 +58,7 @@
 #include "LoopVectorizationPlanner.h"
 #include "VPRecipeBuilder.h"
 #include "VPlanHCFGBuilder.h"
+#include "VPlanHCFGTransforms.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -234,7 +235,7 @@ static cl::opt<unsigned> MaxNestedScalar
     cl::desc("The maximum interleave count to use when interleaving a scalar "
              "reduction in a nested loop."));
 
-static cl::opt<bool> EnableVPlanNativePath(
+cl::opt<bool> EnableVPlanNativePath(
     "enable-vplan-native-path", cl::init(false), cl::Hidden,
     cl::desc("Enable VPlan-native vectorization path with "
              "support for outer loop vectorization."));
@@ -419,6 +420,9 @@ public:
   /// the instruction.
   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
 
+  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
+  void fixNonInductionPHIs(void);
+
 protected:
   friend class LoopVectorizationPlanner;
 
@@ -686,6 +690,10 @@ protected:
   // Holds the end values for each induction variable. We save the end values
   // so we can later fix-up the external users of the induction variables.
   DenseMap<PHINode *, Value *> IVEndValues;
+
+  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
+  // fixed up at the end of vector code generation.
+  SmallVector<PHINode *, 8> OrigPHIsToFix;
 };
 
 class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -888,6 +896,12 @@ public:
   /// vectorization factor \p VF.
   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
+
+    // Cost model is not run in the VPlan-native path - return conservative
+    // result until this changes.
+    if (EnableVPlanNativePath)
+      return false;
+
     auto Scalars = InstsToScalarize.find(VF);
     assert(Scalars != InstsToScalarize.end() &&
            "VF not yet analyzed for scalarization profitability");
@@ -898,6 +912,12 @@ public:
   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
     if (VF == 1)
       return true;
+
+    // Cost model is not run in the VPlan-native path - return conservative
+    // result until this changes.
+    if (EnableVPlanNativePath)
+      return false;
+
     auto UniformsPerVF = Uniforms.find(VF);
     assert(UniformsPerVF != Uniforms.end() &&
            "VF not yet analyzed for uniformity");
@@ -908,6 +928,12 @@ public:
   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
     if (VF == 1)
       return true;
+
+    // Cost model is not run in the VPlan-native path - return conservative
+    // result until this changes.
+    if (EnableVPlanNativePath)
+      return false;
+
     auto ScalarsPerVF = Scalars.find(VF);
     assert(ScalarsPerVF != Scalars.end() &&
            "Scalar values are not calculated for VF");
@@ -962,6 +988,12 @@ public:
   /// through the cost modeling.
   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
     assert(VF >= 2 && "Expected VF >=2");
+
+    // Cost model is not run in the VPlan-native path - return conservative
+    // result until this changes.
+    if (EnableVPlanNativePath)
+      return CM_GatherScatter;
+
     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
     auto Itr = WideningDecisions.find(InstOnVF);
     if (Itr == WideningDecisions.end())
@@ -1397,8 +1429,16 @@ struct LoopVectorize : public FunctionPa
     AU.addRequired<LoopAccessLegacyAnalysis>();
     AU.addRequired<DemandedBitsWrapperPass>();
     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
-    AU.addPreserved<LoopInfoWrapperPass>();
-    AU.addPreserved<DominatorTreeWrapperPass>();
+
+    // We currently do not preserve loopinfo/dominator analyses with outer loop
+    // vectorization. Until this is addressed, mark these analyses as preserved
+    // only for non-VPlan-native path.
+    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
+    if (!EnableVPlanNativePath) {
+      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+    }
+
     AU.addPreserved<BasicAAWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
   }
@@ -1749,8 +1789,9 @@ Value *InnerLoopVectorizer::getOrCreateV
   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
 
-  // If we have a stride that is replaced by one, do it here.
-  if (Legal->hasStride(V))
+  // If we have a stride that is replaced by one, do it here. Defer this for
+  // the VPlan-native path until we start running Legal checks in that path.
+  if (!EnableVPlanNativePath && Legal->hasStride(V))
     V = ConstantInt::get(V->getType(), 1);
 
   // If we have a vector mapped to this value, return it.
@@ -2416,6 +2457,10 @@ void InnerLoopVectorizer::emitSCEVChecks
 }
 
 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
+  // VPlan-native path does not do any analysis for runtime checks currently.
+  if (EnableVPlanNativePath)
+    return;
+
   BasicBlock *BB = L->getLoopPreheader();
 
   // Generate the code that checks in runtime if arrays overlap. We put the
@@ -3060,6 +3105,13 @@ void InnerLoopVectorizer::fixVectorizedL
   if (VF > 1)
     truncateToMinimalBitwidths();
 
+  // Fix widened non-induction PHIs by setting up the PHI operands.
+  if (OrigPHIsToFix.size()) {
+    assert(EnableVPlanNativePath &&
+           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
+    fixNonInductionPHIs();
+  }
+
   // At this point every instruction in the original loop is widened to a
   // vector form. Now we need to fix the recurrences in the loop. These PHI
   // nodes are currently empty because we did not want to introduce cycles.
@@ -3532,12 +3584,62 @@ void InnerLoopVectorizer::sinkScalarOper
   } while (Changed);
 }
 
+void InnerLoopVectorizer::fixNonInductionPHIs() {
+  for (PHINode *OrigPhi : OrigPHIsToFix) {
+    PHINode *NewPhi =
+        cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
+    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
+
+    SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
+        predecessors(OrigPhi->getParent()));
+    SmallVector<BasicBlock *, 2> VectorBBPredecessors(
+        predecessors(NewPhi->getParent()));
+    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
+           "Scalar and Vector BB should have the same number of predecessors");
+
+    // The insertion point in Builder may be invalidated by the time we get
+    // here. Force the Builder insertion point to something valid so that we do
+    // not run into issues during insertion point restore in
+    // getOrCreateVectorValue calls below.
+    Builder.SetInsertPoint(NewPhi);
+
+    // The predecessor order is preserved and we can rely on mapping between
+    // scalar and vector block predecessors.
+    for (unsigned i = 0; i < NumIncomingValues; ++i) {
+      BasicBlock *NewPredBB = VectorBBPredecessors[i];
+
+      // When looking up the new scalar/vector values to fix up, use incoming
+      // values from original phi.
+      Value *ScIncV =
+          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
+
+      // Scalar incoming value may need a broadcast
+      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
+      NewPhi->addIncoming(NewIncV, NewPredBB);
+    }
+  }
+}
+
 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                               unsigned VF) {
+  PHINode *P = cast<PHINode>(PN);
+  if (EnableVPlanNativePath) {
+    // Currently we enter here in the VPlan-native path for non-induction
+    // PHIs where all control flow is uniform. We simply widen these PHIs.
+    // Create a vector phi with no operands - the vector phi operands will be
+    // set at the end of vector code generation.
+    Type *VecTy =
+        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
+    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
+    OrigPHIsToFix.push_back(P);
+
+    return;
+  }
+
   assert(PN->getParent() == OrigLoop->getHeader() &&
          "Non-header phis should have been handled elsewhere");
 
-  PHINode *P = cast<PHINode>(PN);
   // In order to support recurrences we need to be able to vectorize Phi nodes.
   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
@@ -3893,6 +3995,10 @@ void InnerLoopVectorizer::updateAnalysis
   // Forget the original basic block.
   PSE.getSE()->forgetLoop(OrigLoop);
 
+  // DT is not kept up-to-date for outer loop vectorization
+  if (EnableVPlanNativePath)
+    return;
+
   // Update the dominator tree information.
   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
          "Entry does not dominate exit.");
@@ -6527,6 +6633,13 @@ LoopVectorizationPlanner::buildVPlan(VFR
   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
   HCFGBuilder.buildHierarchicalCFG();
 
+  SmallPtrSet<Instruction *, 1> DeadInstructions;
+  VPlanHCFGTransforms::VPInstructionsToVPRecipes(
+      Plan, Legal->getInductionVars(), DeadInstructions);
+
+  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
+    Plan->addVF(VF);
+
   return Plan;
 }
 
@@ -6728,11 +6841,26 @@ static bool processLoopInVPlanNativePath
       Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
 
   // Plan how to best vectorize, return the best VF and its cost.
-  LVP.planInVPlanNativePath(OptForSize, UserVF);
+  VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
 
-  // Returning false. We are currently not generating vector code in the VPlan
-  // native path.
-  return false;
+  // If we are stress testing VPlan builds, do not attempt to generate vector
+  // code.
+  if (VPlanBuildStressTest)
+    return false;
+
+  LVP.setBestPlan(VF.Width, 1);
+
+  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, UserVF, 1, LVL,
+                         &CM);
+  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
+                    << L->getHeader()->getParent()->getName() << "\"\n");
+  LVP.executePlan(LB, DT);
+
+  // Mark the loop as already vectorized to avoid vectorizing again.
+  Hints.setAlreadyVectorized();
+
+  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
+  return true;
 }
 
 bool LoopVectorizePass::processLoop(Loop *L) {
@@ -7123,8 +7251,15 @@ PreservedAnalyses LoopVectorizePass::run
     if (!Changed)
       return PreservedAnalyses::all();
     PreservedAnalyses PA;
-    PA.preserve<LoopAnalysis>();
-    PA.preserve<DominatorTreeAnalysis>();
+
+    // We currently do not preserve loopinfo/dominator analyses with outer loop
+    // vectorization. Until this is addressed, mark these analyses as preserved
+    // only for non-VPlan-native path.
+    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
+    if (!EnableVPlanNativePath) {
+      PA.preserve<LoopAnalysis>();
+      PA.preserve<DominatorTreeAnalysis>();
+    }
     PA.preserve<BasicAA>();
     PA.preserve<GlobalsAA>();
     return PA;

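Two of the changes above share a deferred-fixup pattern: widened non-induction PHIs are created with no operands and recorded in OrigPHIsToFix, and (in VPlan.cpp below) branches are created with dummy successors and recorded in VPBBsToFix, both to be patched once every target block exists. A minimal, self-contained C++ sketch of that pattern, with purely illustrative names:

    #include <cassert>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Toy model: create nodes with null operands while the graph is under
    // construction, record them, and patch the operands once every
    // referenced node has been built.
    struct Node {
      std::string Name;
      std::vector<Node *> Operands; // placeholders until fixup runs
    };

    int main() {
      std::unordered_map<std::string, Node *> Built;
      std::vector<std::pair<Node *, std::vector<std::string>>> ToFix;

      // "header" references "latch" before it exists (a back edge), so
      // emit a placeholder operand and record the node for later fixup.
      Node Header{"header", {nullptr}};
      ToFix.push_back({&Header, {"latch"}});
      Built["header"] = &Header;

      // Forward references can be resolved immediately.
      Node Latch{"latch", {Built["header"]}};
      Built["latch"] = &Latch;

      // Fixup pass, analogous to fixNonInductionPHIs() above and to the
      // VPBBsToFix loop in VPlan::execute() below.
      for (auto &Entry : ToFix)
        for (size_t I = 0; I < Entry.second.size(); ++I)
          Entry.first->Operands[I] = Built[Entry.second[I]];

      assert(Header.Operands[0] == &Latch);
      return 0;
    }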
Modified: llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp?rev=342197&r1=342196&r2=342197&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp Thu Sep 13 17:36:00 2018
@@ -44,6 +44,7 @@
 #include <vector>
 
 using namespace llvm;
+extern cl::opt<bool> EnableVPlanNativePath;
 
 #define DEBUG_TYPE "vplan"
 
@@ -124,6 +125,20 @@ VPBasicBlock::createEmptyBasicBlock(VPTr
     VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock();
     auto &PredVPSuccessors = PredVPBB->getSuccessors();
     BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
+
+    // In the outer loop vectorization scenario, the predecessor BBlock may not
+    // yet be visited (backedge). Mark the VPBasicBlock for fixup at the end of
+    // vectorization. We do not encounter this case in inner loop vectorization
+    // as we start out by building a loop skeleton with the vector loop header
+    // and latch blocks. As a result, we never enter this function for the
+    // header block in the non VPlan-native path.
+    if (!PredBB) {
+      assert(EnableVPlanNativePath &&
+             "Unexpected null predecessor in non VPlan-native path");
+      CFG.VPBBsToFix.push_back(PredVPBB);
+      continue;
+    }
+
     assert(PredBB && "Predecessor basic-block not found building successor.");
     auto *PredBBTerminator = PredBB->getTerminator();
     LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
@@ -185,6 +200,35 @@ void VPBasicBlock::execute(VPTransformSt
   for (VPRecipeBase &Recipe : Recipes)
     Recipe.execute(*State);
 
+  VPValue *CBV;
+  if (EnableVPlanNativePath && (CBV = getCondBit())) {
+    Value *IRCBV = CBV->getUnderlyingValue();
+    assert(IRCBV && "Unexpected null underlying value for condition bit");
+
+    // Delete the condition bit at this point - it should be no longer needed.
+    delete CBV;
+    setCondBit(nullptr);
+
+    // Condition bit value in a VPBasicBlock is used as the branch selector. In
+    // the VPlan-native path case, since all branches are uniform we generate a
+    // branch instruction using the condition value from vector lane 0 and dummy
+    // successors. The successors are fixed later when the successor blocks are
+    // visited.
+    Value *NewCond = State->Callback.getOrCreateVectorValues(IRCBV, 0);
+    NewCond = State->Builder.CreateExtractElement(NewCond,
+                                                  State->Builder.getInt32(0));
+
+    // Replace the temporary unreachable terminator with the new conditional
+    // branch.
+    auto *CurrentTerminator = NewBB->getTerminator();
+    assert(isa<UnreachableInst>(CurrentTerminator) &&
+           "Expected to replace unreachable terminator with conditional "
+           "branch.");
+    auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond);
+    CondBr->setSuccessor(0, nullptr);
+    ReplaceInstWithInst(CurrentTerminator, CondBr);
+  }
+
   LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
 }
 
@@ -194,6 +238,20 @@ void VPRegionBlock::execute(VPTransformS
   if (!isReplicator()) {
     // Visit the VPBlocks connected to "this", starting from it.
     for (VPBlockBase *Block : RPOT) {
+      if (EnableVPlanNativePath) {
+        // The inner loop vectorization path does not represent loop preheader
+        // and exit blocks as part of the VPlan. In the VPlan-native path, skip
+        // vectorizing loop preheader block. In future, we may replace this
+        // check with the check for loop preheader.
+        if (Block->getNumPredecessors() == 0)
+          continue;
+
+        // Skip vectorizing loop exit block. In future, we may replace this
+        // check with the check for loop exit.
+        if (Block->getNumSuccessors() == 0)
+          continue;
+      }
+
       LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
       Block->execute(State);
     }
@@ -319,11 +377,32 @@ void VPlan::execute(VPTransformState *St
   for (VPBlockBase *Block : depth_first(Entry))
     Block->execute(State);
 
+  // Set up branch terminator successors for VPBBs in VPBBsToFix based on
+  // VPBB's successors.
+  for (auto VPBB : State->CFG.VPBBsToFix) {
+    assert(EnableVPlanNativePath &&
+           "Unexpected VPBBsToFix in non VPlan-native path");
+    BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB];
+    assert(BB && "Unexpected null basic block for VPBB");
+
+    unsigned Idx = 0;
+    auto *BBTerminator = BB->getTerminator();
+
+    for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) {
+      VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock();
+      BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]);
+      ++Idx;
+    }
+  }
+
   // 3. Merge the temporary latch created with the last basic-block filled.
   BasicBlock *LastBB = State->CFG.PrevBB;
   // Connect LastBB to VectorLatchBB to facilitate their merge.
-  assert(isa<UnreachableInst>(LastBB->getTerminator()) &&
-         "Expected VPlan CFG to terminate with unreachable");
+  assert((EnableVPlanNativePath ||
+          isa<UnreachableInst>(LastBB->getTerminator())) &&
+         "Expected InnerLoop VPlan CFG to terminate with unreachable");
+  assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) &&
+         "Expected VPlan CFG to terminate with branch in NativePath");
   LastBB->getTerminator()->eraseFromParent();
   BranchInst::Create(VectorLatchBB, LastBB);
 
@@ -333,7 +412,9 @@ void VPlan::execute(VPTransformState *St
   assert(Merged && "Could not merge last basic block with latch.");
   VectorLatchBB = LastBB;
 
-  updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB);
+  // We do not attempt to preserve DT for outer loop vectorization currently.
+  if (!EnableVPlanNativePath)
+    updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB);
 }
 
 void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,

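The condition-bit handling in VPBasicBlock::execute() above leans on the invariant that control flow in this path is uniform: every lane of the widened condition holds the same value, so branching on the value extracted from lane 0 loses nothing. A small stand-alone sketch of that invariant (illustrative only):

    #include <array>
    #include <cassert>

    // Stand-in for a widened <4 x i1> condition in a loop whose control
    // flow is uniform across lanes: all lanes agree, so lane 0 decides.
    int main() {
      for (int Iter = 1; Iter <= 8; ++Iter) {
        std::array<bool, 4> VecCond;
        VecCond.fill(Iter == 8);    // same compare result in every lane
        bool Lane0 = VecCond[0];    // models the extract of element 0
        for (bool Lane : VecCond)
          assert(Lane == Lane0);    // the uniformity invariant
        if (Lane0)
          break;                    // the generated scalar branch
      }
      return 0;
    }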
Modified: llvm/trunk/lib/Transforms/Vectorize/VPlan.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/VPlan.h?rev=342197&r1=342196&r2=342197&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VPlan.h (original)
+++ llvm/trunk/lib/Transforms/Vectorize/VPlan.h Thu Sep 13 17:36:00 2018
@@ -293,6 +293,10 @@ struct VPTransformState {
     /// of replication, maps the BasicBlock of the last replica created.
     SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
 
+    /// Vector of VPBasicBlocks whose terminator instruction needs to be fixed
+    /// up at the end of vector code generation.
+    SmallVector<VPBasicBlock *, 8> VPBBsToFix;
+
     CFGState() = default;
   } CFG;
 

Modified: llvm/trunk/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp?rev=342197&r1=342196&r2=342197&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp Thu Sep 13 17:36:00 2018
@@ -24,6 +24,17 @@ void VPlanHCFGTransforms::VPInstructions
 
   VPRegionBlock *TopRegion = dyn_cast<VPRegionBlock>(Plan->getEntry());
   ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());
+
+  // Condition bit VPValues get deleted during transformation to VPRecipes.
+  // Create new VPValues and save away as condition bits. These will be deleted
+  // after finalizing the vector IR basic blocks.
+  for (VPBlockBase *Base : RPOT) {
+    VPBasicBlock *VPBB = Base->getEntryBasicBlock();
+    if (auto *CondBit = VPBB->getCondBit()) {
+      auto *NCondBit = new VPValue(CondBit->getUnderlyingValue());
+      VPBB->setCondBit(NCondBit);
+    }
+  }
   for (VPBlockBase *Base : RPOT) {
     // Do not widen instructions in pre-header and exit blocks.
     if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0)

Modified: llvm/trunk/lib/Transforms/Vectorize/VPlanValue.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/VPlanValue.h?rev=342197&r1=342196&r2=342197&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VPlanValue.h (original)
+++ llvm/trunk/lib/Transforms/Vectorize/VPlanValue.h Thu Sep 13 17:36:00 2018
@@ -38,6 +38,9 @@ class VPUser;
 // and live-outs which the VPlan will need to fix accordingly.
 class VPValue {
   friend class VPBuilder;
+  friend class VPlanHCFGTransforms;
+  friend class VPBasicBlock;
+
 private:
   const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
 

Added: llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test1.ll?rev=342197&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test1.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test1.ll Thu Sep 13 17:36:00 2018
@@ -0,0 +1,82 @@
+; extern int arr[8][8];
+; extern int arr2[8];
+;
+; void foo(int n)
+; {
+;   int i1, i2;
+;
+; #pragma clang loop vectorize(enable) vectorize_width(4)
+;   for (i1 = 0; i1 < 8; i1++) {
+;     arr2[i1] = i1;
+;     for (i2 = 0; i2 < 8; i2++)
+;       arr[i2][i1] = i1 + n;
+;   }
+; }
+;
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0
+; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <4 x i64> %[[VecInd]]
+; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[VecIndTr]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]]
+; CHECK: br label %[[InnerLoop:.+]]
+
+; CHECK: [[InnerLoop]]:
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true
+; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
+; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+ at arr2 = external global [8 x i32], align 16
+ at arr = external global [8 x [8 x i32]], align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %n) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc8, %entry
+  %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+  %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21
+  %0 = trunc i64 %indvars.iv21 to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %1 = trunc i64 %indvars.iv21 to i32
+  %add = add nsw i32 %1, %n
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+  store i32 %add, i32* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3
+  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+  %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+  br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}

Added: llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test2.ll?rev=342197&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test2.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/outer_loop_test2.ll Thu Sep 13 17:36:00 2018
@@ -0,0 +1,112 @@
+; int A[1024], B[1024];
+;
+; void foo(int iCount, int c, int jCount)
+; {
+;
+;   int i, j;
+;
+; #pragma clang loop vectorize(enable) vectorize_width(4)
+;   for (i = 0; i < iCount; i++) {
+;     A[i] = c;
+;     for (j = 0; j < jCount; j++) {
+;       A[i] += B[j] + i;
+;     }
+;   }
+; }
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
+; CHECK: %[[ZeroTripChk:.*]] = icmp sgt i32 %jCount, 0
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[CVal0:.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+; CHECK-NEXT: %[[CSplat:.*]] = shufflevector <4 x i32> %[[CVal0]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: %[[ZVal0:.*]] = insertelement <4 x i1> undef, i1 %[[ZeroTripChk]], i32 0
+; CHECK-NEXT: %[[ZSplat:.*]] = shufflevector <4 x i1> %[[ZVal0]], <4 x i1> undef, <4 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, <4 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[CSplat]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[ZCmpExtr:.*]] = extractelement <4 x i1> %[[ZSplat]], i32 0
+; CHECK: br i1 %[[ZCmpExtr]], label %[[InnerForPh:.*]], label %[[OuterInc:.*]]
+
+; CHECK: [[InnerForPh]]:
+; CHECK: %[[WideAVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: br label %[[InnerForBody:.*]]
+
+; CHECK: [[InnerForBody]]:
+; CHECK: %[[InnerInd:.*]] = phi <4 x i64> [ %[[InnerIndNext:.*]], %[[InnerForBody]] ], [ zeroinitializer, %[[InnerForPh]] ]
+; CHECK: %[[AccumPhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext:.*]], %[[InnerForBody]] ], [ %[[WideAVal]], %[[InnerForPh]] ]
+; CHECK: %[[BAddr:.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, <4 x i64> %[[InnerInd]]
+; CHECK: %[[WideBVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %[[BAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK: %[[Add1:.*]] = add nsw <4 x i32> %[[WideBVal]], %[[VecIndTr]]
+; CHECK: %[[AccumPhiNext]] = add nsw <4 x i32> %[[Add1]], %[[AccumPhi]]
+; CHECK: %[[InnerIndNext]] = add nuw nsw <4 x i64> %[[InnerInd]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK: %[[InnerVecCond:.*]] = icmp eq <4 x i64> %[[InnerIndNext]], {{.*}}
+; CHECK: %[[InnerCond:.+]] = extractelement <4 x i1> %[[InnerVecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[InnerCrit:.*]], label %[[InnerForBody]]
+
+; CHECK: [[InnerCrit]]:
+; CHECK: %[[StorePhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext]], %[[InnerForBody]] ]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StorePhi]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK:  br label %[[ForInc]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
+; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], {{.*}}
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+ at A = common global [1024 x i32] zeroinitializer, align 16
+ at B = common global [1024 x i32] zeroinitializer, align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %iCount, i32 %c, i32 %jCount) {
+entry:
+  %cmp22 = icmp sgt i32 %iCount, 0
+  br i1 %cmp22, label %for.body.lr.ph, label %for.end11
+
+for.body.lr.ph:                                   ; preds = %entry
+  %cmp220 = icmp sgt i32 %jCount, 0
+  %wide.trip.count = zext i32 %jCount to i64
+  %wide.trip.count27 = zext i32 %iCount to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc9, %for.body.lr.ph
+  %indvars.iv25 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next26, %for.inc9 ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv25
+  store i32 %c, i32* %arrayidx, align 4
+  br i1 %cmp220, label %for.body3.lr.ph, label %for.inc9
+
+for.body3.lr.ph:                                  ; preds = %for.body
+  %arrayidx.promoted = load i32, i32* %arrayidx, align 4
+  %0 = trunc i64 %indvars.iv25 to i32
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+  %1 = phi i32 [ %arrayidx.promoted, %for.body3.lr.ph ], [ %add8, %for.body3 ]
+  %arrayidx5 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx5, align 4
+  %add = add nsw i32 %2, %0
+  %add8 = add nsw i32 %add, %1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond1.for.inc9_crit_edge, label %for.body3
+
+for.cond1.for.inc9_crit_edge:                     ; preds = %for.body3
+  store i32 %add8, i32* %arrayidx, align 4
+  br label %for.inc9
+
+for.inc9:                                         ; preds = %for.cond1.for.inc9_crit_edge, %for.body
+  %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
+  %exitcond28 = icmp eq i64 %indvars.iv.next26, %wide.trip.count27
+  br i1 %exitcond28, label %for.end11, label %for.body, !llvm.loop !1
+
+for.end11:                                        ; preds = %for.inc9, %entry
+  ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
