[llvm] 822be4b - Revert "[PassManager] add helper function to hold set of vector passes"

Mon May 10 08:11:28 PDT 2021

Author: Sanjay Patel
Date: 2021-05-10T10:59:30-04:00
New Revision: 822be4bec894134fa63ed4d2d289f353f3cfcc19

URL: https://github.com/llvm/llvm-project/commit/822be4bec894134fa63ed4d2d289f353f3cfcc19
DIFF: https://github.com/llvm/llvm-project/commit/822be4bec894134fa63ed4d2d289f353f3cfcc19.diff

LOG: Revert "[PassManager] add helper function to hold set of vector passes"

This reverts commit fefcb1f878c2dad435af604955661ca02a5302de.
It was supposed to be NFC, but as noted in the post-commit
comments in D102002, that was not true: SimplifyCFG uses
different parameters and there's a difference in an
extension point / callback.

Added: 
    

Modified: 
    llvm/include/llvm/Passes/PassBuilder.h
    llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
    llvm/lib/Passes/PassBuilder.cpp
    llvm/lib/Transforms/IPO/PassManagerBuilder.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index f8252b9583677..339a2b7d42412 100644

--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -709,9 +709,6 @@ class PassBuilder {
 
   void addRequiredLTOPreLinkPasses(ModulePassManager &MPM);
 
-  void addVectorPasses(OptimizationLevel Level, FunctionPassManager &FPM,
-                       bool IsLTO);
-
   static Optional<std::vector<PipelineElement>>
   parsePipelineText(StringRef Text);
 

diff  --git a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
index 76d5e8ff8ed0f..a9928c3f5a40d 100644
--- a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -218,7 +218,6 @@ class PassManagerBuilder {
   void addLateLTOOptimizationPasses(legacy::PassManagerBase &PM);
   void addPGOInstrPasses(legacy::PassManagerBase &MPM, bool IsCS);
   void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM);
-  void addVectorPasses(legacy::PassManagerBase &PM, bool IsLTO);
 
 public:
   /// populateFunctionPassManager - This fills in the function pass manager,

diff  --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index c32104df0ab4a..e6554f6106ed7 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1201,118 +1201,6 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   return MPM;
 }
 
-/// FIXME: Should LTO cause any 
diff erences to this set of passes?
-void PassBuilder::addVectorPasses(OptimizationLevel Level,
-                                  FunctionPassManager &FPM, bool IsLTO) {
-  FPM.addPass(LoopVectorizePass(
-      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
-
-  if (IsLTO) {
-    // The vectorizer may have significantly shortened a loop body; unroll
-    // again. Unroll small loops to hide loop backedge latency and saturate any
-    // parallel execution resources of an out-of-order processor. We also then
-    // need to clean up redundancies and loop invariant code.
-    // FIXME: It would be really good to use a loop-integrated instruction
-    // combiner for cleanup here so that the unrolling and LICM can be pipelined
-    // across the loop nests.
-    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
-    if (EnableUnrollAndJam && PTO.LoopUnrolling)
-      FPM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel()));
-    FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
-        Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
-        PTO.ForgetAllSCEVInLoopUnroll)));
-    FPM.addPass(WarnMissedTransformationsPass());
-  }
-
-  if (!IsLTO) {
-    // Eliminate loads by forwarding stores from the previous iteration to loads
-    // of the current iteration.
-    FPM.addPass(LoopLoadEliminationPass());
-  }
-  // Cleanup after the loop optimization passes.
-  FPM.addPass(InstCombinePass());
-
-  if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
-    // At higher optimization levels, try to clean up any runtime overlap and
-    // alignment checks inserted by the vectorizer. We want to track correlated
-    // runtime checks for two inner loops in the same outer loop, fold any
-    // common computations, hoist loop-invariant aspects out of any outer loop,
-    // and unswitch the runtime checks if possible. Once hoisted, we may have
-    // dead (or speculatable) control flows or more combining opportunities.
-    FPM.addPass(EarlyCSEPass());
-    FPM.addPass(CorrelatedValuePropagationPass());
-    FPM.addPass(InstCombinePass());
-    LoopPassManager LPM;
-    LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
-    LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
-                                       OptimizationLevel::O3));
-    FPM.addPass(
-        RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
-    FPM.addPass(createFunctionToLoopPassAdaptor(
-        std::move(LPM), EnableMSSALoopDependency,
-        /*UseBlockFrequencyInfo=*/true));
-    FPM.addPass(SimplifyCFGPass());
-    FPM.addPass(InstCombinePass());
-  }
-
-  // Now that we've formed fast to execute loop structures, we do further
-  // optimizations. These are run afterward as they might block doing complex
-  // analyses and transforms such as what are needed for loop vectorization.
-
-  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
-  // GVN, loop transforms, and others have already run, so it's now better to
-  // convert to more optimized IR using more aggressive simplify CFG options.
-  // The extra sinking transform can create larger basic blocks, so do this
-  // before SLP vectorization.
-  FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
-                                  .forwardSwitchCondToPhi(true)
-                                  .convertSwitchToLookupTable(true)
-                                  .needCanonicalLoops(false)
-                                  .hoistCommonInsts(true)
-                                  .sinkCommonInsts(true)));
-  if (IsLTO) {
-    FPM.addPass(SCCPPass());
-    FPM.addPass(InstCombinePass());
-    FPM.addPass(BDCEPass());
-  }
-
-  // Optimize parallel scalar instruction chains into SIMD instructions.
-  if (PTO.SLPVectorization) {
-    FPM.addPass(SLPVectorizerPass());
-    if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
-      FPM.addPass(EarlyCSEPass());
-    }
-  }
-
-  // Enhance/cleanup vector code.
-  FPM.addPass(VectorCombinePass());
-  if (IsLTO) {
-    // After vectorization, assume intrinsics may tell us more about pointer
-    // alignments.
-    FPM.addPass(AlignmentFromAssumptionsPass());
-  }
-
-  FPM.addPass(InstCombinePass());
-
-  if (!IsLTO) {
-    // The vectorizer may have significantly shortened a loop body; unroll
-    // again. Unroll small loops to hide loop backedge latency and saturate any
-    // parallel execution resources of an out-of-order processor. We also then
-    // need to clean up redundancies and loop invariant code.
-    // FIXME: It would be really good to use a loop-integrated instruction
-    // combiner for cleanup here so that the unrolling and LICM can be pipelined
-    // across the loop nests.
-    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
-    if (EnableUnrollAndJam && PTO.LoopUnrolling)
-      FPM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel()));
-    FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
-        Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
-        PTO.ForgetAllSCEVInLoopUnroll)));
-    FPM.addPass(WarnMissedTransformationsPass());
-    FPM.addPass(InstCombinePass());
-  }
-}
-
 ModulePassManager
 PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
                                              bool LTOPreLink) {
@@ -1407,8 +1295,83 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
   // from the TargetLibraryInfo.
   OptimizePM.addPass(InjectTLIMappings());
 
-  addVectorPasses(Level, OptimizePM, /* IsLTO */ false);
+  // Now run the core loop vectorizer.
+  OptimizePM.addPass(LoopVectorizePass(
+      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
+
+  // Eliminate loads by forwarding stores from the previous iteration to loads
+  // of the current iteration.
+  OptimizePM.addPass(LoopLoadEliminationPass());
+
+  // Cleanup after the loop optimization passes.
+  OptimizePM.addPass(InstCombinePass());
+
+  if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
+    // At higher optimization levels, try to clean up any runtime overlap and
+    // alignment checks inserted by the vectorizer. We want to track correlated
+    // runtime checks for two inner loops in the same outer loop, fold any
+    // common computations, hoist loop-invariant aspects out of any outer loop,
+    // and unswitch the runtime checks if possible. Once hoisted, we may have
+    // dead (or speculatable) control flows or more combining opportunities.
+    OptimizePM.addPass(EarlyCSEPass());
+    OptimizePM.addPass(CorrelatedValuePropagationPass());
+    OptimizePM.addPass(InstCombinePass());
+    LoopPassManager LPM;
+    LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
+    LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
+                                       OptimizationLevel::O3));
+    OptimizePM.addPass(
+        RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+    OptimizePM.addPass(createFunctionToLoopPassAdaptor(
+        std::move(LPM), EnableMSSALoopDependency,
+        /*UseBlockFrequencyInfo=*/true));
+    OptimizePM.addPass(SimplifyCFGPass());
+    OptimizePM.addPass(InstCombinePass());
+  }
+
+  // Now that we've formed fast to execute loop structures, we do further
+  // optimizations. These are run afterward as they might block doing complex
+  // analyses and transforms such as what are needed for loop vectorization.
+
+  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
+  // GVN, loop transforms, and others have already run, so it's now better to
+  // convert to more optimized IR using more aggressive simplify CFG options.
+  // The extra sinking transform can create larger basic blocks, so do this
+  // before SLP vectorization.
+  OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
+                                         .forwardSwitchCondToPhi(true)
+                                         .convertSwitchToLookupTable(true)
+                                         .needCanonicalLoops(false)
+                                         .hoistCommonInsts(true)
+                                         .sinkCommonInsts(true)));
+
+  // Optimize parallel scalar instruction chains into SIMD instructions.
+  if (PTO.SLPVectorization) {
+    OptimizePM.addPass(SLPVectorizerPass());
+    if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
+      OptimizePM.addPass(EarlyCSEPass());
+    }
+  }
 
+  // Enhance/cleanup vector code.
+  OptimizePM.addPass(VectorCombinePass());
+  OptimizePM.addPass(InstCombinePass());
+
+  // Unroll small loops to hide loop backedge latency and saturate any parallel
+  // execution resources of an out-of-order processor. We also then need to
+  // clean up redundancies and loop invariant code.
+  // FIXME: It would be really good to use a loop-integrated instruction
+  // combiner for cleanup here so that the unrolling and LICM can be pipelined
+  // across the loop nests.
+  // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
+  if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+    OptimizePM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel()));
+  }
+  OptimizePM.addPass(LoopUnrollPass(LoopUnrollOptions(
+      Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
+      PTO.ForgetAllSCEVInLoopUnroll)));
+  OptimizePM.addPass(WarnMissedTransformationsPass());
+  OptimizePM.addPass(InstCombinePass());
   OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
   OptimizePM.addPass(createFunctionToLoopPassAdaptor(
       LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
@@ -1862,9 +1825,39 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
       std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true));
 
   MainFPM.addPass(LoopDistributePass());
+  MainFPM.addPass(LoopVectorizePass(
+      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
+  // The vectorizer may have significantly shortened a loop body; unroll again.
+  MainFPM.addPass(LoopUnrollPass(LoopUnrollOptions(
+      Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
+      PTO.ForgetAllSCEVInLoopUnroll)));
+
+  MainFPM.addPass(WarnMissedTransformationsPass());
+
+  MainFPM.addPass(InstCombinePass());
+  MainFPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true)));
+  MainFPM.addPass(SCCPPass());
+  MainFPM.addPass(InstCombinePass());
+  MainFPM.addPass(BDCEPass());
+
+  // More scalar chains could be vectorized due to more alias information
+  if (PTO.SLPVectorization) {
+    MainFPM.addPass(SLPVectorizerPass());
+    if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
+      MainFPM.addPass(EarlyCSEPass());
+    }
+  }
+
+  MainFPM.addPass(VectorCombinePass()); // Clean up partial vectorization.
+
+  // After vectorization, assume intrinsics may tell us more about pointer
+  // alignments.
+  MainFPM.addPass(AlignmentFromAssumptionsPass());
 
-  addVectorPasses(Level, MainFPM, /* IsLTO */ true);
+  // FIXME: Conditionally run LoadCombine here, after it's ported
+  // (in case we still have this pass, given its questionable usefulness).
 
+  MainFPM.addPass(InstCombinePass());
   invokePeepholeEPCallbacks(MainFPM, Level);
   MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true));
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM)));

diff  --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index 6e74f4c929bd0..87731e5eb1207 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -523,109 +523,6 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
     MPM.add(createControlHeightReductionLegacyPass());
 }
 
-/// FIXME: Should LTO cause any 
diff erences to this set of passes?
-void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM,
-                                         bool IsLTO) {
-  PM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
-
-  if (IsLTO) {
-    // The vectorizer may have significantly shortened a loop body; unroll
-    // again. Unroll small loops to hide loop backedge latency and saturate any
-    // parallel execution resources of an out-of-order processor. We also then
-    // need to clean up redundancies and loop invariant code.
-    // FIXME: It would be really good to use a loop-integrated instruction
-    // combiner for cleanup here so that the unrolling and LICM can be pipelined
-    // across the loop nests.
-    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
-    if (EnableUnrollAndJam && !DisableUnrollLoops)
-      PM.add(createLoopUnrollAndJamPass(OptLevel));
-    PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                                ForgetAllSCEVInLoopUnroll));
-    PM.add(createWarnMissedTransformationsPass());
-  }
-
-  if (!IsLTO) {
-    // Eliminate loads by forwarding stores from the previous iteration to loads
-    // of the current iteration.
-    PM.add(createLoopLoadEliminationPass());
-  }
-  // Cleanup after the loop optimization passes.
-  PM.add(createInstructionCombiningPass());
-
-  if (OptLevel > 1 && ExtraVectorizerPasses) {
-    // At higher optimization levels, try to clean up any runtime overlap and
-    // alignment checks inserted by the vectorizer. We want to track correlated
-    // runtime checks for two inner loops in the same outer loop, fold any
-    // common computations, hoist loop-invariant aspects out of any outer loop,
-    // and unswitch the runtime checks if possible. Once hoisted, we may have
-    // dead (or speculatable) control flows or more combining opportunities.
-    PM.add(createEarlyCSEPass());
-    PM.add(createCorrelatedValuePropagationPass());
-    PM.add(createInstructionCombiningPass());
-    PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-    PM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
-    PM.add(createCFGSimplificationPass());
-    PM.add(createInstructionCombiningPass());
-  }
-
-  // Now that we've formed fast to execute loop structures, we do further
-  // optimizations. These are run afterward as they might block doing complex
-  // analyses and transforms such as what are needed for loop vectorization.
-
-  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
-  // GVN, loop transforms, and others have already run, so it's now better to
-  // convert to more optimized IR using more aggressive simplify CFG options.
-  // The extra sinking transform can create larger basic blocks, so do this
-  // before SLP vectorization.
-  PM.add(createCFGSimplificationPass(SimplifyCFGOptions()
-                                         .forwardSwitchCondToPhi(true)
-                                         .convertSwitchToLookupTable(true)
-                                         .needCanonicalLoops(false)
-                                         .hoistCommonInsts(true)
-                                         .sinkCommonInsts(true)));
-
-  if (IsLTO) {
-    PM.add(createSCCPPass()); // Propagate exposed constants
-    PM.add(createInstructionCombiningPass()); // Clean up again
-    PM.add(createBitTrackingDCEPass());
-  }
-
-  // Optimize parallel scalar instruction chains into SIMD instructions.
-  if (SLPVectorize) {
-    PM.add(createSLPVectorizerPass());
-    if (OptLevel > 1 && ExtraVectorizerPasses)
-      PM.add(createEarlyCSEPass());
-  }
-
-  // Enhance/cleanup vector code.
-  PM.add(createVectorCombinePass());
-
-  if (IsLTO) {
-    // After vectorization, assume intrinsics may tell us more about pointer
-    // alignments.
-    PM.add(createAlignmentFromAssumptionsPass());
-  }
-  addExtensionsToPM(EP_Peephole, PM);
-  PM.add(createInstructionCombiningPass());
-
-  if (!IsLTO) {
-    // The vectorizer may have significantly shortened a loop body; unroll
-    // again. Unroll small loops to hide loop backedge latency and saturate any
-    // parallel execution resources of an out-of-order processor. We also then
-    // need to clean up redundancies and loop invariant code.
-    // FIXME: It would be really good to use a loop-integrated instruction
-    // combiner for cleanup here so that the unrolling and LICM can be pipelined
-    // across the loop nests.
-    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
-    if (EnableUnrollAndJam && !DisableUnrollLoops)
-      PM.add(createLoopUnrollAndJamPass(OptLevel));
-    PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                                ForgetAllSCEVInLoopUnroll));
-    if (!DisableUnrollLoops)
-      PM.add(createInstructionCombiningPass());
-  }
-}
-
 void PassManagerBuilder::populateModulePassManager(
     legacy::PassManagerBase &MPM) {
   // Whether this is a default or *LTO pre-link pipeline. The FullLTO post-link
@@ -897,9 +794,74 @@ void PassManagerBuilder::populateModulePassManager(
   // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
   MPM.add(createLoopDistributePass());
 
-  addVectorPasses(MPM, /* IsLTO */ false);
+  MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
+
+  // Eliminate loads by forwarding stores from the previous iteration to loads
+  // of the current iteration.
+  MPM.add(createLoopLoadEliminationPass());
+
+  // FIXME: Because of #pragma vectorize enable, the passes below are always
+  // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
+  // on -O1 and no #pragma is found). Would be good to have these two passes
+  // as function calls, so that we can only pass them when the vectorizer
+  // changed the code.
+  MPM.add(createInstructionCombiningPass());
+  if (OptLevel > 1 && ExtraVectorizerPasses) {
+    // At higher optimization levels, try to clean up any runtime overlap and
+    // alignment checks inserted by the vectorizer. We want to track correllated
+    // runtime checks for two inner loops in the same outer loop, fold any
+    // common computations, hoist loop-invariant aspects out of any outer loop,
+    // and unswitch the runtime checks if possible. Once hoisted, we may have
+    // dead (or speculatable) control flows or more combining opportunities.
+    MPM.add(createEarlyCSEPass());
+    MPM.add(createCorrelatedValuePropagationPass());
+    MPM.add(createInstructionCombiningPass());
+    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+    MPM.add(createCFGSimplificationPass());
+    MPM.add(createInstructionCombiningPass());
+  }
+
+  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
+  // GVN, loop transforms, and others have already run, so it's now better to
+  // convert to more optimized IR using more aggressive simplify CFG options.
+  // The extra sinking transform can create larger basic blocks, so do this
+  // before SLP vectorization.
+  MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
+                                          .forwardSwitchCondToPhi(true)
+                                          .convertSwitchToLookupTable(true)
+                                          .needCanonicalLoops(false)
+                                          .hoistCommonInsts(true)
+                                          .sinkCommonInsts(true)));
+
+  if (SLPVectorize) {
+    MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+    if (OptLevel > 1 && ExtraVectorizerPasses) {
+      MPM.add(createEarlyCSEPass());
+    }
+  }
+
+  // Enhance/cleanup vector code.
+  MPM.add(createVectorCombinePass());
+
+  addExtensionsToPM(EP_Peephole, MPM);
+  MPM.add(createInstructionCombiningPass());
+
+  if (EnableUnrollAndJam && !DisableUnrollLoops) {
+    // Unroll and Jam. We do this before unroll but need to be in a separate
+    // loop pass manager in order for the outer loop to be processed by
+    // unroll and jam before the inner loop is unrolled.
+    MPM.add(createLoopUnrollAndJamPass(OptLevel));
+  }
+
+  // Unroll small loops
+  MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                               ForgetAllSCEVInLoopUnroll));
 
   if (!DisableUnrollLoops) {
+    // LoopUnroll may generate some redundency to cleanup.
+    MPM.add(createInstructionCombiningPass());
+
     // Runtime unrolling will introduce runtime check in loop prologue. If the
     // unrolled loop is a inner loop, then the prologue will be inside the
     // outer loop. LICM pass can help to promote the runtime check out if the
@@ -1121,9 +1083,35 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
   PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
                                     ForgetAllSCEVInLoopUnroll));
   PM.add(createLoopDistributePass());
-
-  addVectorPasses(PM, /* IsLTO */ true);
-
+  PM.add(createLoopVectorizePass(true, !LoopVectorize));
+  // The vectorizer may have significantly shortened a loop body; unroll again.
+  PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                              ForgetAllSCEVInLoopUnroll));
+
+  PM.add(createWarnMissedTransformationsPass());
+
+  // Now that we've optimized loops (in particular loop induction variables),
+  // we may have exposed more scalar opportunities. Run parts of the scalar
+  // optimizer again at this point.
+  PM.add(createInstructionCombiningPass()); // Initial cleanup
+  PM.add(createCFGSimplificationPass(SimplifyCFGOptions() // if-convert
+                                         .hoistCommonInsts(true)));
+  PM.add(createSCCPPass()); // Propagate exposed constants
+  PM.add(createInstructionCombiningPass()); // Clean up again
+  PM.add(createBitTrackingDCEPass());
+
+  // More scalar chains could be vectorized due to more alias information
+  if (SLPVectorize)
+    PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+
+  PM.add(createVectorCombinePass()); // Clean up partial vectorization.
+
+  // After vectorization, assume intrinsics may tell us more about pointer
+  // alignments.
+  PM.add(createAlignmentFromAssumptionsPass());
+
+  // Cleanup and simplify the code after the scalar optimizations.
+  PM.add(createInstructionCombiningPass());
   addExtensionsToPM(EP_Peephole, PM);
 
   PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true));