[llvm] r336062 - [UnrollAndJam] New Unroll and Jam pass

David Green via llvm-commits llvm-commits at lists.llvm.org
Sun Jul 1 05:47:30 PDT 2018


Author: dmgreen
Date: Sun Jul  1 05:47:30 2018
New Revision: 336062

URL: http://llvm.org/viewvc/llvm-project?rev=336062&view=rev
Log:
[UnrollAndJam] New Unroll and Jam pass

This is a simple implementation of the classical unroll-and-jam loop
optimisation.

The basic idea is that we take an outer loop of the form:

  for i..
    ForeBlocks(i)
    for j..
      SubLoopBlocks(i, j)
    AftBlocks(i)

Instead of doing normal inner or outer unrolling, we unroll as follows:

  for i... i+=2
    ForeBlocks(i)
    ForeBlocks(i+1)
    for j..
      SubLoopBlocks(i, j)
      SubLoopBlocks(i+1, j)
    AftBlocks(i)
    AftBlocks(i+1)
  Remainder Loop

So we have unrolled the outer loop, then jammed the two inner loops into
one. This can lead to a simpler inner loop if memory accesses can be shared
between the now jammed loops.

To do this we have to prove that the transformation is safe: both that the
memory accesses permit it (using dependence analysis) and that ForeBlocks(i+1)
can be moved before AftBlocks(i) and SubLoopBlocks(i, j).
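
As an illustrative example (not from this patch), consider a C loop nest
where a load in the inner loop is invariant in the outer induction variable:

  for (int i = 0; i < N; ++i)
    for (int j = 0; j < M; ++j)
      sum[i] += A[i][j] * B[j];  // load of B[j] is invariant in i

After unroll-and-jamming by 2, the load of B[j] happens once per j iteration
and feeds both the i and i+1 computations in the jammed inner loop.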

Differential Revision: https://reviews.llvm.org/D41953


Added:
    llvm/trunk/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h
    llvm/trunk/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
    llvm/trunk/lib/Transforms/Utils/LoopUnrollAndJam.cpp
    llvm/trunk/test/Transforms/LoopUnrollAndJam/
    llvm/trunk/test/Transforms/LoopUnrollAndJam/dependencies.ll
    llvm/trunk/test/Transforms/LoopUnrollAndJam/disable.ll
    llvm/trunk/test/Transforms/LoopUnrollAndJam/pragma.ll
    llvm/trunk/test/Transforms/LoopUnrollAndJam/unprofitable.ll
    llvm/trunk/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll
Modified:
    llvm/trunk/include/llvm-c/Transforms/Scalar.h
    llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
    llvm/trunk/include/llvm/InitializePasses.h
    llvm/trunk/include/llvm/LinkAllPasses.h
    llvm/trunk/include/llvm/Transforms/Scalar.h
    llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h
    llvm/trunk/lib/Passes/PassBuilder.cpp
    llvm/trunk/lib/Passes/PassRegistry.def
    llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
    llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp
    llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt
    llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp
    llvm/trunk/lib/Transforms/Scalar/Scalar.cpp
    llvm/trunk/lib/Transforms/Utils/CMakeLists.txt
    llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp

Modified: llvm/trunk/include/llvm-c/Transforms/Scalar.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm-c/Transforms/Scalar.h?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/include/llvm-c/Transforms/Scalar.h (original)
+++ llvm/trunk/include/llvm-c/Transforms/Scalar.h Sun Jul  1 05:47:30 2018
@@ -89,6 +89,9 @@ void LLVMAddLoopRerollPass(LLVMPassManag
 /** See llvm::createLoopUnrollPass function. */
 void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM);
 
+/** See llvm::createLoopUnrollAndJamPass function. */
+void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM);
+
 /** See llvm::createLoopUnswitchPass function. */
 void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM);
 

Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h Sun Jul  1 05:47:30 2018
@@ -422,6 +422,13 @@ public:
     bool AllowPeeling;
     /// Allow unrolling of all the iterations of the runtime loop remainder.
     bool UnrollRemainder;
+    /// Allow unroll and jam. Used to enable unroll and jam for the target.
+    bool UnrollAndJam;
+    /// Threshold for unroll and jam, for inner loop size. The 'Threshold'
+    /// value above is used during unroll and jam for the outer loop size.
+    /// This value is used in the same manner to limit the size of the inner
+    /// loop.
+    unsigned UnrollAndJamInnerLoopThreshold;
   };
 
   /// Get target-customized preferences for the generic loop unrolling

Modified: llvm/trunk/include/llvm/InitializePasses.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/InitializePasses.h?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/include/llvm/InitializePasses.h (original)
+++ llvm/trunk/include/llvm/InitializePasses.h Sun Jul  1 05:47:30 2018
@@ -226,6 +226,7 @@ void initializeLoopSimplifyCFGLegacyPass
 void initializeLoopSimplifyPass(PassRegistry&);
 void initializeLoopStrengthReducePass(PassRegistry&);
 void initializeLoopUnrollPass(PassRegistry&);
+void initializeLoopUnrollAndJamPass(PassRegistry&);
 void initializeLoopUnswitchPass(PassRegistry&);
 void initializeLoopVectorizePass(PassRegistry&);
 void initializeLoopVersioningLICMPass(PassRegistry&);

Modified: llvm/trunk/include/llvm/LinkAllPasses.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/LinkAllPasses.h?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/include/llvm/LinkAllPasses.h (original)
+++ llvm/trunk/include/llvm/LinkAllPasses.h Sun Jul  1 05:47:30 2018
@@ -132,6 +132,7 @@ namespace {
       (void) llvm::createLoopStrengthReducePass();
       (void) llvm::createLoopRerollPass();
       (void) llvm::createLoopUnrollPass();
+      (void) llvm::createLoopUnrollAndJamPass();
       (void) llvm::createLoopUnswitchPass();
       (void) llvm::createLoopVersioningLICMPass();
       (void) llvm::createLoopIdiomPass();

Modified: llvm/trunk/include/llvm/Transforms/Scalar.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Transforms/Scalar.h?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Transforms/Scalar.h (original)
+++ llvm/trunk/include/llvm/Transforms/Scalar.h Sun Jul  1 05:47:30 2018
@@ -192,6 +192,12 @@ Pass *createSimpleLoopUnrollPass(int Opt
 
 //===----------------------------------------------------------------------===//
 //
+// LoopUnrollAndJam - This pass is a simple loop unroll and jam pass.
+//
+Pass *createLoopUnrollAndJamPass(int OptLevel = 2);
+
+//===----------------------------------------------------------------------===//
+//
 // LoopReroll - This pass is a simple loop rerolling pass.
 //
 Pass *createLoopRerollPass();

Added: llvm/trunk/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h?rev=336062&view=auto
==============================================================================
--- llvm/trunk/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h (added)
+++ llvm/trunk/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h Sun Jul  1 05:47:30 2018
@@ -0,0 +1,35 @@
+//===- LoopUnrollAndJamPass.h -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
+#define LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
+
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Loop;
+struct LoopStandardAnalysisResults;
+class LPMUpdater;
+
+/// A simple loop unroll and jam transformation.
+class LoopUnrollAndJamPass : public PassInfoMixin<LoopUnrollAndJamPass> {
+  const int OptLevel;
+
+public:
+  explicit LoopUnrollAndJamPass(int OptLevel = 2) : OptLevel(OptLevel) {}
+  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+                        LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H

Modified: llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h (original)
+++ llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h Sun Jul  1 05:47:30 2018
@@ -19,11 +19,13 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 
 namespace llvm {
 
 class AssumptionCache;
 class BasicBlock;
+class DependenceInfo;
 class DominatorTree;
 class Loop;
 class LoopInfo;
@@ -78,8 +80,47 @@ bool canPeel(Loop *L);
 bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE,
               DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA);
 
+LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
+                                  unsigned TripMultiple, bool UnrollRemainder,
+                                  LoopInfo *LI, ScalarEvolution *SE,
+                                  DominatorTree *DT, AssumptionCache *AC,
+                                  OptimizationRemarkEmitter *ORE);
+
+bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
+                          DependenceInfo &DI);
+
+bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
+                        DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
+                        const SmallPtrSetImpl<const Value *> &EphValues,
+                        OptimizationRemarkEmitter *ORE, unsigned &TripCount,
+                        unsigned MaxTripCount, unsigned &TripMultiple,
+                        unsigned LoopSize,
+                        TargetTransformInfo::UnrollingPreferences &UP,
+                        bool &UseUpperBound);
+
+BasicBlock *foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
+                                     ScalarEvolution *SE, DominatorTree *DT);
+
+void remapInstruction(Instruction *I, ValueToValueMapTy &VMap);
+
+void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
+                             ScalarEvolution *SE, DominatorTree *DT,
+                             AssumptionCache *AC);
+
 MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
 
+TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
+    Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
+    Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
+    Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
+    Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling);
+
+unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
+                             bool &NotDuplicatable, bool &Convergent,
+                             const TargetTransformInfo &TTI,
+                             const SmallPtrSetImpl<const Value *> &EphValues,
+                             unsigned BEInsns);
+
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H

Modified: llvm/trunk/lib/Passes/PassBuilder.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Passes/PassBuilder.cpp?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/lib/Passes/PassBuilder.cpp (original)
+++ llvm/trunk/lib/Passes/PassBuilder.cpp Sun Jul  1 05:47:30 2018
@@ -121,6 +121,7 @@
 #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
 #include "llvm/Transforms/Scalar/LoopSink.h"
 #include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
+#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
 #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
 #include "llvm/Transforms/Scalar/LowerAtomic.h"
 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
@@ -179,6 +180,10 @@ static cl::opt<bool> EnableGVNSink(
     "enable-npm-gvn-sink", cl::init(false), cl::Hidden,
     cl::desc("Enable the GVN hoisting pass for the new PM (default = off)"));
 
+static cl::opt<bool> EnableUnrollAndJam(
+    "enable-npm-unroll-and-jam", cl::init(false), cl::Hidden,
+    cl::desc("Enable the Unroll and Jam pass for the new PM (default = off)"));
+
 static cl::opt<bool> EnableSyntheticCounts(
     "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore,
     cl::desc("Run synthetic function entry count generation "
@@ -798,6 +803,11 @@ PassBuilder::buildModuleOptimizationPipe
   // FIXME: It would be really good to use a loop-integrated instruction
   // combiner for cleanup here so that the unrolling and LICM can be pipelined
   // across the loop nests.
+  // We do UnrollAndJam in a separate LPM to ensure it happens before unrolling.
+  if (EnableUnrollAndJam) {
+    OptimizePM.addPass(
+        createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level)));
+  }
   OptimizePM.addPass(LoopUnrollPass(Level));
   OptimizePM.addPass(InstCombinePass());
   OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());

Modified: llvm/trunk/lib/Passes/PassRegistry.def
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Passes/PassRegistry.def?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/lib/Passes/PassRegistry.def (original)
+++ llvm/trunk/lib/Passes/PassRegistry.def Sun Jul  1 05:47:30 2018
@@ -241,6 +241,7 @@ LOOP_PASS("simplify-cfg", LoopSimplifyCF
 LOOP_PASS("strength-reduce", LoopStrengthReducePass())
 LOOP_PASS("indvars", IndVarSimplifyPass())
 LOOP_PASS("irce", IRCEPass())
+LOOP_PASS("unroll-and-jam", LoopUnrollAndJamPass())
 LOOP_PASS("unroll-full", LoopFullUnrollPass())
 LOOP_PASS("unswitch", SimpleLoopUnswitchPass())
 LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs()))

Modified: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp Sun Jul  1 05:47:30 2018
@@ -622,6 +622,8 @@ void ARMTTIImpl::getUnrollingPreferences
   UP.Runtime = true;
   UP.UnrollRemainder = true;
   UP.DefaultUnrollRuntimeCount = 4;
+  UP.UnrollAndJam = true;
+  UP.UnrollAndJamInnerLoopThreshold = 60;
 
   // Force unrolling small loops can be very useful because of the branch
   // taken cost of the backedge.

Modified: llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp (original)
+++ llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp Sun Jul  1 05:47:30 2018
@@ -96,6 +96,10 @@ static cl::opt<bool> EnableLoopInterchan
     "enable-loopinterchange", cl::init(false), cl::Hidden,
     cl::desc("Enable the new, experimental LoopInterchange Pass"));
 
+static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam",
+                                        cl::init(false), cl::Hidden,
+                                        cl::desc("Enable Unroll And Jam Pass"));
+
 static cl::opt<bool>
     EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
                             cl::desc("Enable preparation for ThinLTO."));
@@ -669,6 +673,13 @@ void PassManagerBuilder::populateModuleP
   addInstructionCombiningPass(MPM);
 
   if (!DisableUnrollLoops) {
+    if (EnableUnrollAndJam) {
+      // Unroll and Jam. We do this before unroll but need to be in a separate
+      // loop pass manager in order for the outer loop to be processed by
+      // unroll and jam before the inner loop is unrolled.
+      MPM.add(createLoopUnrollAndJamPass(OptLevel));
+    }
+
     MPM.add(createLoopUnrollPass(OptLevel));    // Unroll small loops
 
    // LoopUnroll may generate some redundancy to clean up.
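
As a usage sketch (illustrative, not part of this patch), since this flag
lives in the legacy PassManagerBuilder it should be reachable from clang as,
for example:

  clang -O3 -mllvm -enable-unroll-and-jam file.c

with the transformation then gated on the target setting UP.UnrollAndJam
(as the ARM change below does) or on -mllvm -allow-unroll-and-jam.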

Modified: llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt (original)
+++ llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt Sun Jul  1 05:47:30 2018
@@ -39,6 +39,7 @@ add_llvm_library(LLVMScalarOpts
   LoopSimplifyCFG.cpp
   LoopStrengthReduce.cpp
   LoopUnrollPass.cpp
+  LoopUnrollAndJamPass.cpp
   LoopUnswitch.cpp
   LoopVersioningLICM.cpp
   LowerAtomic.cpp

Added: llvm/trunk/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp?rev=336062&view=auto
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp (added)
+++ llvm/trunk/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp Sun Jul  1 05:47:30 2018
@@ -0,0 +1,447 @@
+//===- LoopUnrollAndJamPass.cpp - Loop unroll and jam pass ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements an unroll and jam pass. Most of the work is done by
+// Utils/LoopUnrollAndJam.cpp.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll-and-jam"
+
+static cl::opt<bool>
+    AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden,
+                      cl::desc("Allows loops to be unroll-and-jammed."));
+
+static cl::opt<unsigned> UnrollAndJamCount(
+    "unroll-and-jam-count", cl::Hidden,
+    cl::desc("Use this unroll count for all loops including those with "
+             "unroll_and_jam_count pragma values, for testing purposes"));
+
+static cl::opt<unsigned> UnrollAndJamThreshold(
+    "unroll-and-jam-threshold", cl::init(60), cl::Hidden,
+    cl::desc("Threshold to use for inner loop when doing unroll and jam."));
+
+static cl::opt<unsigned> PragmaUnrollAndJamThreshold(
+    "pragma-unroll-and-jam-threshold", cl::init(1024), cl::Hidden,
+    cl::desc("Unrolled size limit for loops with an unroll_and_jam(full) or "
+             "unroll_count pragma."));
+
+// Returns the loop hint metadata node with the given name (for example,
+// "llvm.loop.unroll.count").  If no such metadata node exists, then nullptr is
+// returned.
+static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+  if (MDNode *LoopID = L->getLoopID())
+    return GetUnrollMetadata(LoopID, Name);
+  return nullptr;
+}
+
+// Returns true if the loop has any metadata starting with Prefix. For example,
+// a Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata.
+static bool HasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
+  if (MDNode *LoopID = L->getLoopID()) {
+    // First operand should refer to the loop id itself.
+    assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+    assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+    for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+      MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+      if (!MD)
+        continue;
+
+      MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+      if (!S)
+        continue;
+
+      if (S->getString().startswith(Prefix))
+        return true;
+    }
+  }
+  return false;
+}
+
+// Returns true if the loop has an unroll_and_jam(enable) pragma.
+static bool HasUnrollAndJamEnablePragma(const Loop *L) {
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable");
+}
+
+// Returns true if the loop has an unroll_and_jam(disable) pragma.
+static bool HasUnrollAndJamDisablePragma(const Loop *L) {
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.disable");
+}
+
+// If the loop has an unroll_and_jam_count pragma, return the (necessarily
+// positive) value from the pragma.  Otherwise return 0.
+static unsigned UnrollAndJamCountPragmaValue(const Loop *L) {
+  MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count");
+  if (MD) {
+    assert(MD->getNumOperands() == 2 &&
+           "Unroll count hint metadata should have two operands.");
+    unsigned Count =
+        mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
+    assert(Count >= 1 && "Unroll count must be positive.");
+    return Count;
+  }
+  return 0;
+}
+
+// Returns an estimate of the loop size after unroll and jamming by UP.Count.
+static uint64_t
+getUnrollAndJammedLoopSize(unsigned LoopSize,
+                           TargetTransformInfo::UnrollingPreferences &UP) {
+  assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
+  return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
+}
+
+// Calculates unroll and jam count and writes it to UP.Count. Returns true if
+// unroll count was set explicitly.
+static bool computeUnrollAndJamCount(
+    Loop *L, Loop *SubLoop, const TargetTransformInfo &TTI, DominatorTree &DT,
+    LoopInfo *LI, ScalarEvolution &SE,
+    const SmallPtrSetImpl<const Value *> &EphValues,
+    OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
+    unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount,
+    unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP) {
+  // Check for explicit Count from the "unroll-and-jam-count" option.
+  bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0;
+  if (UserUnrollCount) {
+    UP.Count = UnrollAndJamCount;
+    UP.Force = true;
+    if (UP.AllowRemainder &&
+        getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
+        getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
+            UP.UnrollAndJamInnerLoopThreshold)
+      return true;
+  }
+
+  // Check for unroll_and_jam pragmas
+  unsigned PragmaCount = UnrollAndJamCountPragmaValue(L);
+  if (PragmaCount > 0) {
+    UP.Count = PragmaCount;
+    UP.Runtime = true;
+    UP.Force = true;
+    if ((UP.AllowRemainder || (OuterTripMultiple % PragmaCount == 0)) &&
+        getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
+        getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
+            UP.UnrollAndJamInnerLoopThreshold)
+      return true;
+  }
+
+  // Use computeUnrollCount from the loop unroller to get a sensible count
+  // for unrolling the outer loop. This uses UP.Threshold /
+  // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
+  // We have already checked that the loop has no unroll.* pragmas.
+  unsigned MaxTripCount = 0;
+  bool UseUpperBound = false;
+  bool ExplicitUnroll = computeUnrollCount(
+      L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
+      OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
+  if (ExplicitUnroll || UseUpperBound) {
+    // If the user explicitly set the loop as unrolled, don't UnJ it. Leave it
+    // for the unroller instead.
+    UP.Count = 0;
+    return false;
+  }
+
+  bool PragmaEnableUnroll = HasUnrollAndJamEnablePragma(L);
+  ExplicitUnroll = PragmaCount > 0 || PragmaEnableUnroll || UserUnrollCount;
+
+  // If the loop has an unrolling pragma, we want to be more aggressive with
+  // unrolling limits.
+  if (ExplicitUnroll && OuterTripCount != 0)
+    UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;
+
+  if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+                                UP.UnrollAndJamInnerLoopThreshold) {
+    UP.Count = 0;
+    return false;
+  }
+
+  // If the inner loop count is known and small, leave the entire loop nest
+  // to the unroller.
+  if (!ExplicitUnroll && InnerTripCount &&
+      InnerLoopSize * InnerTripCount < UP.Threshold) {
+    UP.Count = 0;
+    return false;
+  }
+
+  // We have a sensible limit for the outer loop, now adjust it for the inner
+  // loop and UP.UnrollAndJamInnerLoopThreshold.
+  while (UP.Count != 0 && UP.AllowRemainder &&
+         getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+             UP.UnrollAndJamInnerLoopThreshold)
+    UP.Count--;
+
+  if (!ExplicitUnroll) {
+    // Check for situations where UnJ is likely to be unprofitable, such as
+    // subloops with more than one block.
+    if (SubLoop->getBlocks().size() != 1) {
+      UP.Count = 0;
+      return false;
+    }
+
+    // Limit to loops where there is something to gain from unrolling and
+    // jamming the loop. In this case, look for loads that are invariant in the
+    // outer loop and can become shared.
+    unsigned NumInvariant = 0;
+    for (BasicBlock *BB : SubLoop->getBlocks()) {
+      for (Instruction &I : *BB) {
+        if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+          Value *V = Ld->getPointerOperand();
+          const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
+          if (SE.isLoopInvariant(LSCEV, L))
+            NumInvariant++;
+        }
+      }
+    }
+    if (NumInvariant == 0) {
+      UP.Count = 0;
+      return false;
+    }
+  }
+
+  return ExplicitUnroll;
+}
+
+static LoopUnrollResult
+tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
+                      ScalarEvolution &SE, const TargetTransformInfo &TTI,
+                      AssumptionCache &AC, DependenceInfo &DI,
+                      OptimizationRemarkEmitter &ORE, int OptLevel) {
+  // Quick checks of the correct loop form
+  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
+    return LoopUnrollResult::Unmodified;
+  Loop *SubLoop = L->getSubLoops()[0];
+  if (!SubLoop->isLoopSimplifyForm())
+    return LoopUnrollResult::Unmodified;
+
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlock *Exit = L->getExitingBlock();
+  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
+
+  if (Latch != Exit || SubLoopLatch != SubLoopExit)
+    return LoopUnrollResult::Unmodified;
+
+  TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
+      L, SE, TTI, OptLevel, None, None, None, None, None, None);
+  if (AllowUnrollAndJam.getNumOccurrences() > 0)
+    UP.UnrollAndJam = AllowUnrollAndJam;
+  if (UnrollAndJamThreshold.getNumOccurrences() > 0)
+    UP.UnrollAndJamInnerLoopThreshold = UnrollAndJamThreshold;
+  // Exit early if unrolling is disabled.
+  if (!UP.UnrollAndJam || UP.UnrollAndJamInnerLoopThreshold == 0)
+    return LoopUnrollResult::Unmodified;
+
+  LLVM_DEBUG(dbgs() << "Loop Unroll and Jam: F["
+                    << L->getHeader()->getParent()->getName() << "] Loop %"
+                    << L->getHeader()->getName() << "\n");
+
+  // A loop with any unroll pragma (enabling/disabling/count/etc) is left for
+  // the unroller, so long as it does not explicitly have unroll_and_jam
+  // metadata. This means #pragma nounroll will disable unroll and jam as well
+  // as unrolling.
+  if (HasUnrollAndJamDisablePragma(L) ||
+      (HasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
+       !HasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam."))) {
+    LLVM_DEBUG(dbgs() << "  Disabled due to pragma.\n");
+    return LoopUnrollResult::Unmodified;
+  }
+
+  if (!isSafeToUnrollAndJam(L, SE, DT, DI)) {
+    LLVM_DEBUG(dbgs() << "  Disabled due to not being safe.\n");
+    return LoopUnrollResult::Unmodified;
+  }
+
+  // Approximate the loop size and collect useful info
+  unsigned NumInlineCandidates;
+  bool NotDuplicatable;
+  bool Convergent;
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+  unsigned InnerLoopSize =
+      ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
+                          Convergent, TTI, EphValues, UP.BEInsns);
+  unsigned OuterLoopSize =
+      ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+                          TTI, EphValues, UP.BEInsns);
+  LLVM_DEBUG(dbgs() << "  Outer Loop Size: " << OuterLoopSize << "\n");
+  LLVM_DEBUG(dbgs() << "  Inner Loop Size: " << InnerLoopSize << "\n");
+  if (NotDuplicatable) {
+    LLVM_DEBUG(dbgs() << "  Not unrolling loop which contains non-duplicatable "
+                         "instructions.\n");
+    return LoopUnrollResult::Unmodified;
+  }
+  if (NumInlineCandidates != 0) {
+    LLVM_DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
+    return LoopUnrollResult::Unmodified;
+  }
+  if (Convergent) {
+    LLVM_DEBUG(
+        dbgs() << "  Not unrolling loop with convergent instructions.\n");
+    return LoopUnrollResult::Unmodified;
+  }
+
+  // Find trip count and trip multiple
+  unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
+  unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
+  unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
+
+  // Decide if, and by how much, to unroll
+  bool IsCountSetExplicitly = computeUnrollAndJamCount(
+      L, SubLoop, TTI, DT, LI, SE, EphValues, &ORE, OuterTripCount,
+      OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP);
+  if (UP.Count <= 1)
+    return LoopUnrollResult::Unmodified;
+  // Unroll factor (Count) must be less than or equal to TripCount.
+  if (OuterTripCount && UP.Count > OuterTripCount)
+    UP.Count = OuterTripCount;
+
+  LoopUnrollResult UnrollResult =
+      UnrollAndJamLoop(L, UP.Count, OuterTripCount, OuterTripMultiple,
+                       UP.UnrollRemainder, LI, &SE, &DT, &AC, &ORE);
+
+  // If the loop has an unroll count pragma, or was unrolled by an explicitly
+  // set count, mark it as unrolled to prevent unrolling beyond that requested.
+  if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
+    L->setLoopAlreadyUnrolled();
+
+  return UnrollResult;
+}
+
+namespace {
+
+class LoopUnrollAndJam : public LoopPass {
+public:
+  static char ID; // Pass ID, replacement for typeid
+  unsigned OptLevel;
+
+  LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) {
+    initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+    if (skipLoop(L))
+      return false;
+
+    Function &F = *L->getHeader()->getParent();
+
+    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    const TargetTransformInfo &TTI =
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
+    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+    // pass.  Function analyses need to be preserved across loop transformations
+    // but ORE cannot be preserved (see comment before the pass definition).
+    OptimizationRemarkEmitter ORE(&F);
+
+    LoopUnrollResult Result =
+        tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel);
+
+    if (Result == LoopUnrollResult::FullyUnrolled)
+      LPM.markLoopAsDeleted(*L);
+
+    return Result != LoopUnrollResult::Unmodified;
+  }
+
+  /// This transformation requires natural loop information & requires that
+  /// loop preheaders be inserted into the CFG...
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<DependenceAnalysisWrapperPass>();
+    getLoopAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+char LoopUnrollAndJam::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam",
+                      "Unroll and Jam loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
+INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam",
+                    "Unroll and Jam loops", false, false)
+
+Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) {
+  return new LoopUnrollAndJam(OptLevel);
+}
+
+PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM,
+                                            LoopStandardAnalysisResults &AR,
+                                            LPMUpdater &) {
+  const auto &FAM =
+      AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+  Function *F = L.getHeader()->getParent();
+
+  auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
+  // FIXME: This should probably be optional rather than required.
+  if (!ORE)
+    report_fatal_error(
+        "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at "
+        "a higher level");
+
+  DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);
+
+  LoopUnrollResult Result = tryToUnrollAndJamLoop(
+      &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel);
+
+  if (Result == LoopUnrollResult::Unmodified)
+    return PreservedAnalyses::all();
+
+  return getLoopPassPreservedAnalyses();
+}
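
For reference, a sketch of the loop metadata that the pragma helpers above
match; this IR is illustrative rather than taken from the patch:

  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
  ...
  !0 = distinct !{!0, !1}
  !1 = !{!"llvm.loop.unroll_and_jam.count", i32 4}

The loop ID's first operand refers to the node itself and the count node has
exactly two operands, matching the asserts in HasAnyUnrollPragma and
UnrollAndJamCountPragmaValue.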

Modified: llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp Sun Jul  1 05:47:30 2018
@@ -165,7 +165,7 @@ static const unsigned NoThreshold = std:
 
 /// Gather the various unrolling parameters based on the defaults, compiler
 /// flags, TTI overrides and user specified parameters.
-static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
+TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
     Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
     Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
     Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
@@ -192,6 +192,8 @@ static TargetTransformInfo::UnrollingPre
   UP.Force = false;
   UP.UpperBound = false;
   UP.AllowPeeling = true;
+  UP.UnrollAndJam = false;
+  UP.UnrollAndJamInnerLoopThreshold = 60;
 
   // Override with any target specific settings
   TTI.getUnrollingPreferences(L, SE, UP);
@@ -615,11 +617,10 @@ static Optional<EstimatedUnrollCost> ana
 }
 
 /// ApproximateLoopSize - Approximate the size of the loop.
-static unsigned
-ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable,
-                    bool &Convergent, const TargetTransformInfo &TTI,
-                    const SmallPtrSetImpl<const Value *> &EphValues,
-                    unsigned BEInsns) {
+unsigned llvm::ApproximateLoopSize(
+    const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
+    const TargetTransformInfo &TTI,
+    const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
   CodeMetrics Metrics;
   for (BasicBlock *BB : L->blocks())
     Metrics.analyzeBasicBlock(BB, TTI, EphValues);
@@ -712,7 +713,7 @@ static uint64_t getUnrolledLoopSize(
 
 // Returns true if unroll count was set explicitly.
 // Calculates unroll count and writes it to UP.Count.
-static bool computeUnrollCount(
+bool llvm::computeUnrollCount(
     Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
     ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
     OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
@@ -753,8 +754,8 @@ static bool computeUnrollCount(
 
   if (ExplicitUnroll && TripCount != 0) {
     // If the loop has an unrolling pragma, we want to be more aggressive with
-    // unrolling limits. Set thresholds to at least the PragmaThreshold value
-    // which is larger than the default limits.
+    // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
+    // value which is larger than the default limits.
     UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
     UP.PartialThreshold =
         std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);

Modified: llvm/trunk/lib/Transforms/Scalar/Scalar.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/Scalar.cpp?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/Scalar.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/Scalar.cpp Sun Jul  1 05:47:30 2018
@@ -70,6 +70,7 @@ void llvm::initializeScalarOpts(PassRegi
   initializeLoopStrengthReducePass(Registry);
   initializeLoopRerollPass(Registry);
   initializeLoopUnrollPass(Registry);
+  initializeLoopUnrollAndJamPass(Registry);
   initializeLoopUnswitchPass(Registry);
   initializeLoopVersioningLICMPass(Registry);
   initializeLoopIdiomRecognizeLegacyPassPass(Registry);
@@ -185,6 +186,10 @@ void LLVMAddLoopUnrollPass(LLVMPassManag
   unwrap(PM)->add(createLoopUnrollPass());
 }
 
+void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopUnrollAndJamPass());
+}
+
 void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createLoopUnswitchPass());
 }

Modified: llvm/trunk/lib/Transforms/Utils/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Utils/CMakeLists.txt?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Utils/CMakeLists.txt (original)
+++ llvm/trunk/lib/Transforms/Utils/CMakeLists.txt Sun Jul  1 05:47:30 2018
@@ -28,6 +28,7 @@ add_llvm_library(LLVMTransformUtils
   LoopRotationUtils.cpp
   LoopSimplify.cpp
   LoopUnroll.cpp
+  LoopUnrollAndJam.cpp
   LoopUnrollPeel.cpp
   LoopUnrollRuntime.cpp
   LoopUtils.cpp

Modified: llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp?rev=336062&r1=336061&r2=336062&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp (original)
+++ llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp Sun Jul  1 05:47:30 2018
@@ -63,8 +63,7 @@ UnrollVerifyDomtree("unroll-verify-domtr
 
 /// Convert the instruction operands from referencing the current values into
 /// those specified by VMap.
-static inline void remapInstruction(Instruction *I,
-                                    ValueToValueMapTy &VMap) {
+void llvm::remapInstruction(Instruction *I, ValueToValueMapTy &VMap) {
   for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
     Value *Op = I->getOperand(op);
 
@@ -98,9 +97,9 @@ static inline void remapInstruction(Inst
 /// Folds a basic block into its predecessor if it only has one predecessor, and
 /// that predecessor only has one successor.
 /// The LoopInfo Analysis that is passed will be kept consistent.
-static BasicBlock *
-foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
-                         DominatorTree *DT) {
+BasicBlock *llvm::foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
+                                           ScalarEvolution *SE,
+                                           DominatorTree *DT) {
   // Merge basic blocks into their predecessor if there is only one distinct
   // pred, and if there is only one distinct successor of the predecessor, and
   // if there are no PHI nodes.
@@ -110,7 +109,8 @@ foldBlockIntoPredecessor(BasicBlock *BB,
   if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
     return nullptr;
 
-  LLVM_DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred);
+  LLVM_DEBUG(dbgs() << "Merging: " << BB->getName() << " into "
+                    << OnlyPred->getName() << "\n");
 
   // Resolve any PHI nodes at the start of the block.  They are all
   // guaranteed to have exactly one entry if they exist, unless there are
@@ -255,9 +255,9 @@ static bool isEpilogProfitable(Loop *L)
 /// Perform some cleanup and simplifications on loops after unrolling. It is
 /// useful to simplify the IV's in the new loop, as well as do a quick
 /// simplify/dce pass of the instructions.
-static void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
-                                    ScalarEvolution *SE, DominatorTree *DT,
-                                    AssumptionCache *AC) {
+void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
+                                   ScalarEvolution *SE, DominatorTree *DT,
+                                   AssumptionCache *AC) {
   // Simplify any new induction variables in the partially unrolled loop.
   if (SE && SimplifyIVs) {
     SmallVector<WeakTrackingVH, 16> DeadInsts;
@@ -473,8 +473,8 @@ LoopUnrollResult llvm::UnrollLoop(
     if (Force)
       RuntimeTripCount = false;
     else {
-      LLVM_DEBUG(dbgs() << "Wont unroll; remainder loop could not be generated"
-                           "when assuming runtime trip count\n");
+      LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
+                           "generated when assuming runtime trip count\n");
       return LoopUnrollResult::Unmodified;
     }
   }

Added: llvm/trunk/lib/Transforms/Utils/LoopUnrollAndJam.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Utils/LoopUnrollAndJam.cpp?rev=336062&view=auto
==============================================================================
--- llvm/trunk/lib/Transforms/Utils/LoopUnrollAndJam.cpp (added)
+++ llvm/trunk/lib/Transforms/Utils/LoopUnrollAndJam.cpp Sun Jul  1 05:47:30 2018
@@ -0,0 +1,774 @@
+//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements loop unroll and jam as a routine, much like
+// LoopUnroll.cpp implements loop unroll.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/Utils/Local.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll-and-jam"
+
+STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
+STATISTIC(NumCompletelyUnrolledAndJammed,
+          "Number of loops fully unroll and jammed");
+
+static bool containsBB(std::vector<BasicBlock *> &V, BasicBlock *BB) {
+  return std::find(V.begin(), V.end(), BB) != V.end();
+}
+
+// Partition blocks in an outer/inner loop pair into blocks before and after
+// the inner loop. Returns false if the fore blocks can branch anywhere other
+// than into further fore blocks or the subloop preheader.
+static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
+                                     std::vector<BasicBlock *> &ForeBlocks,
+                                     std::vector<BasicBlock *> &SubLoopBlocks,
+                                     std::vector<BasicBlock *> &AftBlocks,
+                                     DominatorTree *DT) {
+  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+  SubLoopBlocks = SubLoop->getBlocks();
+
+  for (BasicBlock *BB : L->blocks()) {
+    if (!SubLoop->contains(BB)) {
+      if (DT->dominates(SubLoopLatch, BB))
+        AftBlocks.push_back(BB);
+      else
+        ForeBlocks.push_back(BB);
+    }
+  }
+
+  // Check that all blocks in ForeBlocks together dominate the subloop
+  // TODO: This might be done better with dominators/postdominators.
+  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
+  for (BasicBlock *BB : ForeBlocks) {
+    if (BB == SubLoopPreHeader)
+      continue;
+    TerminatorInst *TI = BB->getTerminator();
+    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+      if (!containsBB(ForeBlocks, TI->getSuccessor(i)))
+        return false;
+  }
+
+  return true;
+}
+
+// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
+static void
+moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, BasicBlock *Latch,
+                                  Instruction *InsertLoc,
+                                  std::vector<BasicBlock *> &AftBlocks) {
+  // We need to ensure we move the instructions in the correct order,
+  // starting with the earliest required instruction and moving forward.
+  std::vector<Instruction *> Worklist;
+  std::vector<Instruction *> Visited;
+  for (auto &Phi : Header->phis()) {
+    Value *V = Phi.getIncomingValueForBlock(Latch);
+    if (Instruction *I = dyn_cast<Instruction>(V))
+      Worklist.push_back(I);
+  }
+
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.back();
+    Worklist.pop_back();
+    if (!containsBB(AftBlocks, I->getParent()))
+      continue;
+
+    Visited.push_back(I);
+    for (auto &U : I->operands())
+      if (Instruction *II = dyn_cast<Instruction>(U))
+        Worklist.push_back(II);
+  }
+
+  // Move all instructions in program order to before the InsertLoc
+  BasicBlock *InsertLocBB = InsertLoc->getParent();
+  for (Instruction *I : reverse(Visited)) {
+    if (I->getParent() != InsertLocBB)
+      I->moveBefore(InsertLoc);
+  }
+}
+
+/*
+  This method performs Unroll and Jam. For a simple loop like:
+  for (i = ..)
+    Fore(i)
+    for (j = ..)
+      SubLoop(i, j)
+    Aft(i)
+
+  Instead of doing normal inner or outer unrolling, we do:
+  for (i = .., i+=2)
+    Fore(i)
+    Fore(i+1)
+    for (j = ..)
+      SubLoop(i, j)
+      SubLoop(i+1, j)
+    Aft(i)
+    Aft(i+1)
+
+  So the outer loop is essentially unrolled and then the inner loops are fused
+  ("jammed") together into a single loop. This can increase speed when there
+  are loads in SubLoop that are invariant to i, as they become shared between
+  the now jammed inner loops.
+
+  We do this by splitting the blocks in the loop into Fore, SubLoop and Aft.
+  Fore blocks are those before the inner loop, Aft are those after. Normal
+  Unroll code is used to copy each of these sets of blocks and the results are
+  combined together into the final form above.
+
+  isSafeToUnrollAndJam should be used prior to calling this to make sure the
+  unrolling will be valid. Checking profitability is also advisable.
+*/
+LoopUnrollResult
+llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
+                       unsigned TripMultiple, bool UnrollRemainder,
+                       LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+                       AssumptionCache *AC, OptimizationRemarkEmitter *ORE) {
+
+  // When we enter here we should have already checked that it is safe
+  BasicBlock *Header = L->getHeader();
+  assert(L->getSubLoops().size() == 1);
+  Loop *SubLoop = *L->begin();
+
+  // Don't enter the unroll code if there is nothing to do.
+  if (TripCount == 0 && Count < 2) {
+    LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
+    return LoopUnrollResult::Unmodified;
+  }
+
+  assert(Count > 0);
+  assert(TripMultiple > 0);
+  assert(TripCount == 0 || TripCount % TripMultiple == 0);
+
+  // Are we eliminating the loop control altogether?
+  bool CompletelyUnroll = (Count == TripCount);
+
+  // We use the runtime remainder in cases where the trip multiple is unknown
+  if (TripMultiple == 1 || TripMultiple % Count != 0) {
+    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
+                                    /*UseEpilogRemainder*/ true,
+                                    UnrollRemainder, LI, SE, DT, AC, true)) {
+      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
+                           "generated when assuming runtime trip count\n");
+      return LoopUnrollResult::Unmodified;
+    }
+  }
+
+  // Notify ScalarEvolution that the loop will be substantially changed,
+  // if not outright eliminated.
+  if (SE) {
+    SE->forgetLoop(L);
+    SE->forgetLoop(SubLoop);
+  }
+
+  using namespace ore;
+  // Report the unrolling decision.
+  if (CompletelyUnroll) {
+    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
+                      << Header->getName() << " with trip count " << TripCount
+                      << "!\n");
+    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
+                                 L->getHeader())
+              << "completely unroll and jammed loop with "
+              << NV("UnrollCount", TripCount) << " iterations");
+  } else {
+    auto DiagBuilder = [&]() {
+      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
+                              L->getHeader());
+      return Diag << "unroll and jammed loop by a factor of "
+                  << NV("UnrollCount", Count);
+    };
+
+    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
+                      << " by " << Count);
+    if (TripMultiple != 1) {
+      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
+      ORE->emit([&]() {
+        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
+                             << " trips per branch";
+      });
+    } else {
+      LLVM_DEBUG(dbgs() << " with run-time trip count");
+      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
+    }
+    LLVM_DEBUG(dbgs() << "!\n");
+  }
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+  assert(Preheader && LatchBlock && Header);
+  assert(BI && !BI->isUnconditional());
+  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
+  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
+  bool SubLoopContinueOnTrue = SubLoop->contains(
+      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
+
+  // Partition blocks in an outer/inner loop pair into blocks before and after
+  // the inner loop
+  std::vector<BasicBlock *> SubLoopBlocks;
+  std::vector<BasicBlock *> ForeBlocks;
+  std::vector<BasicBlock *> AftBlocks;
+  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
+                           DT);
+
+  // We keep track of the entering/first and exiting/last block of each of
+  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
+  // blocks easier.
+  std::vector<BasicBlock *> ForeBlocksFirst;
+  std::vector<BasicBlock *> ForeBlocksLast;
+  std::vector<BasicBlock *> SubLoopBlocksFirst;
+  std::vector<BasicBlock *> SubLoopBlocksLast;
+  std::vector<BasicBlock *> AftBlocksFirst;
+  std::vector<BasicBlock *> AftBlocksLast;
+  ForeBlocksFirst.push_back(Header);
+  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
+  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
+  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
+  AftBlocksFirst.push_back(SubLoop->getExitBlock());
+  AftBlocksLast.push_back(L->getExitingBlock());
+  // Maps Blocks[0] -> Blocks[It]
+  ValueToValueMapTy LastValueMap;
+
+  // Move any instructions that the fore phi operands depend on from AftBlocks
+  // into Fore.
+  moveHeaderPhiOperandsToForeBlocks(
+      Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
+      AftBlocks);
+
+  // The current on-the-fly SSA update requires blocks to be processed in
+  // reverse postorder so that LastValueMap contains the correct value at each
+  // exit.
+  LoopBlocksDFS DFS(L);
+  DFS.perform(LI);
+  // Stash the DFS iterators before adding blocks to the loop.
+  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
+  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
+
+  if (Header->getParent()->isDebugInfoForProfiling())
+    for (BasicBlock *BB : L->getBlocks())
+      for (Instruction &I : *BB)
+        if (!isa<DbgInfoIntrinsic>(&I))
+          if (const DILocation *DIL = I.getDebugLoc())
+            I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
+
+  // Copy all blocks
+  for (unsigned It = 1; It != Count; ++It) {
+    std::vector<BasicBlock *> NewBlocks;
+    // Maps Blocks[It] -> Blocks[It-1]
+    DenseMap<Value *, Value *> PrevItValueMap;
+
+    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+      ValueToValueMapTy VMap;
+      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
+      Header->getParent()->getBasicBlockList().push_back(New);
+
+      if (containsBB(ForeBlocks, *BB)) {
+        L->addBasicBlockToLoop(New, *LI);
+
+        if (*BB == ForeBlocksFirst[0])
+          ForeBlocksFirst.push_back(New);
+        if (*BB == ForeBlocksLast[0])
+          ForeBlocksLast.push_back(New);
+      } else if (containsBB(SubLoopBlocks, *BB)) {
+        SubLoop->addBasicBlockToLoop(New, *LI);
+
+        if (*BB == SubLoopBlocksFirst[0])
+          SubLoopBlocksFirst.push_back(New);
+        if (*BB == SubLoopBlocksLast[0])
+          SubLoopBlocksLast.push_back(New);
+      } else if (containsBB(AftBlocks, *BB)) {
+        L->addBasicBlockToLoop(New, *LI);
+
+        if (*BB == AftBlocksFirst[0])
+          AftBlocksFirst.push_back(New);
+        if (*BB == AftBlocksLast[0])
+          AftBlocksLast.push_back(New);
+      } else {
+        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
+      }
+
+      // Update our running maps of newest clones
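+      // (E.g. while cloning iteration It == 2, LastValueMap maps an original
+      // value to its ".2" clone, and PrevItValueMap maps that ".2" clone back
+      // to the ".1" clone made on the previous iteration.)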
+      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
+      LastValueMap[*BB] = New;
+      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
+           VI != VE; ++VI) {
+        PrevItValueMap[VI->second] =
+            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
+        LastValueMap[VI->first] = VI->second;
+      }
+
+      NewBlocks.push_back(New);
+
+      // Update DomTree:
+      if (*BB == ForeBlocksFirst[0])
+        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
+      else if (*BB == SubLoopBlocksFirst[0])
+        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
+      else if (*BB == AftBlocksFirst[0])
+        DT->addNewBlock(New, AftBlocksLast[It - 1]);
+      else {
+        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
+        // structure.
+        auto BBDomNode = DT->getNode(*BB);
+        auto BBIDom = BBDomNode->getIDom();
+        BasicBlock *OriginalBBIDom = BBIDom->getBlock();
+        assert(OriginalBBIDom);
+        assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
+        DT->addNewBlock(
+            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
+      }
+    }
+
+    // Remap all instructions in the most recent iteration
+    for (BasicBlock *NewBlock : NewBlocks) {
+      for (Instruction &I : *NewBlock) {
+        ::remapInstruction(&I, LastValueMap);
+        if (auto *II = dyn_cast<IntrinsicInst>(&I))
+          if (II->getIntrinsicID() == Intrinsic::assume)
+            AC->registerAssumption(II);
+      }
+    }
+
+    // Alter the ForeBlocks phis, pointing them at the latest version of the
+    // value from the previous iteration's phis
+    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
+      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
+      assert(OldValue && "should have incoming edge from Aft[It]");
+      Value *NewValue = OldValue;
+      if (Value *PrevValue = PrevItValueMap[OldValue])
+        NewValue = PrevValue;
+
+      assert(Phi.getNumOperands() == 2);
+      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
+      Phi.setIncomingValue(0, NewValue);
+      Phi.removeIncomingValue(1);
+    }
+  }
+
+  // Now that all the basic blocks for the unrolled iterations are in place,
+  // finish up connecting the blocks and phi nodes. At this point LastValueMap
+  // contains the values from the last unrolled iteration.
+
+  // Update Phis in BB from OldBB to point to NewBB
+  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
+                            BasicBlock *NewBB) {
+    for (PHINode &Phi : BB->phis()) {
+      int I = Phi.getBasicBlockIndex(OldBB);
+      Phi.setIncomingBlock(I, NewBB);
+    }
+  };
+  // Update Phis in BB from OldBB to point to NewBB and use the latest value
+  // from LastValueMap
+  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
+                                     BasicBlock *NewBB,
+                                     ValueToValueMapTy &LastValueMap) {
+    for (PHINode &Phi : BB->phis()) {
+      for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
+        if (Phi.getIncomingBlock(b) == OldBB) {
+          Value *OldValue = Phi.getIncomingValue(b);
+          if (Value *LastValue = LastValueMap[OldValue])
+            Phi.setIncomingValue(b, LastValue);
+          Phi.setIncomingBlock(b, NewBB);
+          break;
+        }
+      }
+    }
+  };
+  // Move all the phis from Src into Dest
+  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
+    Instruction *insertPoint = Dest->getFirstNonPHI();
+    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
+      Phi->moveBefore(insertPoint);
+  };
+
+  // Update the PHI values outside the loop to point to the last block
+  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
+                           LastValueMap);
+
+  // Update ForeBlocks successors and phi nodes
+  BranchInst *ForeTerm =
+      cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
+  BasicBlock *Dest = SubLoopBlocksFirst[0];
+  ForeTerm->setSuccessor(0, Dest);
+
+  if (CompletelyUnroll) {
+    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
+      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
+      Phi->getParent()->getInstList().erase(Phi);
+    }
+  } else {
+    // Update the PHI values to point to the last aft block
+    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
+                             AftBlocksLast.back(), LastValueMap);
+  }
+
+  for (unsigned It = 1; It != Count; It++) {
+    // Remap ForeBlock successors from the previous iteration to this one
+    BranchInst *ForeTerm =
+        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
+    BasicBlock *Dest = ForeBlocksFirst[It];
+    ForeTerm->setSuccessor(0, Dest);
+  }
+
+  // Subloop successors and phis
+  BranchInst *SubTerm =
+      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
+  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
+  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
+  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
+                  ForeBlocksLast.back());
+  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
+                  SubLoopBlocksLast.back());
+
+  for (unsigned It = 1; It != Count; It++) {
+    // Replace the conditional branch of the previous iteration's subloop with
+    // an unconditional one to this one
+    BranchInst *SubTerm =
+        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
+    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
+    SubTerm->eraseFromParent();
+
+    updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
+                    ForeBlocksLast.back());
+    updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
+                    SubLoopBlocksLast.back());
+    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
+  }
+
+  // Aft blocks successors and phis
+  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
+  if (CompletelyUnroll) {
+    BranchInst::Create(LoopExit, Term);
+    Term->eraseFromParent();
+  } else {
+    Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
+  }
+  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
+                  SubLoopBlocksLast.back());
+
+  for (unsigned It = 1; It != Count; It++) {
+    // Replace the conditional branch of the previous iteration's aft blocks
+    // with an unconditional one to this iteration's
+    BranchInst *AftTerm =
+        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
+    BranchInst::Create(AftBlocksFirst[It], AftTerm);
+    AftTerm->eraseFromParent();
+
+    updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
+                    SubLoopBlocksLast.back());
+    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
+  }
+
+  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
+  // new ones required.
+  if (Count != 1) {
+    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
+                           SubLoopBlocksFirst[0]);
+    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
+                           SubLoopBlocksLast[0], AftBlocksFirst[0]);
+
+    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
+                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
+    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
+                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
+    DT->applyUpdates(DTUpdates);
+  }
+
+  // Merge adjacent basic blocks, if possible.
+  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
+  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
+  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
+  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
+  while (!MergeBlocks.empty()) {
+    BasicBlock *BB = *MergeBlocks.begin();
+    BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
+    if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
+      BasicBlock *Dest = Term->getSuccessor(0);
+      if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
+        // Don't remove BB and add Fold as they are the same BB
+        assert(Fold == BB);
+        (void)Fold;
+        MergeBlocks.erase(Dest);
+      } else
+        MergeBlocks.erase(BB);
+    } else
+      MergeBlocks.erase(BB);
+  }
+
+  // At this point, the code is well formed.  We now do a quick sweep over the
+  // inserted code, doing constant propagation and dead code elimination as we
+  // go.
+  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
+  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
+
+  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
+  ++NumUnrolledAndJammed;
+
+#ifndef NDEBUG
+  // We shouldn't have done anything to break loop simplify form or LCSSA.
+  Loop *OuterL = L->getParentLoop();
+  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
+  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
+  if (!CompletelyUnroll)
+    assert(L->isLoopSimplifyForm());
+  assert(SubLoop->isLoopSimplifyForm());
+  assert(DT->verify());
+#endif
+
+  // Update LoopInfo if the loop is completely removed.
+  if (CompletelyUnroll)
+    LI->erase(L);
+
+  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
+                          : LoopUnrollResult::PartiallyUnrolled;
+}
+
+static bool getLoadsAndStores(std::vector<BasicBlock *> &Blocks,
+                              SmallVector<Value *, 4> &MemInstr) {
+  // Scan the BBs and collect legal loads and stores.
+  // Returns false if non-simple loads/stores are found.
+  for (BasicBlock *BB : Blocks) {
+    for (Instruction &I : *BB) {
+      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+        if (!Ld->isSimple())
+          return false;
+        MemInstr.push_back(&I);
+      } else if (auto *St = dyn_cast<StoreInst>(&I)) {
+        if (!St->isSimple())
+          return false;
+        MemInstr.push_back(&I);
+      } else if (I.mayReadOrWriteMemory()) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
+                              SmallVector<Value *, 4> &Later,
+                              unsigned LoopDepth, bool InnerLoop,
+                              DependenceInfo &DI) {
+  // Use DA to check for dependencies between loads and stores that make unroll
+  // and jam invalid
+  for (Value *I : Earlier) {
+    for (Value *J : Later) {
+      Instruction *Src = cast<Instruction>(I);
+      Instruction *Dst = cast<Instruction>(J);
+      if (Src == Dst)
+        continue;
+      // Ignore Input dependencies.
+      if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+        continue;
+
+      // Track dependencies, and if we find them take a conservative approach
+      // by allowing only = or < (not >), although some > would be safe
+      // (depending upon unroll width).
+      // For the inner loop, we need to disallow any (> <) dependencies
+      // FIXME: Allow > so long as distance is less than unroll width
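+      // E.g. a fore store to A[i] with an aft store to A[i-1] is a (<)
+      // dependence and stays correct after jamming, whereas an aft store to
+      // A[i+1] is a (>) dependence that running Fore(i+1) before Aft(i)
+      // would break.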
+      if (auto D = DI.depends(Src, Dst, true)) {
+        assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+
+        if (D->isConfused())
+          return false;
+        if (!InnerLoop) {
+          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT)
+            return false;
+        } else {
+          assert(LoopDepth + 1 <= D->getLevels());
+          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
+              D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT)
+            return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+static bool checkDependencies(Loop *L, std::vector<BasicBlock *> &ForeBlocks,
+                              std::vector<BasicBlock *> &SubLoopBlocks,
+                              std::vector<BasicBlock *> &AftBlocks,
+                              DependenceInfo &DI) {
+  // Collect the loads and stores for each set of blocks
+  SmallVector<Value *, 4> ForeMemInstr;
+  SmallVector<Value *, 4> SubLoopMemInstr;
+  SmallVector<Value *, 4> AftMemInstr;
+  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
+      !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
+      !getLoadsAndStores(AftBlocks, AftMemInstr))
+    return false;
+
+  // Check for dependencies between any blocks that may change order
+  unsigned LoopDepth = L->getLoopDepth();
+  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
+                           DI) &&
+         checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
+         checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
+                           DI) &&
+         checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
+                           DI);
+}
+
+bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
+                                DependenceInfo &DI) {
+  /* We currently handle outer loops like this:
+        |
+    ForeFirst    <----\    }
+     Blocks           |    } ForeBlocks
+    ForeLast          |    }
+        |             |
+    SubLoopFirst  <\  |    }
+     Blocks        |  |    } SubLoopBlocks
+    SubLoopLast   -/  |    }
+        |             |
+    AftFirst          |    }
+     Blocks           |    } AftBlocks
+    AftLast     ------/    }
+        |
+
+    There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
+    and AftBlocks, provided that there is one edge from Fores to SubLoops,
+    one edge from SubLoops to Afts and a single outer loop exit (from Afts).
+    In practice we currently limit the Aft blocks to a single block, and limit
+    things further in the profitability checks of the unroll and jam pass.
+
+    Because of the way we rearrange basic blocks, we also require that
+    the Fore blocks on all unrolled iterations are safe to move before the
+    SubLoop blocks of all iterations. So we require that the phi node looping
+    operands of ForeHeader can be moved to at least the end of ForeEnd, so that
+    we can arrange cloned Fore Blocks before the subloop and match up Phi's
+    correctly.
+
+    i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
+    It needs to be safe to transform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
+
+    There are then a number of checks along the lines of no calls, no
+    exceptions, inner loop IV is consistent, etc. Note that for loops requiring
+    runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
+    UnrollAndJamLoop if the trip count cannot be easily calculated.
+  */
+
+  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
+    return false;
+  Loop *SubLoop = L->getSubLoops()[0];
+  if (!SubLoop->isLoopSimplifyForm())
+    return false;
+
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlock *Exit = L->getExitingBlock();
+  BasicBlock *SubLoopHeader = SubLoop->getHeader();
+  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
+
+  if (Latch != Exit)
+    return false;
+  if (SubLoopLatch != SubLoopExit)
+    return false;
+
+  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken())
+    return false;
+
+  // Split blocks into Fore/SubLoop/Aft based on dominators
+  std::vector<BasicBlock *> SubLoopBlocks;
+  std::vector<BasicBlock *> ForeBlocks;
+  std::vector<BasicBlock *> AftBlocks;
+  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
+                                AftBlocks, &DT))
+    return false;
+
+  // We may need to move instructions out of the aft blocks into the fore
+  // blocks, which becomes more difficult if there are multiple (potentially
+  // conditionally executed) blocks. For now we just exclude loops with
+  // multiple aft blocks.
+  if (AftBlocks.size() != 1)
+    return false;
+
+  // Check inner loop IV is consistent between all iterations
+  const SCEV *SubLoopBECountSC = SE.getExitCount(SubLoop, SubLoopLatch);
+  if (isa<SCEVCouldNotCompute>(SubLoopBECountSC) ||
+      !SubLoopBECountSC->getType()->isIntegerTy())
+    return false;
+  ScalarEvolution::LoopDisposition LD =
+      SE.getLoopDisposition(SubLoopBECountSC, L);
+  if (LD != ScalarEvolution::LoopInvariant)
+    return false;
+
+  // Check the loop safety info for exceptions.
+  LoopSafetyInfo LSI;
+  computeLoopSafetyInfo(&LSI, L);
+  if (LSI.MayThrow)
+    return false;
+
+  // We've ruled out the easy stuff and now need to check that there are no
+  // interdependencies which may prevent us from moving:
+  //  ForeBlocks before Subloop and AftBlocks.
+  //  Subloop before AftBlocks.
+  //  ForeBlock phi operands before the subloop.
+
+  // Make sure we can move all instructions we need to before the subloop
+  SmallVector<Instruction *, 8> Worklist;
+  SmallPtrSet<Instruction *, 8> Visited;
+  for (auto &Phi : Header->phis()) {
+    Value *V = Phi.getIncomingValueForBlock(Latch);
+    if (Instruction *I = dyn_cast<Instruction>(V))
+      Worklist.push_back(I);
+  }
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.back();
+    Worklist.pop_back();
+    if (Visited.insert(I).second) {
+      if (SubLoop->contains(I->getParent()))
+        return false;
+      if (containsBB(AftBlocks, I->getParent())) {
+        // If we hit a phi node in afts we know we are done (probably LCSSA)
+        if (isa<PHINode>(I))
+          return false;
+        if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
+          return false;
+        for (auto &U : I->operands())
+          if (Instruction *II = dyn_cast<Instruction>(U))
+            Worklist.push_back(II);
+      }
+    }
+  }
+
+  // Check for memory dependencies which prohibit the unrolling we are doing.
+  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
+  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
+  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI))
+    return false;
+
+  return true;
+}

Added: llvm/trunk/test/Transforms/LoopUnrollAndJam/dependencies.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnrollAndJam/dependencies.ll?rev=336062&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopUnrollAndJam/dependencies.ll (added)
+++ llvm/trunk/test/Transforms/LoopUnrollAndJam/dependencies.ll Sun Jul  1 05:47:30 2018
@@ -0,0 +1,470 @@
+; RUN: opt -basicaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK-LABEL: fore_aft_less
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
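+; Fore stores A[i], aft stores A[i-1]: a backward (<) dependence, so unroll
+; and jam by 4 is expected (four %j phis).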
+define void @fore_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 1, i32* %arrayidx, align 4
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %add72 = add nuw nsw i32 %i, -1
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: fore_aft_eq
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
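+; Fore and aft both store A[i]: an (=) dependence, still safe to unroll
+; and jam.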
+define void @fore_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 1, i32* %arrayidx, align 4
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %add72 = add nuw nsw i32 %i, 0
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: fore_aft_more
+; CHECK: %j = phi
+; CHECK-NOT: %j.1 = phi
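+; Fore stores A[i], aft stores A[i+1]: a forward (>) dependence that jamming
+; would reorder, so unroll and jam is blocked.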
+define void @fore_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 1, i32* %arrayidx, align 4
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %add72 = add nuw nsw i32 %i, 1
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: fore_sub_less
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
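+; Fore stores A[i], the inner loop stores A[i-1]: a (<) dependence, safe.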
+define void @fore_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 1, i32* %arrayidx, align 4
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add72 = add nuw nsw i32 %i, -1
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+  store i32 %add, i32* %arrayidx8, align 4
+  %add6 = add nuw nsw i32 %j, 1
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: fore_sub_eq
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
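+; Fore stores A[i], the inner loop stores A[i+0]: an (=) dependence, safe.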
+define void @fore_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 1, i32* %arrayidx, align 4
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add72 = add nuw nsw i32 %i, 0
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+  store i32 %add, i32* %arrayidx8, align 4
+  %add6 = add nuw nsw i32 %j, 1
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: fore_sub_more
+; CHECK: %j = phi
+; CHECK-NOT: %j.1 = phi
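+; Fore stores A[i], the inner loop stores A[i+1]: a (>) dependence, blocked.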
+define void @fore_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 1, i32* %arrayidx, align 4
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add72 = add nuw nsw i32 %i, 1
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+  store i32 %add, i32* %arrayidx8, align 4
+  %add6 = add nuw nsw i32 %j, 1
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: sub_aft_less
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
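+; The inner loop stores A[i], aft stores A[i-1]: a (<) dependence, safe.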
+define void @sub_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 1, i32* %arrayidx, align 4
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %add72 = add nuw nsw i32 %i, -1
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: sub_aft_eq
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
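+; The inner loop stores A[i], aft stores A[i]: an (=) dependence, safe.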
+define void @sub_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 1, i32* %arrayidx, align 4
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %add72 = add nuw nsw i32 %i, 0
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: sub_aft_more
+; CHECK: %j = phi
+; CHECK-NOT: %j.1 = phi
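+; The inner loop stores A[i], aft stores A[i+1]: a (>) dependence, blocked.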
+define void @sub_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 1, i32* %arrayidx, align 4
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %add72 = add nuw nsw i32 %i, 1
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: sub_sub_less
+; CHECK: %j = phi
+; CHECK-NOT: %j.1 = phi
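+; The inner loop stores both A[i] and A[i-1]; jamming would interleave the
+; conflicting stores from different outer iterations, so this is blocked.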
+define void @sub_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 1, i32* %arrayidx, align 4
+  %add72 = add nuw nsw i32 %i, -1
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: sub_sub_eq
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
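+; The inner loop stores A[i] twice: an (=) dependence, safe.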
+define void @sub_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 1, i32* %arrayidx, align 4
+  %add72 = add nuw nsw i32 %i, 0
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: sub_sub_more
+; CHECK: %j = phi
+; CHECK-NOT: %j.1 = phi
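+; The inner loop stores both A[i] and A[i+1]; blocked for the same reason
+; as sub_sub_less.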
+define void @sub_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 1, i32* %arrayidx, align 4
+  %add72 = add nuw nsw i32 %i, 1
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopUnrollAndJam/disable.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnrollAndJam/disable.ll?rev=336062&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopUnrollAndJam/disable.ll (added)
+++ llvm/trunk/test/Transforms/LoopUnrollAndJam/disable.ll Sun Jul  1 05:47:30 2018
@@ -0,0 +1,741 @@
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -pass-remarks=loop-unroll-and-jam < %s -S 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+;; Common check for all tests. None should be unroll and jammed
+; CHECK-NOT: remark: {{.*}} unroll and jammed
+
+
+; CHECK-LABEL: disabled1
+; Tests for(i) { sum = A[i]; for(j) sum += B[j]; A[i+1] = sum; }
+; The A[i] to A[i+1] dependency should block unroll and jam
+define void @disabled1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i.029 = phi i32 [ %add10, %for.latch ], [ 0, %for.preheader ]
+; CHECK: %j.026 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp127 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp127, %cmp
+  br i1 %or.cond, label %for.preheader, label %return
+
+for.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.029 = phi i32 [ %add10, %for.latch ], [ 0, %for.preheader ]
+  %b.028 = phi i32 [ %inc8, %for.latch ], [ 1, %for.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.029
+  %0 = load i32, i32* %arrayidx, align 4
+  br label %for.inner
+
+for.inner:
+  %j.026 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum1.025 = phi i32 [ %0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j.026
+  %1 = load i32, i32* %arrayidx6, align 4
+  %add = add i32 %1, %sum1.025
+  %inc = add nuw i32 %j.026, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %b.028
+  store i32 %add, i32* %arrayidx7, align 4
+  %inc8 = add nuw nsw i32 %b.028, 1
+  %add10 = add nuw nsw i32 %i.029, 1
+  %exitcond30 = icmp eq i32 %add10, %I
+  br i1 %exitcond30, label %return, label %for.outer
+
+return:
+  ret void
+}
+
+
+; CHECK-LABEL: disabled2
+; Tests an incompatible block layout (for.outer jumps past for.inner)
+; FIXME: Make this work
+define void @disabled2(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i.032 = phi i32 [ %add13, %for.latch ], [ 0, %for.preheader ]
+; CHECK: %j.030 = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp131 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp131, %cmp
+  br i1 %or.cond, label %for.preheader, label %for.end14
+
+for.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.032 = phi i32 [ %add13, %for.latch ], [ 0, %for.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.032
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %for.latch, label %for.inner
+
+for.inner:
+  %j.030 = phi i32 [ %inc, %for.inner ], [ 0, %for.outer ]
+  %sum1.029 = phi i32 [ %sum1.1, %for.inner ], [ 0, %for.outer ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j.030
+  %1 = load i32, i32* %arrayidx6, align 4
+  %tobool7 = icmp eq i32 %1, 0
+  %sub = add i32 %sum1.029, 10
+  %add = sub i32 %sub, %1
+  %sum1.1 = select i1 %tobool7, i32 %sum1.029, i32 %add
+  %inc = add nuw i32 %j.030, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %sum1.1.lcssa = phi i32 [ 0, %for.outer ], [ %sum1.1, %for.inner ]
+  %arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %i.032
+  store i32 %sum1.1.lcssa, i32* %arrayidx11, align 4
+  %add13 = add nuw i32 %i.032, 1
+  %exitcond33 = icmp eq i32 %add13, %I
+  br i1 %exitcond33, label %for.end14, label %for.outer
+
+for.end14:
+  ret void
+}
+
+
+; CHECK-LABEL: disabled3
+; Tests loop-carried dependencies through an array S
+define void @disabled3(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i.029 = phi i32 [ 0, %for.preheader ], [ %add12, %for.latch ]
+; CHECK: %j.027 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+  %S = alloca [4 x i32], align 4
+  %cmp = icmp eq i32 %J, 0
+  br i1 %cmp, label %return, label %if.end
+
+if.end:
+  %0 = bitcast [4 x i32]* %S to i8*
+  %cmp128 = icmp eq i32 %I, 0
+  br i1 %cmp128, label %for.cond.cleanup, label %for.preheader
+
+for.preheader:
+  %arrayidx9 = getelementptr inbounds [4 x i32], [4 x i32]* %S, i32 0, i32 0
+  br label %for.outer
+
+for.cond.cleanup:
+  br label %return
+
+for.outer:
+  %i.029 = phi i32 [ 0, %for.preheader ], [ %add12, %for.latch ]
+  br label %for.inner
+
+for.inner:
+  %j.027 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j.027
+  %l2 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %j.027, %i.029
+  %rem = urem i32 %add, %J
+  %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %rem
+  %l3 = load i32, i32* %arrayidx6, align 4
+  %mul = mul i32 %l3, %l2
+  %rem7 = urem i32 %j.027, 3
+  %arrayidx8 = getelementptr inbounds [4 x i32], [4 x i32]* %S, i32 0, i32 %rem7
+  store i32 %mul, i32* %arrayidx8, align 4
+  %inc = add nuw i32 %j.027, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %l1 = load i32, i32* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %A, i32 %i.029
+  store i32 %l1, i32* %arrayidx10, align 4
+  %add12 = add nuw i32 %i.029, 1
+  %exitcond31 = icmp eq i32 %add12, %I
+  br i1 %exitcond31, label %for.cond.cleanup, label %for.outer
+
+return:
+  ret void
+}
+
+
+; CHECK-LABEL: disabled4
+; Inner loop induction variable is not consistent,
+; i.e. for(i = 0..n) for (j = 0..i) sum += B[j]
+define void @disabled4(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %indvars.iv = phi i32 [ %indvars.iv.next, %for.latch ], [ 1, %for.preheader ]
+; CHECK: %j.021 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ugt i32 %I, 1
+  %or.cond = and i1 %cmp122, %cmp
+  br i1 %or.cond, label %for.preheader, label %for.end9
+
+for.preheader:
+  br label %for.outer
+
+for.outer:
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.latch ], [ 1, %for.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.021 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum1.020 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j.021
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum1.020
+  %inc = add nuw i32 %j.021, 1
+  %exitcond = icmp eq i32 %inc, %indvars.iv
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+  store i32 %add, i32* %arrayidx6, align 4
+  %indvars.iv.next = add nuw i32 %indvars.iv, 1
+  %exitcond24 = icmp eq i32 %indvars.iv.next, %I
+  br i1 %exitcond24, label %for.end9, label %for.outer
+
+for.end9:
+  ret void
+}
+
+
+; CHECK-LABEL: disabled5
+; Test odd uses of phi nodes where the outer IV cannot be moved into Fore as it hits a PHI
+ at f = hidden global i32 0, align 4
+define i32 @disabled5() #0 {
+; CHECK: %0 = phi i32 [ %f.promoted10, %entry ], [ 2, %for.latch ]
+; CHECK: %1 = phi i32 [ %0, %for.outer ], [ 2, %for.inner ]
+entry:
+  %f.promoted10 = load i32, i32* @f, align 4
+  br label %for.outer
+
+for.outer:
+  %0 = phi i32 [ %f.promoted10, %entry ], [ 2, %for.latch ]
+  %d.018 = phi i16 [ 0, %entry ], [ %odd.lcssa, %for.latch ]
+  %inc5.sink9 = phi i32 [ 2, %entry ], [ %inc5, %for.latch ]
+  br label %for.inner
+
+for.inner:
+  %1 = phi i32 [ %0, %for.outer ], [ 2, %for.inner ]
+  %inc.sink8 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %inc = add nuw nsw i32 %inc.sink8, 1
+  %exitcond = icmp ne i32 %inc, 7
+  br i1 %exitcond, label %for.inner, label %for.latch
+
+for.latch:
+  %.lcssa = phi i32 [ %1, %for.inner ]
+  %odd.lcssa = phi i16 [ 1, %for.inner ]
+  %inc5 = add nuw nsw i32 %inc5.sink9, 1
+  %exitcond11 = icmp ne i32 %inc5, 7
+  br i1 %exitcond11, label %for.outer, label %for.end
+
+for.end:
+  %.lcssa.lcssa = phi i32 [ %.lcssa, %for.latch ]
+  %inc.lcssa.lcssa = phi i32 [ 7, %for.latch ]
+  ret i32 0
+}
+
+
+; CHECK-LABEL: disabled6
+; There is a dependency here between @d6 and %0 (the pointer loaded from @f6,
+; which points at @d6)
+ at d6 = hidden global i16 5, align 2
+ at f6 = hidden global i16* @d6, align 4
+define i32 @disabled6() #0 {
+; CHECK: %inc8.sink14.i = phi i16 [ 1, %entry ], [ %inc8.i, %for.cond.cleanup.i ]
+; CHECK: %c.013.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body6.i ]
+entry:
+  store i16 1, i16* @d6, align 2
+  %0 = load i16*, i16** @f6, align 4
+  br label %for.body.i
+
+for.body.i:
+  %inc8.sink14.i = phi i16 [ 1, %entry ], [ %inc8.i, %for.cond.cleanup.i ]
+  %1 = load i16, i16* %0, align 2
+  br label %for.body6.i
+
+for.cond.cleanup.i:
+  %inc8.i = add nuw nsw i16 %inc8.sink14.i, 1
+  store i16 %inc8.i, i16* @d6, align 2
+  %cmp.i = icmp ult i16 %inc8.i, 6
+  br i1 %cmp.i, label %for.body.i, label %test.exit
+
+for.body6.i:
+  %c.013.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body6.i ]
+  %inc.i = add nuw nsw i32 %c.013.i, 1
+  %exitcond.i = icmp eq i32 %inc.i, 7
+  br i1 %exitcond.i, label %for.cond.cleanup.i, label %for.body6.i
+
+test.exit:
+  %conv2.i = sext i16 %1 to i32
+  ret i32 0
+}
+
+
+; CHECK-LABEL: disabled7
+; Has a negative output dependency (stores to A[i] and A[i-1])
+define void @disabled7(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i.028 = phi i32 [ %add11, %for.cond3.for.cond.cleanup5_crit_edge ], [ 0, %for.body.preheader ]
+; CHECK: %j.026 = phi i32 [ 0, %for.body ], [ %add9, %for.body6 ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp127 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp127, %cmp
+  br i1 %or.cond, label %for.body.preheader, label %for.end12
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %i.028 = phi i32 [ %add11, %for.cond3.for.cond.cleanup5_crit_edge ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.028
+  store i32 0, i32* %arrayidx, align 4
+  %sub = add i32 %i.028, -1
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %sub
+  store i32 2, i32* %arrayidx2, align 4
+  br label %for.body6
+
+for.cond3.for.cond.cleanup5_crit_edge:
+  store i32 %add, i32* %arrayidx, align 4
+  %add11 = add nuw i32 %i.028, 1
+  %exitcond29 = icmp eq i32 %add11, %I
+  br i1 %exitcond29, label %for.end12, label %for.body
+
+for.body6:
+  %0 = phi i32 [ 0, %for.body ], [ %add, %for.body6 ]
+  %j.026 = phi i32 [ 0, %for.body ], [ %add9, %for.body6 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %B, i32 %j.026
+  %1 = load i32, i32* %arrayidx7, align 4
+  %add = add i32 %1, %0
+  %add9 = add nuw i32 %j.026, 1
+  %exitcond = icmp eq i32 %add9, %J
+  br i1 %exitcond, label %for.cond3.for.cond.cleanup5_crit_edge, label %for.body6
+
+for.end12:
+  ret void
+}
+
+
+; CHECK-LABEL: disabled8
+; Same as above with an extra outer loop nest
+define void @disabled8(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i.036 = phi i32 [ %add15, %for.latch ], [ 0, %for.body ]
+; CHECK: %j.034 = phi i32 [ 0, %for.outer ], [ %add13, %for.inner ]
+entry:
+  %cmp = icmp eq i32 %J, 0
+  %cmp335 = icmp eq i32 %I, 0
+  %or.cond = or i1 %cmp, %cmp335
+  br i1 %or.cond, label %for.end18, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %x.037 = phi i32 [ %inc, %for.cond.cleanup4 ], [ 0, %for.body.preheader ]
+  br label %for.outer
+
+for.cond.cleanup4:
+  %inc = add nuw nsw i32 %x.037, 1
+  %exitcond40 = icmp eq i32 %inc, 5
+  br i1 %exitcond40, label %for.end18, label %for.body
+
+for.outer:
+  %i.036 = phi i32 [ %add15, %for.latch ], [ 0, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.036
+  store i32 0, i32* %arrayidx, align 4
+  %sub = add i32 %i.036, -1
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %sub
+  store i32 2, i32* %arrayidx6, align 4
+  br label %for.inner
+
+for.latch:
+  store i32 %add, i32* %arrayidx, align 4
+  %add15 = add nuw i32 %i.036, 1
+  %exitcond38 = icmp eq i32 %add15, %I
+  br i1 %exitcond38, label %for.cond.cleanup4, label %for.outer
+
+for.inner:
+  %0 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %j.034 = phi i32 [ 0, %for.outer ], [ %add13, %for.inner ]
+  %arrayidx11 = getelementptr inbounds i32, i32* %B, i32 %j.034
+  %1 = load i32, i32* %arrayidx11, align 4
+  %add = add i32 %1, %0
+  %add13 = add nuw i32 %j.034, 1
+  %exitcond = icmp eq i32 %add13, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.end18:
+  ret void
+}
+
+
+; CHECK-LABEL: disabled9
+; Can't prove that A and B don't alias (the pointer arguments lack noalias)
+define void @disabled9(i32 %I, i32 %J, i32* nocapture %A, i32* nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum1
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4
+  %add8 = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %add8, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: disable10
+; Simple call
+declare void @f10(i32, i32) #0
+define void @disable10(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum1
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  tail call void @f10(i32 %i, i32 %j) nounwind
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4
+  %add8 = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %add8, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: disable11
+; Contains a volatile load
+define void @disable11(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load volatile i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum1
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4
+  %add8 = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %add8, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: disable12
+; Multiple aft blocks
+define void @disable12(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch3 ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add8, %for.latch3 ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum1
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4
+  %cmpl = icmp eq i32 %add.lcssa, 10
+  br i1 %cmpl, label %for.latch2, label %for.latch3
+
+for.latch2:
+  br label %for.latch3
+
+for.latch3:
+  %add8 = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %add8, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: disable13
+; Two subloops
+define void @disable13(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+; CHECK: %j2 = phi i32 [ %inc2, %for.inner2 ], [ 0, %for.inner2.preheader ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum1
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.inner2, label %for.inner
+
+for.inner2:
+  %j2 = phi i32 [ 0, %for.inner ], [ %inc2, %for.inner2 ]
+  %sum12 = phi i32 [ 0, %for.inner ], [ %add2, %for.inner2 ]
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %j2
+  %l0 = load i32, i32* %arrayidx2, align 4
+  %add2 = add i32 %l0, %sum12
+  %inc2 = add nuw i32 %j2, 1
+  %exitcond2 = icmp eq i32 %inc2, %J
+  br i1 %exitcond2, label %for.latch, label %for.inner2
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner2 ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4
+  %add8 = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %add8, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: disable14
+; Multiple exiting blocks
+define void @disable14(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+  %add8 = add nuw i32 %i, 1
+  %exitcond23 = icmp eq i32 %add8, %I
+  br i1 %exitcond23, label %for.end.loopexit, label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum1
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4
+  %exitcond25 = icmp eq i32 %add8, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: disable15
+; Latch != exit
+define void @disable15(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+  %add8 = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %add8, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum1
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4
+  br label %for.outer
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: disable16
+; Cannot move the %other calculation before the inner loop
+define void @disable16(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+  %otherphi = phi i32 [ %other, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum1
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4
+  %add8 = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %add8, %I
+  %loadarr = getelementptr inbounds i32, i32* %A, i32 %i
+  %load = load i32, i32* %arrayidx6, align 4
+  %other = add i32 %otherphi, %load
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}

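(All of these tests are reductions of essentially the same two-deep loop nest.
As a reading aid, a plausible C original (illustrative only; the tests are
hand-written IR, and the function name here is hypothetical) is:

  void sum_rows(int I, int J, int *A, const int *B) {
    for (int i = 0; i < I; i++) {    /* outer loop: for.outer */
      int sum = 0;
      for (int j = 0; j < J; j++)    /* inner loop: for.inner */
        sum += B[j];                 /* reduction over B      */
      A[i] = sum;                    /* store in for.latch    */
    }
  }

Each disable test above perturbs one structural property of this nest so that
unroll and jam must bail out.)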
Added: llvm/trunk/test/Transforms/LoopUnrollAndJam/pragma.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnrollAndJam/pragma.ll?rev=336062&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopUnrollAndJam/pragma.ll (added)
+++ llvm/trunk/test/Transforms/LoopUnrollAndJam/pragma.ll Sun Jul  1 05:47:30 2018
@@ -0,0 +1,319 @@
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime < %s -S | FileCheck %s
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime -unroll-and-jam-threshold=15 < %s -S | FileCheck %s --check-prefix=CHECK-LOWTHRES
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK-LABEL: test1
+; Basic check that these loops are unroll and jammed by default
+define void @test1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
+; CHECK-LOWTHRES: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
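+; Under the first RUN line test1 is transformed, so %i.us picks up an
+; unrolled %add8.us.N increment out of %for.outer.preheader.new. Under the
+; second RUN line the threshold of 15 is too small even for this minimal
+; loop, so CHECK-LOWTHRES expects the original, untransformed phi.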
+
+
+; CHECK-LABEL: nounroll_and_jam
+; #pragma nounroll_and_jam
+define void @nounroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !1
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: unroll_and_jam_count
+; #pragma unroll_and_jam(8)
+define void @unroll_and_jam_count(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us.7, %for.latch ], [ 0, %for.outer.preheader.new ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !3
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: unroll_and_jam
+; #pragma unroll_and_jam
+define void @unroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
+; CHECK-LOWTHRES: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !5
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: nounroll
+; #pragma nounroll (which we take to mean disable unroll and jam too)
+define void @nounroll(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !7
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: unroll
+; #pragma unroll (which we take to mean disable unroll and jam)
+define void @unroll(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !9
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: nounroll_plus_unroll_and_jam
+; #pragma clang loop nounroll, unroll_and_jam (which we take to mean do unroll_and_jam)
+define void @nounroll_plus_unroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !11
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+!1 = distinct !{!1, !2}
+!2 = distinct !{!"llvm.loop.unroll_and_jam.disable"}
+!3 = distinct !{!3, !4}
+!4 = distinct !{!"llvm.loop.unroll_and_jam.count", i32 8}
+!5 = distinct !{!5, !6}
+!6 = distinct !{!"llvm.loop.unroll_and_jam.enable"}
+!7 = distinct !{!7, !8}
+!8 = distinct !{!"llvm.loop.unroll.disable"}
+!9 = distinct !{!9, !10}
+!10 = distinct !{!"llvm.loop.unroll.enable"}
+!11 = distinct !{!11, !8, !6}
\ No newline at end of file

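(The !llvm.loop metadata at the end of pragma.ll stands in for what a front
end would attach for the source pragmas named in the test comments. Assuming
the pragma spellings used in those comments, the count form corresponds to
something like:

  /* Hypothetical C source; the front end lowers the pragma to
     !{!"llvm.loop.unroll_and_jam.count", i32 8} on the outer loop. */
  #pragma unroll_and_jam(8)
  for (int i = 0; i < I; i++) {
    int sum = 0;
    for (int j = 0; j < J; j++)
      sum += B[j];
    A[i] = sum;
  }

The plain unroll_and_jam and nounroll_and_jam forms map to the .enable and
.disable metadata in the same way.)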
Added: llvm/trunk/test/Transforms/LoopUnrollAndJam/unprofitable.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnrollAndJam/unprofitable.ll?rev=336062&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopUnrollAndJam/unprofitable.ll (added)
+++ llvm/trunk/test/Transforms/LoopUnrollAndJam/unprofitable.ll Sun Jul  1 05:47:30 2018
@@ -0,0 +1,217 @@
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -pass-remarks=loop-unroll < %s -S 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8m.main-arm-none-eabi"
+
+;; Common check for all tests: none should be unroll and jammed, as each is judged unprofitable
+; CHECK-NOT: remark: {{.*}} unroll and jammed
+
+
+; CHECK-LABEL: unprof1
+; Multiple inner loop blocks
+define void @unprof1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner2 ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner2 ]
+  %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner2 ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum1
+  br label %for.inner2
+
+for.inner2:
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner2 ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4
+  %addinc = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %addinc, %I
+  br i1 %exitcond25, label %for.loopexit, label %for.outer
+
+for.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: unprof2
+; Constant inner loop trip count
+define void @unprof2(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum1
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, 10
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4
+  %addinc = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %addinc, %I
+  br i1 %exitcond25, label %for.loopexit, label %for.outer
+
+for.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: unprof3
+; Complex (oversized) inner loop body
+define void @unprof3(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum1
+  %add0 = add i32 %0, %sum1
+  %add1 = add i32 %0, %sum1
+  %add2 = add i32 %0, %sum1
+  %add3 = add i32 %0, %sum1
+  %add4 = add i32 %0, %sum1
+  %add5 = add i32 %0, %sum1
+  %add6 = add i32 %0, %sum1
+  %add7 = add i32 %0, %sum1
+  %add8 = add i32 %0, %sum1
+  %add9 = add i32 %0, %sum1
+  %add10 = add i32 %0, %sum1
+  %add11 = add i32 %0, %sum1
+  %add12 = add i32 %0, %sum1
+  %add13 = add i32 %0, %sum1
+  %add14 = add i32 %0, %sum1
+  %add15 = add i32 %0, %sum1
+  %add16 = add i32 %0, %sum1
+  %add17 = add i32 %0, %sum1
+  %add18 = add i32 %0, %sum1
+  %add19 = add i32 %0, %sum1
+  %add20 = add i32 %0, %sum1
+  %add21 = add i32 %0, %sum1
+  %add22 = add i32 %0, %sum1
+  %add23 = add i32 %0, %sum1
+  %add24 = add i32 %0, %sum1
+  %add25 = add i32 %0, %sum1
+  %add26 = add i32 %0, %sum1
+  %add27 = add i32 %0, %sum1
+  %add28 = add i32 %0, %sum1
+  %add29 = add i32 %0, %sum1
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4
+  %addinc = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %addinc, %I
+  br i1 %exitcond25, label %for.loopexit, label %for.outer
+
+for.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: unprof4
+; No loop-invariant loads
+define void @unprof4(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %j2 = add i32 %j, %i
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j2
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum1
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4
+  %addinc = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %addinc, %I
+  br i1 %exitcond25, label %for.loopexit, label %for.outer
+
+for.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}

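(The unprofitable shapes are easiest to contrast at the C level. A sketch,
using the same hypothetical source as above:

  /* Profitable: the inner load B[j] is invariant in i, so after jamming
     the unrolled copies can share one load per inner iteration. */
  int profitable(int i, int J, const int *B) {
    int sum = 0;
    for (int j = 0; j < J; j++)
      sum += B[j];
    return sum;
  }

  /* unprof4's shape: the address depends on the outer induction variable
     (the IR computes j2 = j + i), so each jammed copy would need its own
     loads and jamming shares nothing. */
  int unprofitable(int i, int J, const int *B) {
    int sum = 0;
    for (int j = 0; j < J; j++)
      sum += B[i + j];
    return sum;
  }
)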
Added: llvm/trunk/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll?rev=336062&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll (added)
+++ llvm/trunk/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll Sun Jul  1 05:47:30 2018
@@ -0,0 +1,735 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -basicaa -tbaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -unroll-remainder < %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK-LABEL: test1
+; Tests for(i) { sum = 0; for(j) sum += B[j]; A[i] = sum; }
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[J:%.*]], 0
+; CHECK-NEXT:    [[CMPJ:%.*]] = icmp ne i32 [[I:%.*]], 0
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[CMP]], [[CMPJ]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[FOR_OUTER_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.outer.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[I]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i32 [[I]], 3
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 3
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_OUTER_PREHEADER_NEW:%.*]]
+; CHECK:       for.outer.preheader.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = sub i32 [[I]], [[XTRAITER]]
+; CHECK-NEXT:    br label [[FOR_OUTER:%.*]]
+; CHECK:       for.outer:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[ADD8_3:%.*]], [[FOR_LATCH:%.*]] ], [ 0, [[FOR_OUTER_PREHEADER_NEW]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[FOR_OUTER_PREHEADER_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[FOR_LATCH]] ]
+; CHECK-NEXT:    [[ADD8:%.*]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT:    [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1
+; CHECK-NEXT:    [[ADD8_1:%.*]] = add nuw nsw i32 [[ADD8]], 1
+; CHECK-NEXT:    [[NITER_NSUB_1:%.*]] = sub i32 [[NITER_NSUB]], 1
+; CHECK-NEXT:    [[ADD8_2:%.*]] = add nuw nsw i32 [[ADD8_1]], 1
+; CHECK-NEXT:    [[NITER_NSUB_2:%.*]] = sub i32 [[NITER_NSUB_1]], 1
+; CHECK-NEXT:    [[ADD8_3]] = add nuw i32 [[ADD8_2]], 1
+; CHECK-NEXT:    [[NITER_NSUB_3]] = sub i32 [[NITER_NSUB_2]], 1
+; CHECK-NEXT:    br label [[FOR_INNER:%.*]]
+; CHECK:       for.inner:
+; CHECK-NEXT:    [[J_0:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT:    [[J_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_1:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_1:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT:    [[J_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_2:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT:    [[SUM_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_2:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT:    [[J_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_3:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT:    [[SUM_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_3:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[J_0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa !0
+; CHECK-NEXT:    [[ADD]] = add i32 [[TMP2]], [[SUM]]
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[J_0]], 1
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4, !tbaa !0
+; CHECK-NEXT:    [[ADD_1]] = add i32 [[TMP3]], [[SUM_1]]
+; CHECK-NEXT:    [[INC_1]] = add nuw i32 [[J_1]], 1
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4, !tbaa !0
+; CHECK-NEXT:    [[ADD_2]] = add i32 [[TMP4]], [[SUM_2]]
+; CHECK-NEXT:    [[INC_2]] = add nuw i32 [[J_2]], 1
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4, !tbaa !0
+; CHECK-NEXT:    [[ADD_3]] = add i32 [[TMP5]], [[SUM_3]]
+; CHECK-NEXT:    [[INC_3]] = add nuw i32 [[J_3]], 1
+; CHECK-NEXT:    [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[J]]
+; CHECK-NEXT:    br i1 [[EXITCOND_3]], label [[FOR_LATCH]], label [[FOR_INNER]]
+; CHECK:       for.latch:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INNER]] ]
+; CHECK-NEXT:    [[ADD_LCSSA_1:%.*]] = phi i32 [ [[ADD_1]], [[FOR_INNER]] ]
+; CHECK-NEXT:    [[ADD_LCSSA_2:%.*]] = phi i32 [ [[ADD_2]], [[FOR_INNER]] ]
+; CHECK-NEXT:    [[ADD_LCSSA_3:%.*]] = phi i32 [ [[ADD_3]], [[FOR_INNER]] ]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I]]
+; CHECK-NEXT:    store i32 [[ADD_LCSSA]], i32* [[ARRAYIDX6]], align 4, !tbaa !0
+; CHECK-NEXT:    [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8]]
+; CHECK-NEXT:    store i32 [[ADD_LCSSA_1]], i32* [[ARRAYIDX6_1]], align 4, !tbaa !0
+; CHECK-NEXT:    [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_1]]
+; CHECK-NEXT:    store i32 [[ADD_LCSSA_2]], i32* [[ARRAYIDX6_2]], align 4, !tbaa !0
+; CHECK-NEXT:    [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_2]]
+; CHECK-NEXT:    store i32 [[ADD_LCSSA_3]], i32* [[ARRAYIDX6_3]], align 4, !tbaa !0
+; CHECK-NEXT:    [[NITER_NCMP_3:%.*]] = icmp eq i32 [[NITER_NSUB_3]], 0
+; CHECK-NEXT:    br i1 [[NITER_NCMP_3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[FOR_OUTER]], !llvm.loop !4
+; CHECK:       for.end.loopexit.unr-lcssa.loopexit:
+; CHECK-NEXT:    [[I_UNR_PH:%.*]] = phi i32 [ [[ADD8_3]], [[FOR_LATCH]] ]
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT_UNR_LCSSA]]
+; CHECK:       for.end.loopexit.unr-lcssa:
+; CHECK-NEXT:    [[I_UNR:%.*]] = phi i32 [ 0, [[FOR_OUTER_PREHEADER]] ], [ [[I_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[FOR_OUTER_EPIL_PREHEADER:%.*]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.outer.epil.preheader:
+; CHECK-NEXT:    br label [[FOR_OUTER_EPIL:%.*]]
+; CHECK:       for.outer.epil:
+; CHECK-NEXT:    br label [[FOR_INNER_EPIL:%.*]]
+; CHECK:       for.inner.epil:
+; CHECK-NEXT:    [[J_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[INC_EPIL:%.*]], [[FOR_INNER_EPIL]] ]
+; CHECK-NEXT:    [[SUM_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[ADD_EPIL:%.*]], [[FOR_INNER_EPIL]] ]
+; CHECK-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_EPIL]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_EPIL]], align 4, !tbaa !0
+; CHECK-NEXT:    [[ADD_EPIL]] = add i32 [[TMP6]], [[SUM_EPIL]]
+; CHECK-NEXT:    [[INC_EPIL]] = add nuw i32 [[J_EPIL]], 1
+; CHECK-NEXT:    [[EXITCOND_EPIL:%.*]] = icmp eq i32 [[INC_EPIL]], [[J]]
+; CHECK-NEXT:    br i1 [[EXITCOND_EPIL]], label [[FOR_LATCH_EPIL:%.*]], label [[FOR_INNER_EPIL]]
+; CHECK:       for.latch.epil:
+; CHECK-NEXT:    [[ADD_LCSSA_EPIL:%.*]] = phi i32 [ [[ADD_EPIL]], [[FOR_INNER_EPIL]] ]
+; CHECK-NEXT:    [[ARRAYIDX6_EPIL:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_UNR]]
+; CHECK-NEXT:    store i32 [[ADD_LCSSA_EPIL]], i32* [[ARRAYIDX6_EPIL]], align 4, !tbaa !0
+; CHECK-NEXT:    [[ADD8_EPIL:%.*]] = add nuw i32 [[I_UNR]], 1
+; CHECK-NEXT:    [[EPIL_ITER_SUB:%.*]] = sub i32 [[XTRAITER]], 1
+; CHECK-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i32 [[EPIL_ITER_SUB]], 0
+; CHECK-NEXT:    br i1 [[EPIL_ITER_CMP]], label [[FOR_OUTER_EPIL_1:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA:%.*]]
+; CHECK:       for.end.loopexit.epilog-lcssa:
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+; CHECK:       for.outer.epil.1:
+; CHECK-NEXT:    br label [[FOR_INNER_EPIL_1:%.*]]
+; CHECK:       for.inner.epil.1:
+; CHECK-NEXT:    [[J_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[INC_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ]
+; CHECK-NEXT:    [[SUM_EPIL_1:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_1]] ], [ [[ADD_EPIL_1:%.*]], [[FOR_INNER_EPIL_1]] ]
+; CHECK-NEXT:    [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_EPIL_1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_EPIL_1]], align 4, !tbaa !0
+; CHECK-NEXT:    [[ADD_EPIL_1]] = add i32 [[TMP7]], [[SUM_EPIL_1]]
+; CHECK-NEXT:    [[INC_EPIL_1]] = add nuw i32 [[J_EPIL_1]], 1
+; CHECK-NEXT:    [[EXITCOND_EPIL_1:%.*]] = icmp eq i32 [[INC_EPIL_1]], [[J]]
+; CHECK-NEXT:    br i1 [[EXITCOND_EPIL_1]], label [[FOR_LATCH_EPIL_1:%.*]], label [[FOR_INNER_EPIL_1]]
+; CHECK:       for.latch.epil.1:
+; CHECK-NEXT:    [[ADD_LCSSA_EPIL_1:%.*]] = phi i32 [ [[ADD_EPIL_1]], [[FOR_INNER_EPIL_1]] ]
+; CHECK-NEXT:    [[ARRAYIDX6_EPIL_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_EPIL]]
+; CHECK-NEXT:    store i32 [[ADD_LCSSA_EPIL_1]], i32* [[ARRAYIDX6_EPIL_1]], align 4, !tbaa !0
+; CHECK-NEXT:    [[ADD8_EPIL_1:%.*]] = add nuw i32 [[ADD8_EPIL]], 1
+; CHECK-NEXT:    [[EPIL_ITER_SUB_1:%.*]] = sub i32 [[EPIL_ITER_SUB]], 1
+; CHECK-NEXT:    [[EPIL_ITER_CMP_1:%.*]] = icmp ne i32 [[EPIL_ITER_SUB_1]], 0
+; CHECK-NEXT:    br i1 [[EPIL_ITER_CMP_1]], label [[FOR_OUTER_EPIL_2:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]]
+; CHECK:       for.outer.epil.2:
+; CHECK-NEXT:    br label [[FOR_INNER_EPIL_2:%.*]]
+; CHECK:       for.inner.epil.2:
+; CHECK-NEXT:    [[J_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[INC_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ]
+; CHECK-NEXT:    [[SUM_EPIL_2:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL_2]] ], [ [[ADD_EPIL_2:%.*]], [[FOR_INNER_EPIL_2]] ]
+; CHECK-NEXT:    [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_EPIL_2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX_EPIL_2]], align 4, !tbaa !0
+; CHECK-NEXT:    [[ADD_EPIL_2]] = add i32 [[TMP8]], [[SUM_EPIL_2]]
+; CHECK-NEXT:    [[INC_EPIL_2]] = add nuw i32 [[J_EPIL_2]], 1
+; CHECK-NEXT:    [[EXITCOND_EPIL_2:%.*]] = icmp eq i32 [[INC_EPIL_2]], [[J]]
+; CHECK-NEXT:    br i1 [[EXITCOND_EPIL_2]], label [[FOR_LATCH_EPIL_2:%.*]], label [[FOR_INNER_EPIL_2]]
+; CHECK:       for.latch.epil.2:
+; CHECK-NEXT:    [[ADD_LCSSA_EPIL_2:%.*]] = phi i32 [ [[ADD_EPIL_2]], [[FOR_INNER_EPIL_2]] ]
+; CHECK-NEXT:    [[ARRAYIDX6_EPIL_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_EPIL_1]]
+; CHECK-NEXT:    store i32 [[ADD_LCSSA_EPIL_2]], i32* [[ARRAYIDX6_EPIL_2]], align 4, !tbaa !0
+; CHECK-NEXT:    [[ADD8_EPIL_2:%.*]] = add nuw i32 [[ADD8_EPIL_1]], 1
+; CHECK-NEXT:    [[EPIL_ITER_SUB_2:%.*]] = sub i32 [[EPIL_ITER_SUB_1]], 1
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]]
+define void @test1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmpJ = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmpJ
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !5
+  %add = add i32 %0, %sum
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4, !tbaa !5
+  %add8 = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %add8, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: test2
+; Tests for(i) { sum = A[i]; for(j) sum += B[j]; A[i] = sum; }
+; A[i] load/store dependency should not block unroll-and-jam
+; CHECK: for.outer:
+; CHECK:   %i = phi i32 [ %add9.3, %for.latch ], [ 0, %for.outer.preheader.new ]
+; CHECK:   %niter = phi i32 [ %unroll_iter, %for.outer.preheader.new ], [ %niter.nsub.3, %for.latch ]
+; CHECK:   br label %for.inner
+; CHECK: for.inner:
+; CHECK:   %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+; CHECK:   %sum = phi i32 [ %2, %for.outer ], [ %add, %for.inner ]
+; CHECK:   %j.1 = phi i32 [ 0, %for.outer ], [ %inc.1, %for.inner ]
+; CHECK:   %sum.1 = phi i32 [ %3, %for.outer ], [ %add.1, %for.inner ]
+; CHECK:   %j.2 = phi i32 [ 0, %for.outer ], [ %inc.2, %for.inner ]
+; CHECK:   %sum.2 = phi i32 [ %4, %for.outer ], [ %add.2, %for.inner ]
+; CHECK:   %j.3 = phi i32 [ 0, %for.outer ], [ %inc.3, %for.inner ]
+; CHECK:   %sum.3 = phi i32 [ %5, %for.outer ], [ %add.3, %for.inner ]
+; CHECK:   br i1 %exitcond.3, label %for.latch, label %for.inner
+; CHECK: for.latch:
+; CHECK:   %add.lcssa = phi i32 [ %add, %for.inner ]
+; CHECK:   %add.lcssa.1 = phi i32 [ %add.1, %for.inner ]
+; CHECK:   %add.lcssa.2 = phi i32 [ %add.2, %for.inner ]
+; CHECK:   %add.lcssa.3 = phi i32 [ %add.3, %for.inner ]
+; CHECK:   br i1 %niter.ncmp.3, label %for.end10.loopexit.unr-lcssa.loopexit, label %for.outer
+; CHECK: for.end10.loopexit.unr-lcssa.loopexit:
+define void @test2(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp125 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp125
+  br i1 %or.cond, label %for.outer.preheader, label %for.end10
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add9, %for.latch ], [ 0, %for.outer.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !5
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum = phi i32 [ %0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j
+  %1 = load i32, i32* %arrayidx6, align 4, !tbaa !5
+  %add = add i32 %1, %sum
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  store i32 %add.lcssa, i32* %arrayidx, align 4, !tbaa !5
+  %add9 = add nuw i32 %i, 1
+  %exitcond28 = icmp eq i32 %add9, %I
+  br i1 %exitcond28, label %for.end10.loopexit, label %for.outer
+
+for.end10.loopexit:
+  br label %for.end10
+
+for.end10:
+  ret void
+}
+
+
+; CHECK-LABEL: test3
+; Tests complete unroll-and-jam of the outer loop
+; CHECK: for.outer:
+; CHECK:   br label %for.inner
+; CHECK: for.inner:
+; CHECK:   %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+; CHECK:   %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+; CHECK:   %j.1 = phi i32 [ 0, %for.outer ], [ %inc.1, %for.inner ]
+; CHECK:   %sum.1 = phi i32 [ 0, %for.outer ], [ %add.1, %for.inner ]
+; CHECK:   %j.2 = phi i32 [ 0, %for.outer ], [ %inc.2, %for.inner ]
+; CHECK:   %sum.2 = phi i32 [ 0, %for.outer ], [ %add.2, %for.inner ]
+; CHECK:   %j.3 = phi i32 [ 0, %for.outer ], [ %inc.3, %for.inner ]
+; CHECK:   %sum.3 = phi i32 [ 0, %for.outer ], [ %add.3, %for.inner ]
+; CHECK:   br i1 %exitcond.3, label %for.latch, label %for.inner
+; CHECK: for.latch:
+; CHECK:   %add.lcssa = phi i32 [ %add, %for.inner ]
+; CHECK:   %add.lcssa.1 = phi i32 [ %add.1, %for.inner ]
+; CHECK:   %add.lcssa.2 = phi i32 [ %add.2, %for.inner ]
+; CHECK:   %add.lcssa.3 = phi i32 [ %add.3, %for.inner ]
+; CHECK:   br label %for.end
+; CHECK: for.end:
+define void @test3(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+entry:
+  %cmp = icmp eq i32 %J, 0
+  br i1 %cmp, label %for.end, label %for.preheader
+
+for.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add8, %for.latch ], [ 0, %for.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !5
+  %sub = add i32 %sum, 10
+  %add = sub i32 %sub, %0
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add, i32* %arrayidx6, align 4, !tbaa !5
+  %add8 = add nuw nsw i32 %i, 1
+  %exitcond23 = icmp eq i32 %add8, 4
+  br i1 %exitcond23, label %for.end, label %for.outer
+
+for.end:
+  ret void
+}
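+; The outer trip count here is the constant 4 in %exitcond23, matching the
+; -unroll-and-jam-count=4 on the RUN line, so the outer loop is removed
+; entirely; the CHECK lines above show %for.latch falling through to
+; %for.end with no branch back to %for.outer.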
+
+
+; CHECK-LABEL: test4
+; Tests complete unroll-and-jam with an outer loop trip count of 1
+; CHECK: for.outer:
+; CHECK:   br label %for.inner
+; CHECK: for.inner:
+; CHECK:   %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+; CHECK:   %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+; CHECK:   br i1 %exitcond, label %for.latch, label %for.inner
+; CHECK: for.latch:
+; CHECK:   %add.lcssa = phi i32 [ %add, %for.inner ]
+; CHECK:   br label %for.end
+; CHECK: for.end:
+define void @test4(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+entry:
+  %cmp = icmp eq i32 %J, 0
+  br i1 %cmp, label %for.end, label %for.preheader
+
+for.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add8, %for.latch ], [ 0, %for.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !5
+  %sub = add i32 %sum, 10
+  %add = sub i32 %sub, %0
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add, i32* %arrayidx6, align 4, !tbaa !5
+  %add8 = add nuw nsw i32 %i, 1
+  %exitcond23 = icmp eq i32 %add8, 1
+  br i1 %exitcond23, label %for.end, label %for.outer
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: test5
+; Multiple SubLoopBlocks
+; CHECK: for.outer:
+; CHECK:   br label %for.inner
+; CHECK: for.inner:
+; CHECK:   %inc8.sink15 = phi i32 [ 0, %for.outer ], [ %inc8, %for.inc.1 ]
+; CHECK:   %inc8.sink15.1 = phi i32 [ 0, %for.outer ], [ %inc8.1, %for.inc.1 ]
+; CHECK:   br label %for.inner2
+; CHECK: for.inner2:
+; CHECK:   br i1 %tobool, label %for.cond4, label %for.inc
+; CHECK: for.cond4:
+; CHECK:   br i1 %tobool.1, label %for.cond4a, label %for.inc
+; CHECK: for.cond4a:
+; CHECK:   br label %for.inc
+; CHECK: for.inc:
+; CHECK:   br i1 %tobool.11, label %for.cond4.1, label %for.inc.1
+; CHECK: for.latch:
+; CHECK:   br label %for.end
+; CHECK: for.end:
+; CHECK:   ret i32 0
+; CHECK: for.cond4.1:
+; CHECK:   br i1 %tobool.1.1, label %for.cond4a.1, label %for.inc.1
+; CHECK: for.cond4a.1:
+; CHECK:   br label %for.inc.1
+; CHECK: for.inc.1:
+; CHECK:   br i1 %exitcond.1, label %for.latch, label %for.inner
+@a = hidden global [1 x i32] zeroinitializer, align 4
+define i32 @test5() #0 {
+entry:
+  br label %for.outer
+
+for.outer:
+  %.sink16 = phi i32 [ 0, %entry ], [ %add, %for.latch ]
+  br label %for.inner
+
+for.inner:
+  %inc8.sink15 = phi i32 [ 0, %for.outer ], [ %inc8, %for.inc ]
+  br label %for.inner2
+
+for.inner2:
+  %l1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i32 0, i32 0), align 4
+  %tobool = icmp eq i32 %l1, 0
+  br i1 %tobool, label %for.cond4, label %for.inc
+
+for.cond4:
+  %l0 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i32 1, i32 0), align 4
+  %tobool.1 = icmp eq i32 %l0, 0
+  br i1 %tobool.1, label %for.cond4a, label %for.inc
+
+for.cond4a:
+  br label %for.inc
+
+for.inc:
+  %l2 = phi i32 [ 0, %for.inner2 ], [ 1, %for.cond4 ], [ 2, %for.cond4a ]
+  %inc8 = add nuw nsw i32 %inc8.sink15, 1
+  %exitcond = icmp eq i32 %inc8, 3
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %.lcssa = phi i32 [ %l2, %for.inc ]
+  %conv11 = and i32 %.sink16, 255
+  %add = add nuw nsw i32 %conv11, 4
+  %cmp = icmp eq i32 %add, 8
+  br i1 %cmp, label %for.end, label %for.outer
+
+for.end:
+  %.lcssa.lcssa = phi i32 [ %.lcssa, %for.latch ]
+  ret i32 0
+}
+
+
+; CHECK-LABEL: test6
+; Test odd uses of phi nodes
+; CHECK: for.outer:
+; CHECK:   br label %for.inner
+; CHECK: for.inner:
+; CHECK:   br i1 %exitcond.3, label %for.inner, label %for.latch
+; CHECK: for.latch:
+; CHECK:   br label %for.end
+; CHECK: for.end:
+; CHECK:   ret i32 0
+@f = hidden global i32 0, align 4
+define i32 @test6() #0 {
+entry:
+  %f.promoted10 = load i32, i32* @f, align 4, !tbaa !5
+  br label %for.outer
+
+for.outer:
+  %p0 = phi i32 [ %f.promoted10, %entry ], [ 2, %for.latch ]
+  %inc5.sink9 = phi i32 [ 2, %entry ], [ %inc5, %for.latch ]
+  br label %for.inner
+
+for.inner:
+  %p1 = phi i32 [ %p0, %for.outer ], [ 2, %for.inner ]
+  %inc.sink8 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %inc = add nuw nsw i32 %inc.sink8, 1
+  %exitcond = icmp ne i32 %inc, 7
+  br i1 %exitcond, label %for.inner, label %for.latch
+
+for.latch:
+  %.lcssa = phi i32 [ %p1, %for.inner ]
+  %inc5 = add nuw nsw i32 %inc5.sink9, 1
+  %exitcond11 = icmp ne i32 %inc5, 7
+  br i1 %exitcond11, label %for.outer, label %for.end
+
+for.end:
+  %.lcssa.lcssa = phi i32 [ %.lcssa, %for.latch ]
+  %inc.lcssa.lcssa = phi i32 [ 7, %for.latch ]
+  ret i32 0
+}
+
+
+; CHECK-LABEL: test7
+; Has a positive dependency between two stores; still valid.
+; The negative dependency case is in unroll-and-jam-disabled.ll
+; CHECK: for.outer:
+; CHECK:   %i = phi i32 [ %add.3, %for.latch ], [ 0, %for.preheader.new ]
+; CHECK:   %niter = phi i32 [ %unroll_iter, %for.preheader.new ], [ %niter.nsub.3, %for.latch ]
+; CHECK:   br label %for.inner
+; CHECK: for.latch:
+; CHECK:   %add9.lcssa = phi i32 [ %add9, %for.inner ]
+; CHECK:   %add9.lcssa.1 = phi i32 [ %add9.1, %for.inner ]
+; CHECK:   %add9.lcssa.2 = phi i32 [ %add9.2, %for.inner ]
+; CHECK:   %add9.lcssa.3 = phi i32 [ %add9.3, %for.inner ]
+; CHECK:   br i1 %niter.ncmp.3, label %for.end.loopexit.unr-lcssa.loopexit, label %for.outer
+; CHECK: for.inner:
+; CHECK:   %sum = phi i32 [ 0, %for.outer ], [ %add9, %for.inner ]
+; CHECK:   %j = phi i32 [ 0, %for.outer ], [ %add10, %for.inner ]
+; CHECK:   %sum.1 = phi i32 [ 0, %for.outer ], [ %add9.1, %for.inner ]
+; CHECK:   %j.1 = phi i32 [ 0, %for.outer ], [ %add10.1, %for.inner ]
+; CHECK:   %sum.2 = phi i32 [ 0, %for.outer ], [ %add9.2, %for.inner ]
+; CHECK:   %j.2 = phi i32 [ 0, %for.outer ], [ %add10.2, %for.inner ]
+; CHECK:   %sum.3 = phi i32 [ 0, %for.outer ], [ %add9.3, %for.inner ]
+; CHECK:   %j.3 = phi i32 [ 0, %for.outer ], [ %add10.3, %for.inner ]
+; CHECK:   br i1 %exitcond.3, label %for.latch, label %for.inner
+; CHECK: for.end.loopexit.unr-lcssa.loopexit:
+define void @test7(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp128 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp128, %cmp
+  br i1 %or.cond, label %for.preheader, label %for.end
+
+for.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add, %for.latch ], [ 0, %for.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 0, i32* %arrayidx, align 4, !tbaa !5
+  %add = add nuw i32 %i, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %add
+  store i32 2, i32* %arrayidx2, align 4, !tbaa !5
+  br label %for.inner
+
+for.latch:
+  store i32 %add9, i32* %arrayidx, align 4, !tbaa !5
+  %exitcond30 = icmp eq i32 %add, %I
+  br i1 %exitcond30, label %for.end, label %for.outer
+
+for.inner:
+  %sum = phi i32 [ 0, %for.outer ], [ %add9, %for.inner ]
+  %j = phi i32 [ 0, %for.outer ], [ %add10, %for.inner ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %B, i32 %j
+  %l1 = load i32, i32* %arrayidx7, align 4, !tbaa !5
+  %add9 = add i32 %l1, %sum
+  %add10 = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %add10, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: test8
+; Same as test7 with an extra outer loop nest
+; CHECK: for.outest:
+; CHECK:   br label %for.outer
+; CHECK: for.outer:
+; CHECK:   %i = phi i32 [ %add.3, %for.latch ], [ 0, %for.outest.new ]
+; CHECK:   %niter = phi i32 [ %unroll_iter, %for.outest.new ], [ %niter.nsub.3, %for.latch ]
+; CHECK:   br label %for.inner
+; CHECK: for.inner:
+; CHECK:   %sum = phi i32 [ 0, %for.outer ], [ %add9, %for.inner ]
+; CHECK:   %j = phi i32 [ 0, %for.outer ], [ %add10, %for.inner ]
+; CHECK:   %sum.1 = phi i32 [ 0, %for.outer ], [ %add9.1, %for.inner ]
+; CHECK:   %j.1 = phi i32 [ 0, %for.outer ], [ %add10.1, %for.inner ]
+; CHECK:   %sum.2 = phi i32 [ 0, %for.outer ], [ %add9.2, %for.inner ]
+; CHECK:   %j.2 = phi i32 [ 0, %for.outer ], [ %add10.2, %for.inner ]
+; CHECK:   %sum.3 = phi i32 [ 0, %for.outer ], [ %add9.3, %for.inner ]
+; CHECK:   %j.3 = phi i32 [ 0, %for.outer ], [ %add10.3, %for.inner ]
+; CHECK:   br i1 %exitcond.3, label %for.latch, label %for.inner
+; CHECK: for.latch:
+; CHECK:   %add9.lcssa = phi i32 [ %add9, %for.inner ]
+; CHECK:   %add9.lcssa.1 = phi i32 [ %add9.1, %for.inner ]
+; CHECK:   %add9.lcssa.2 = phi i32 [ %add9.2, %for.inner ]
+; CHECK:   %add9.lcssa.3 = phi i32 [ %add9.3, %for.inner ]
+; CHECK:   br i1 %niter.ncmp.3, label %for.cleanup.unr-lcssa.loopexit, label %for.outer
+; CHECK: for.cleanup.epilog-lcssa:
+; CHECK:   br label %for.cleanup
+; CHECK: for.cleanup:
+; CHECK:   br i1 %exitcond41, label %for.end.loopexit, label %for.outest
+; CHECK: for.end.loopexit:
+; CHECK:   br label %for.end
+define void @test8(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+entry:
+  %cmp = icmp eq i32 %J, 0
+  %cmp336 = icmp eq i32 %I, 0
+  %or.cond = or i1 %cmp, %cmp336
+  br i1 %or.cond, label %for.end, label %for.preheader
+
+for.preheader:
+  br label %for.outest
+
+for.outest:
+  %x.038 = phi i32 [ %inc, %for.cleanup ], [ 0, %for.preheader ]
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add, %for.latch ], [ 0, %for.outest ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 0, i32* %arrayidx, align 4, !tbaa !5
+  %add = add nuw i32 %i, 1
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %add
+  store i32 2, i32* %arrayidx6, align 4, !tbaa !5
+  br label %for.inner
+
+for.inner:
+  %sum = phi i32 [ 0, %for.outer ], [ %add9, %for.inner ]
+  %j = phi i32 [ 0, %for.outer ], [ %add10, %for.inner ]
+  %arrayidx11 = getelementptr inbounds i32, i32* %B, i32 %j
+  %l1 = load i32, i32* %arrayidx11, align 4, !tbaa !5
+  %add9 = add i32 %l1, %sum
+  %add10 = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %add10, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  store i32 %add9, i32* %arrayidx, align 4, !tbaa !5
+  %exitcond39 = icmp eq i32 %add, %I
+  br i1 %exitcond39, label %for.cleanup, label %for.outer
+
+for.cleanup:
+  %inc = add nuw nsw i32 %x.038, 1
+  %exitcond41 = icmp eq i32 %inc, 5
+  br i1 %exitcond41, label %for.end, label %for.outest
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: test9
+; Same as test1, but relying on tbaa rather than noalias to disambiguate A and B
+; CHECK: for.outer:
+; CHECK:   %i = phi i32 [ %add8.3, %for.latch ], [ 0, %for.outer.preheader.new ]
+; CHECK:   %niter = phi i32 [ %unroll_iter, %for.outer.preheader.new ], [ %niter.nsub.3, %for.latch ]
+; CHECK:   br label %for.inner
+; CHECK: for.inner:
+; CHECK:   %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+; CHECK:   %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+; CHECK:   %j.1 = phi i32 [ 0, %for.outer ], [ %inc.1, %for.inner ]
+; CHECK:   %sum.1 = phi i32 [ 0, %for.outer ], [ %add.1, %for.inner ]
+; CHECK:   %j.2 = phi i32 [ 0, %for.outer ], [ %inc.2, %for.inner ]
+; CHECK:   %sum.2 = phi i32 [ 0, %for.outer ], [ %add.2, %for.inner ]
+; CHECK:   %j.3 = phi i32 [ 0, %for.outer ], [ %inc.3, %for.inner ]
+; CHECK:   %sum.3 = phi i32 [ 0, %for.outer ], [ %add.3, %for.inner ]
+; CHECK:   br i1 %exitcond.3, label %for.latch, label %for.inner
+; CHECK: for.latch:
+; CHECK:   %add.lcssa = phi i32 [ %add, %for.inner ]
+; CHECK:   %add.lcssa.1 = phi i32 [ %add.1, %for.inner ]
+; CHECK:   %add.lcssa.2 = phi i32 [ %add.2, %for.inner ]
+; CHECK:   %add.lcssa.3 = phi i32 [ %add.3, %for.inner ]
+; CHECK:   br i1 %niter.ncmp.3, label %for.end.loopexit.unr-lcssa.loopexit, label %for.outer
+; CHECK: for.end.loopexit.unr-lcssa.loopexit:
+define void @test9(i32 %I, i32 %J, i32* nocapture %A, i16* nocapture readonly %B) #0 {
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmpJ = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmpJ
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i16, i16* %B, i32 %j
+  %0 = load i16, i16* %arrayidx, align 4, !tbaa !9
+  %sext = sext i16 %0 to i32
+  %add = add i32 %sext, %sum
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4, !tbaa !5
+  %add8 = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %add8, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: test10
+; Be careful not to incorrectly update the exit phi nodes
+; CHECK: %dec.lcssa.lcssa.ph.ph = phi i64 [ 0, %for.inc24 ]
+%struct.a = type { i64 }
+@g = common global %struct.a zeroinitializer, align 8
+@c = common global [1 x i8] zeroinitializer, align 1
+define signext i16 @test10(i32 %k) #0 {
+entry:
+  %0 = load i8, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @c, i64 0, i64 0), align 1
+  %tobool9 = icmp eq i8 %0, 0
+  %tobool13 = icmp ne i32 %k, 0
+  br label %for.body
+
+for.body:
+  %storemerge82 = phi i64 [ 0, %entry ], [ %inc25, %for.inc24 ]
+  br label %for.body2
+
+for.body2:
+  %storemerge = phi i64 [ 4, %for.body ], [ %dec, %for.inc21 ]
+  br i1 %tobool9, label %for.body2.split, label %for.body2.split2
+
+for.body2.split2:
+  br i1 %tobool13, label %for.inc21, label %for.inc21.if
+
+for.body2.split:
+  br i1 %tobool13, label %for.inc21, label %for.inc21.then
+
+for.inc21.if:
+  %storemerge.1 = phi i64 [ 0, %for.body2.split2 ]
+  br label %for.inc21
+
+for.inc21.then:
+  %storemerge.2 = phi i64 [ 0, %for.body2.split ]
+  %storemerge.3 = phi i32 [ 0, %for.body2.split ]
+  br label %for.inc21
+
+for.inc21:
+  %storemerge.4 = phi i64 [ %storemerge.1, %for.inc21.if ], [ %storemerge.2, %for.inc21.then ], [ 4, %for.body2.split2 ], [ 4, %for.body2.split ]
+  %storemerge.5 = phi i32 [ 0, %for.inc21.if ], [ %storemerge.3, %for.inc21.then ], [ 0, %for.body2.split2 ], [ 0, %for.body2.split ]
+  %dec = add nsw i64 %storemerge, -1
+  %tobool = icmp eq i64 %dec, 0
+  br i1 %tobool, label %for.inc24, label %for.body2
+
+for.inc24:
+  %storemerge.4.lcssa = phi i64 [ %storemerge.4, %for.inc21 ]
+  %storemerge.5.lcssa = phi i32 [ %storemerge.5, %for.inc21 ]
+  %inc25 = add nuw nsw i64 %storemerge82, 1
+  %exitcond = icmp ne i64 %inc25, 5
+  br i1 %exitcond, label %for.body, label %for.end26
+
+for.end26:
+  %dec.lcssa.lcssa = phi i64 [ 0, %for.inc24 ]
+  %storemerge.4.lcssa.lcssa = phi i64 [ %storemerge.4.lcssa, %for.inc24 ]
+  %storemerge.5.lcssa.lcssa = phi i32 [ %storemerge.5.lcssa, %for.inc24 ]
+  store i64 %dec.lcssa.lcssa, i64* getelementptr inbounds (%struct.a, %struct.a* @g, i64 0, i32 0), align 8
+  ret i16 0
+}
+
+
+!5 = !{!6, !6, i64 0}
+!6 = !{!"int", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"short", !7, i64 0}

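To summarise what the autogenerated checks for test1 in unroll-and-jam.ll
verify, here is a C-level picture of the expected shape (an illustration of
the transformation, not the pass's literal output; the function name is
hypothetical):

  void test1_shape(unsigned I, unsigned J, int *A, const int *B) {
    unsigned xtraiter = I & 3;  /* remainder count, as in the IR */
    unsigned i = 0;
    /* Main loop: four outer iterations jammed into one inner loop. */
    for (; i < I - xtraiter; i += 4) {
      int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
      for (unsigned j = 0; j < J; j++) {
        s0 += B[j]; s1 += B[j]; s2 += B[j]; s3 += B[j];
      }
      A[i] = s0; A[i + 1] = s1; A[i + 2] = s2; A[i + 3] = s3;
    }
    /* Epilogue (the .epil blocks): up to three leftover outer iterations,
       each running the original, un-jammed inner loop. */
    for (; i < I; i++) {
      int sum = 0;
      for (unsigned j = 0; j < J; j++)
        sum += B[j];
      A[i] = sum;
    }
  }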

More information about the llvm-commits mailing list