[llvm] 0a52401 - [LoopUnrollAndJam] Changed safety checks to consider more than 2-levels

Whitney Tsang via llvm-commits llvm-commits at lists.llvm.org
Wed May 6 14:48:04 PDT 2020


Author: Whitney Tsang
Date: 2020-05-06T21:47:44Z
New Revision: 0a52401ad68b084e52e535541ebff1e9cb6d3869

URL: https://github.com/llvm/llvm-project/commit/0a52401ad68b084e52e535541ebff1e9cb6d3869
DIFF: https://github.com/llvm/llvm-project/commit/0a52401ad68b084e52e535541ebff1e9cb6d3869.diff

LOG: [LoopUnrollAndJam] Changed safety checks to consider more than 2-levels
loop nest.

Summary: As discussed in https://reviews.llvm.org/D73129.

Example
Before unroll and jam:

for
  A
  for
    B
    for
      C
    D
  E
After unroll and jam (currently):

for
  A
  A'
  for
    B
    for
      C
    D
    B'
    for
      C'
    D'
  E
  E'
After unroll and jam (ideal):

for
  A
  A'
  for
    B
    B'
    for
      C
      C'
    D
    D'
  E
  E'
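
As a concrete sketch in C (a hypothetical nest; A through E stand for the
statements in the diagrams above, and the outer trip count is assumed
divisible by the unroll count of 2), the ideal transformation interleaves
the unrolled copies at every level:

  /* Before: statements at every level of a triple nest. */
  for (int i = 0; i < n; ++i) {
    A(i);
    for (int j = 0; j < m; ++j) {
      B(i, j);
      for (int k = 0; k < p; ++k)
        C(i, j, k);
      D(i, j);
    }
    E(i);
  }

  /* After ideal unroll-and-jam by 2: the i and i+1 copies of each
     statement are jammed together at their own level of the nest. */
  for (int i = 0; i < n; i += 2) {
    A(i);
    A(i + 1);
    for (int j = 0; j < m; ++j) {
      B(i, j);
      B(i + 1, j);
      for (int k = 0; k < p; ++k) {
        C(i, j, k);
        C(i + 1, j, k);
      }
      D(i, j);
      D(i + 1, j);
    }
    E(i);
    E(i + 1);
  }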
This is the first patch to change unroll and jam to work in the ideal
way.
This patch changes the safety checks needed to make sure it is safe to
unroll and jam in the ideal way.

Reviewer: dmgreen, jdoerfert, Meinersbur, kbarton, bmahjour, etiotto
Reviewed By: Meinersbur
Subscribers: fhahn, hiraditya, zzheng, llvm-commits, anhtuyen, prithayan
Tag: LLVM
Differential Revision: https://reviews.llvm.org/D76132

Added: 
    llvm/test/Transforms/LoopUnrollAndJam/dependencies_multidims.ll

Modified: 
    llvm/include/llvm/Transforms/Utils/UnrollLoop.h
    llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
    llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
    llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index 7194d8b47cd9..1970cefcefba 100644
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -110,7 +110,7 @@ LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
                                   Loop **EpilogueLoop = nullptr);
 
 bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
-                          DependenceInfo &DI);
+                          DependenceInfo &DI, LoopInfo &LI);
 
 bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
                         DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,

diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 98288ff461bb..f0ece1faa5fd 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -280,21 +280,6 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
                       ScalarEvolution &SE, const TargetTransformInfo &TTI,
                       AssumptionCache &AC, DependenceInfo &DI,
                       OptimizationRemarkEmitter &ORE, int OptLevel) {
-  // Quick checks of the correct loop form
-  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
-    return LoopUnrollResult::Unmodified;
-  Loop *SubLoop = L->getSubLoops()[0];
-  if (!SubLoop->isLoopSimplifyForm())
-    return LoopUnrollResult::Unmodified;
-
-  BasicBlock *Latch = L->getLoopLatch();
-  BasicBlock *Exit = L->getExitingBlock();
-  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
-  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
-
-  if (Latch != Exit || SubLoopLatch != SubLoopExit)
-    return LoopUnrollResult::Unmodified;
-
   TargetTransformInfo::UnrollingPreferences UP =
       gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None,
                                  None, None, None, None, None, None, None);
@@ -324,7 +309,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
     return LoopUnrollResult::Unmodified;
   }
 
-  if (!isSafeToUnrollAndJam(L, SE, DT, DI)) {
+  if (!isSafeToUnrollAndJam(L, SE, DT, DI, *LI)) {
     LLVM_DEBUG(dbgs() << "  Disabled due to not being safe.\n");
     return LoopUnrollResult::Unmodified;
   }
@@ -335,6 +320,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
   bool Convergent;
   SmallPtrSet<const Value *, 32> EphValues;
   CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+  Loop *SubLoop = L->getSubLoops()[0];
   unsigned InnerLoopSize =
       ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
                           Convergent, TTI, EphValues, UP.BEInsns);
@@ -372,6 +358,8 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
     SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue());
 
   // Find trip count and trip multiple
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
   unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
   unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
   unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);

diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index e96c3520d575..e8aac1233292 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -69,17 +70,14 @@ typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
 
 // Partition blocks in an outer/inner loop pair into blocks before and after
 // the loop
-static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
-                                     BasicBlockSet &ForeBlocks,
-                                     BasicBlockSet &SubLoopBlocks,
-                                     BasicBlockSet &AftBlocks,
-                                     DominatorTree *DT) {
+static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
+                                BasicBlockSet &AftBlocks, DominatorTree &DT) {
+  Loop *SubLoop = L.getSubLoops()[0];
   BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
-  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
 
-  for (BasicBlock *BB : L->blocks()) {
+  for (BasicBlock *BB : L.blocks()) {
     if (!SubLoop->contains(BB)) {
-      if (DT->dominates(SubLoopLatch, BB))
+      if (DT.dominates(SubLoopLatch, BB))
         AftBlocks.insert(BB);
       else
         ForeBlocks.insert(BB);
@@ -93,14 +91,44 @@ static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
     if (BB == SubLoopPreHeader)
       continue;
     Instruction *TI = BB->getTerminator();
-    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
-      if (!ForeBlocks.count(TI->getSuccessor(i)))
+    for (BasicBlock *Succ : successors(TI))
+      if (!ForeBlocks.count(Succ))
         return false;
   }
 
   return true;
 }
 
+/// Partition blocks in a loop nest into blocks before and after each inner
+/// loop.
+static bool partitionOuterLoopBlocks(
+    Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
+    DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+    DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
+  JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());
+
+  for (Loop *L : Root.getLoopsInPreorder()) {
+    if (L == &JamLoop)
+      break;
+
+    if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
+      return false;
+  }
+
+  return true;
+}
+
+// TODO: Remove when UnrollAndJamLoop is changed to support unroll and jamming
+// loop nests of more than 2 levels.
+static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
+                                     BasicBlockSet &ForeBlocks,
+                                     BasicBlockSet &SubLoopBlocks,
+                                     BasicBlockSet &AftBlocks,
+                                     DominatorTree *DT) {
+  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
+  return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
+}
+
 // Looks at the phi nodes in Header for values coming from Latch. For these
 // instructions and all their operands calls Visit on them, keeping going for
 // all the operands in AftBlocks. Returns false if Visit returns false,
@@ -619,7 +647,7 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
 }
 
 static bool getLoadsAndStores(BasicBlockSet &Blocks,
-                              SmallVector<Value *, 4> &MemInstr) {
+                              SmallVector<Instruction *, 4> &MemInstr) {
   // Scan the BBs and collect legal loads and stores.
   // Returns false if non-simple loads/stores are found.
   for (BasicBlock *BB : Blocks) {
@@ -640,97 +668,235 @@ static bool getLoadsAndStores(BasicBlockSet &Blocks,
   return true;
 }
 
-static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
-                              SmallVector<Value *, 4> &Later,
-                              unsigned LoopDepth, bool InnerLoop,
-                              DependenceInfo &DI) {
-  // Use DA to check for dependencies between loads and stores that make unroll
-  // and jam invalid
-  for (Value *I : Earlier) {
-    for (Value *J : Later) {
-      Instruction *Src = cast<Instruction>(I);
-      Instruction *Dst = cast<Instruction>(J);
-      if (Src == Dst)
-        continue;
-      // Ignore Input dependencies.
-      if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
-        continue;
-
-      // Track dependencies, and if we find them take a conservative approach
-      // by allowing only = or < (not >), altough some > would be safe
-      // (depending upon unroll width).
-      // For the inner loop, we need to disallow any (> <) dependencies
-      // FIXME: Allow > so long as distance is less than unroll width
-      if (auto D = DI.depends(Src, Dst, true)) {
-        assert(D->isOrdered() && "Expected an output, flow or anti dep.");
-
-        if (D->isConfused()) {
-          LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
-                            << "  " << *Src << "\n"
-                            << "  " << *Dst << "\n");
+static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
+                                       unsigned UnrollLevel, unsigned JamLevel,
+                                       bool Sequentialized, Dependence *D) {
+  // UnrollLevel might carry the dependency Src --> Dst
+  // Does a different loop carry it after unrolling?
+  for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
+       ++CurLoopDepth) {
+    auto JammedDir = D->getDirection(CurLoopDepth);
+    if (JammedDir == Dependence::DVEntry::LT)
+      return true;
+
+    if (JammedDir & Dependence::DVEntry::GT)
+      return false;
+  }
+
+  return true;
+}
+
+static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
+                                        unsigned UnrollLevel, unsigned JamLevel,
+                                        bool Sequentialized, Dependence *D) {
+  // UnrollLevel might carry the dependency Dst --> Src
+  for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
+       ++CurLoopDepth) {
+    auto JammedDir = D->getDirection(CurLoopDepth);
+    if (JammedDir == Dependence::DVEntry::GT)
+      return true;
+
+    if (JammedDir & Dependence::DVEntry::LT)
+      return false;
+  }
+
+  // Backward dependencies are only preserved if not interleaved.
+  return Sequentialized;
+}
+
+// Check whether it is semantically safe to reorder Src and Dst, considering
+// any potential dependency between them.
+//
+// @param UnrollLevel The level of the loop being unrolled
+// @param JamLevel    The level of the loop being jammed; if Src and Dst are on
+// different levels, the outermost common loop counts as jammed level
+//
+// @return true if it is safe, and false if there is a dependency violation.
+static bool checkDependency(Instruction *Src, Instruction *Dst,
+                            unsigned UnrollLevel, unsigned JamLevel,
+                            bool Sequentialized, DependenceInfo &DI) {
+  assert(UnrollLevel <= JamLevel &&
+         "Expecting JamLevel to be at least UnrollLevel");
+
+  if (Src == Dst)
+    return true;
+  // Ignore Input dependencies.
+  if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+    return true;
+
+  // Check whether unroll-and-jam may violate a dependency.
+  // By construction, every dependency will be lexicographically non-negative
+  // (if it were negative, it would violate the current execution order), such as
+  //   (0,0,>,*,*)
+  // Unroll-and-jam collapses executions from different iterations of the
+  // chosen unroll level into the same iteration of that level. That is, a GT
+  // dependence becomes a GE dependence (or EQ, if we fully unrolled the loop)
+  // at the loop's position:
+  //   (0,0,>=,*,*)
+  // Now, the dependency is not necessarily non-negative anymore, i.e.
+  // unroll-and-jam may violate correctness.
+  std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
+  if (!D)
+    return true;
+  assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+
+  if (D->isConfused()) {
+    LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
+                      << "  " << *Src << "\n"
+                      << "  " << *Dst << "\n");
+    return false;
+  }
+
+  // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
+  // non-equal direction, then the locations accessed in the inner levels cannot
+  // overlap in memory. We assume the indexes never overlap into neighboring
+  // dimensions.
+  for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
+    if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
+      return true;
+
+  auto UnrollDirection = D->getDirection(UnrollLevel);
+
+  // If the distance carried by the unrolled loop is 0, then after unrolling
+  // that distance will become non-zero resulting in non-overlapping accesses in
+  // the inner loops.
+  if (UnrollDirection == Dependence::DVEntry::EQ)
+    return true;
+
+  if (UnrollDirection & Dependence::DVEntry::LT &&
+      !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
+                                  Sequentialized, D.get()))
+    return false;
+
+  if (UnrollDirection & Dependence::DVEntry::GT &&
+      !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
+                                   Sequentialized, D.get()))
+    return false;
+
+  return true;
+}
+
+static bool
+checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
+                  const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+                  const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
+                  DependenceInfo &DI, LoopInfo &LI) {
+  SmallVector<BasicBlockSet, 8> AllBlocks;
+  for (Loop *L : Root.getLoopsInPreorder())
+    if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
+      AllBlocks.push_back(ForeBlocksMap.lookup(L));
+  AllBlocks.push_back(SubLoopBlocks);
+  for (Loop *L : Root.getLoopsInPreorder())
+    if (AftBlocksMap.find(L) != AftBlocksMap.end())
+      AllBlocks.push_back(AftBlocksMap.lookup(L));
+
+  unsigned LoopDepth = Root.getLoopDepth();
+  SmallVector<Instruction *, 4> EarlierLoadsAndStores;
+  SmallVector<Instruction *, 4> CurrentLoadsAndStores;
+  for (BasicBlockSet &Blocks : AllBlocks) {
+    CurrentLoadsAndStores.clear();
+    if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
+      return false;
+
+    Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
+    unsigned CurLoopDepth = CurLoop->getLoopDepth();
+
+    for (auto *Earlier : EarlierLoadsAndStores) {
+      Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
+      unsigned EarlierDepth = EarlierLoop->getLoopDepth();
+      unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
+      for (auto *Later : CurrentLoadsAndStores) {
+        if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
+                             DI))
+          return false;
+      }
+    }
+
+    size_t NumInsts = CurrentLoadsAndStores.size();
+    for (size_t I = 0; I < NumInsts; ++I) {
+      for (size_t J = I; J < NumInsts; ++J) {
+        if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
+                             LoopDepth, CurLoopDepth, true, DI))
           return false;
-        }
-        if (!InnerLoop) {
-          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
-            LLVM_DEBUG(dbgs() << "  > dependency between:\n"
-                              << "  " << *Src << "\n"
-                              << "  " << *Dst << "\n");
-            return false;
-          }
-        } else {
-          assert(LoopDepth + 1 <= D->getLevels());
-          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
-              D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
-            LLVM_DEBUG(dbgs() << "  < > dependency between:\n"
-                              << "  " << *Src << "\n"
-                              << "  " << *Dst << "\n");
-            return false;
-          }
-        }
       }
     }
+
+    EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
+                                 CurrentLoadsAndStores.end());
   }
   return true;
 }
 
-static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
-                              BasicBlockSet &SubLoopBlocks,
-                              BasicBlockSet &AftBlocks, DependenceInfo &DI) {
-  // Get all loads/store pairs for each blocks
-  SmallVector<Value *, 4> ForeMemInstr;
-  SmallVector<Value *, 4> SubLoopMemInstr;
-  SmallVector<Value *, 4> AftMemInstr;
-  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
-      !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
-      !getLoadsAndStores(AftBlocks, AftMemInstr))
+static bool isEligibleLoopForm(const Loop &Root) {
+  // Root must have a child.
+  if (Root.getSubLoops().size() != 1)
     return false;
 
-  // Check for dependencies between any blocks that may change order
-  unsigned LoopDepth = L->getLoopDepth();
-  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
-                           DI) &&
-         checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
-         checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
-                           DI) &&
-         checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
-                           DI);
+  const Loop *L = &Root;
+  do {
+    // All loops in Root need to be in simplify and rotated form.
+    if (!L->isLoopSimplifyForm())
+      return false;
+
+    if (!L->isRotatedForm())
+      return false;
+
+    if (L->getHeader()->hasAddressTaken()) {
+      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
+      return false;
+    }
+
+    unsigned SubLoopsSize = L->getSubLoops().size();
+    if (SubLoopsSize == 0)
+      return true;
+
+    // Only one child is allowed.
+    if (SubLoopsSize != 1)
+      return false;
+
+    L = L->getSubLoops()[0];
+  } while (L);
+
+  return true;
+}
+
+static Loop *getInnerMostLoop(Loop *L) {
+  while (!L->getSubLoops().empty())
+    L = L->getSubLoops()[0];
+  return L;
 }
 
 bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
-                                DependenceInfo &DI) {
+                                DependenceInfo &DI, LoopInfo &LI) {
+  if (!isEligibleLoopForm(*L)) {
+    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
+    return false;
+  }
+
   /* We currently handle outer loops like this:
         |
-    ForeFirst    <----\    }
-     Blocks           |    } ForeBlocks
-    ForeLast          |    }
-        |             |
-    SubLoopFirst  <\  |    }
-     Blocks        |  |    } SubLoopBlocks
-    SubLoopLast   -/  |    }
-        |             |
-    AftFirst          |    }
-     Blocks           |    } AftBlocks
-    AftLast     ------/    }
+    ForeFirst    <------\   }
+     Blocks             |   } ForeBlocks of L
+    ForeLast            |   }
+        |               |
+       ...              |
+        |               |
+    ForeFirst    <----\ |   }
+     Blocks           | |   } ForeBlocks of an inner loop of L
+    ForeLast          | |   }
+        |             | |
+    JamLoopFirst  <\  | |   }
+     Blocks        |  | |   } JamLoopBlocks of the innermost loop
+    JamLoopLast   -/  | |   }
+        |             | |
+    AftFirst          | |   }
+     Blocks           | |   } AftBlocks of an inner loop of L
+    AftLast     ------/ |   }
+        |               |
+       ...              |
+        |               |
+    AftFirst            |   }
+     Blocks             |   } AftBlocks of L
+    AftLast     --------/   }
         |
 
     There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
@@ -740,14 +906,16 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
    things further in the profitability checks of the unroll and jam pass.
 
     Because of the way we rearrange basic blocks, we also require that
-    the Fore blocks on all unrolled iterations are safe to move before the
-    SubLoop blocks of all iterations. So we require that the phi node looping
-    operands of ForeHeader can be moved to at least the end of ForeEnd, so that
-    we can arrange cloned Fore Blocks before the subloop and match up Phi's
-    correctly.
+    the Fore blocks of L on all unrolled iterations are safe to move before the
+    blocks of the direct child of L of all iterations. So we require that the
+    phi node looping operands of ForeHeader can be moved to at least the end of
+    ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
+    match up Phi's correctly.
 
-    i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
-    It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
+    i.e. The old order of blocks used to be
+           (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
+         It needs to be safe to transform this to
+           (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.
 
     There are then a number of checks along the lines of no calls, no
     exceptions, inner loop IV is consistent, etc. Note that for loops requiring
@@ -755,35 +923,13 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
     UnrollAndJamLoop if the trip count cannot be easily calculated.
   */
 
-  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
-    return false;
-  Loop *SubLoop = L->getSubLoops()[0];
-  if (!SubLoop->isLoopSimplifyForm())
-    return false;
-
-  BasicBlock *Header = L->getHeader();
-  BasicBlock *Latch = L->getLoopLatch();
-  BasicBlock *Exit = L->getExitingBlock();
-  BasicBlock *SubLoopHeader = SubLoop->getHeader();
-  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
-  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
-
-  if (Latch != Exit)
-    return false;
-  if (SubLoopLatch != SubLoopExit)
-    return false;
-
-  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
-    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
-    return false;
-  }
-
   // Split blocks into Fore/SubLoop/Aft based on dominators
+  Loop *JamLoop = getInnerMostLoop(L);
   BasicBlockSet SubLoopBlocks;
-  BasicBlockSet ForeBlocks;
-  BasicBlockSet AftBlocks;
-  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
-                                AftBlocks, &DT)) {
+  DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
+  DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
+  if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
+                                AftBlocksMap, DT)) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
     return false;
   }
@@ -791,7 +937,7 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
   // Aft blocks may need to move instructions to fore blocks, which becomes more
   // difficult if there are multiple (potentially conditionally executed)
   // blocks. For now we just exclude loops with multiple aft blocks.
-  if (AftBlocks.size() != 1) {
+  if (AftBlocksMap[L].size() != 1) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
                          "multiple blocks after the loop\n");
     return false;
@@ -799,7 +945,9 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
 
   // Check inner loop backedge count is consistent on all iterations of the
   // outer loop
-  if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
+  if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
+        return !hasIterationCountInvariantInParent(SubLoop, SE);
+      })) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
                          "not consistent on each iteration\n");
     return false;
@@ -820,6 +968,10 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
   //  ForeBlock phi operands before the subloop
 
   // Make sure we can move all instructions we need to before the subloop
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlockSet AftBlocks = AftBlocksMap[L];
+  Loop *SubLoop = L->getSubLoops()[0];
   if (!processHeaderPhiOperands(
           Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
             if (SubLoop->contains(I->getParent()))
@@ -845,7 +997,8 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
   // Check for memory dependencies which prohibit the unrolling we are doing.
   // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
   // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
-  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
+  if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
+                         LI)) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
     return false;
   }

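
For intuition about the direction-vector reasoning in checkDependency above,
here is a hypothetical C analogue of the dependence patterns exercised by the
new dependencies_multidims.ll tests added below (the stated direction vectors
assume delinearization succeeds, which the tests force with
-da-disable-delinearization-checks). Unroll-and-jamming the i-loop by 2
interleaves the i and i+1 copies of the body inside one j-loop:

  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j) {
      A[i][j] = 1;
      /* sub_sub_less: direction vector (<, >) -- rejected. After
         jamming, the i+1 copy of the store above writes A[i+1][j-1]
         during iteration j-1, i.e. before this store writes it during
         iteration j, reversing the original execution order. */
      A[i + 1][j - 1] = 2;
      /* sub_sub_eq:   A[i + 1][j]     has direction (<, =) -- accepted */
      /* sub_sub_more: A[i + 1][j + 1] has direction (<, <) -- accepted */
    }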
diff --git a/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll
index d584238745cf..e1ab0ea6cd68 100644
--- a/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll
+++ b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll
@@ -398,6 +398,8 @@ cleanup:
 ; CHECK-LABEL: sub_sub_eq
 ; CHECK: %j = phi
 ; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
 define void @sub_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
 entry:
   %cmp = icmp sgt i32 %N, 0

diff --git a/llvm/test/Transforms/LoopUnrollAndJam/dependencies_multidims.ll b/llvm/test/Transforms/LoopUnrollAndJam/dependencies_multidims.ll
new file mode 100644
index 000000000000..cb422dd4730c
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnrollAndJam/dependencies_multidims.ll
@@ -0,0 +1,219 @@
+; RUN: opt -da-disable-delinearization-checks -basicaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
+; RUN: opt -da-disable-delinearization-checks -aa-pipeline=basic-aa -passes='unroll-and-jam' -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK-LABEL: sub_sub_less
+; CHECK: %j = phi
+; CHECK-NOT: %j.1 = phi
+define void @sub_sub_less([100 x i32]* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %i, i32 %j
+  store i32 1, i32* %arrayidx, align 4
+  %add72 = add nuw nsw i32 %i, 1
+  %add73 = add nuw nsw i32 %j, -1
+  %arrayidx8 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %add72, i32 %add73
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: sub_sub_eq
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
+define void @sub_sub_eq([100 x i32]* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %i, i32 %j
+  store i32 1, i32* %arrayidx, align 4
+  %add72 = add nuw nsw i32 %i, 1
+  %add73 = add nuw nsw i32 %j, 0
+  %arrayidx8 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %add72, i32 %add73
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: sub_sub_more
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
+define void @sub_sub_more([100 x i32]* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %i, i32 %j
+  store i32 1, i32* %arrayidx, align 4
+  %add72 = add nuw nsw i32 %i, 1
+  %add73 = add nuw nsw i32 %j, 1
+  %arrayidx8 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %add72, i32 %add73
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+; CHECK-LABEL: sub_sub_less_3d
+; CHECK: %k = phi
+; CHECK-NOT: %k.1 = phi
+
+; for (long i = 0; i < 100; ++i)
+;   for (long j = 0; j < 100; ++j)
+;     for (long k = 0; k < 100; ++k) {
+;       A[i][j][k] = 0;
+;       A[i+1][j][k-1] = 0;
+;     }
+
+define void @sub_sub_less_3d([100 x [100 x i32]]* noalias %A) {
+entry:
+  br label %for.i
+
+for.i:
+  %i = phi i32 [ 0, %entry ], [ %inc.i, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i ], [ %inc.j, %for.j.latch ]
+  br label %for.k
+
+for.k:
+  %k = phi i32 [ 0, %for.j ], [ %inc.k, %for.k ]
+  %arrayidx = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* %A, i32 %i, i32 %j, i32 %k
+  store i32 0, i32* %arrayidx, align 4
+  %add.i = add nsw i32 %i, 1
+  %sub.k = add nsw i32 %k, -1
+  %arrayidx2 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* %A, i32 %add.i, i32 %j, i32 %sub.k
+  store i32 0, i32* %arrayidx2, align 4
+  %inc.k = add nsw i32 %k, 1
+  %cmp.k = icmp slt i32 %inc.k, 100
+  br i1 %cmp.k, label %for.k, label %for.j.latch
+
+for.j.latch:
+  %inc.j = add nsw i32 %j, 1
+  %cmp.j = icmp slt i32 %inc.j, 100
+  br i1 %cmp.j, label %for.j, label %for.i.latch, !llvm.loop !1
+
+for.i.latch:
+  %inc.i = add nsw i32 %i, 1
+  %cmp.i = icmp slt i32 %inc.i, 100
+  br i1 %cmp.i, label %for.i, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: sub_sub_outer_scalar
+; CHECK: %k = phi
+; CHECK-NOT: %k.1 = phi
+
+define void @sub_sub_outer_scalar([100 x i32]* %A) {
+entry:
+  br label %for.i
+
+for.i:
+  %i = phi i64 [ 0, %entry ], [ %inc.i, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i64 [ 0, %for.i ], [ %inc.j, %for.j.latch ]
+  br label %for.k
+
+for.k:
+  %k = phi i64 [ 0, %for.j ], [ %inc.k, %for.k ]
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 %j
+  %arrayidx7 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx, i64 0, i64 %k
+  %0 = load i32, i32* %arrayidx7, align 4
+  %sub.j = sub nsw i64 %j, 1
+  %arrayidx8 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 %sub.j
+  %arrayidx9 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx8, i64 0, i64 %k
+  store i32 %0, i32* %arrayidx9, align 4
+  %inc.k = add nsw i64 %k, 1
+  %cmp.k = icmp slt i64 %inc.k, 100
+  br i1 %cmp.k, label %for.k, label %for.j.latch
+
+for.j.latch:
+  %inc.j = add nsw i64 %j, 1
+  %cmp.j = icmp slt i64 %inc.j, 100
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %inc.i = add nsw i64 %i, 1
+  %cmp.i = icmp slt i64 %inc.i, 100
+  br i1 %cmp.i, label %for.i, label %for.end
+
+for.end:
+  ret void
+}
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.unroll_and_jam.disable"}