[llvm] r265388 - Adds the ability to use an epilog remainder loop during loop unrolling and makes

Evgeny Stupachenko via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 8 14:09:42 PDT 2016


Hi Michael,

Yes I do have the data.
There were ~10% improvements on some EEMBC tests; spec2000 performance
was almost flat (within 2%).

Let me look into the BubbleSort case and come back with an analysis and
possible improvement.

Thanks,
Evgeny


On Fri, Apr 8, 2016 at 1:23 PM, Michael Zolotukhin via llvm-commits
<llvm-commits at lists.llvm.org> wrote:
> Hi Evgeny,
>
> We’ve found several performance regressions on the LLVM test suite caused by this patch. Do you have data from your performance experiments to back up the decision to make epilogues the default strategy?
>
> One of the biggest regressions we see is 89% on SingleSource/Benchmarks/Stanford/Bubblesort (on x86), while the biggest gain is only 30%.
>
> Thanks,
> Michael
>
>
>> On Apr 5, 2016, at 5:19 AM, David L Kreitzer via llvm-commits <llvm-commits at lists.llvm.org> wrote:
>>
>> Author: dlkreitz
>> Date: Tue Apr  5 07:19:35 2016
>> New Revision: 265388
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=265388&view=rev
>> Log:
>> Adds the ability to use an epilog remainder loop during loop unrolling and makes
>> this the default behavior.
>>
>> Patch by Evgeny Stupachenko (evstupac at gmail.com).
>>
>> Differential Revision: http://reviews.llvm.org/D18158
>>
>> Modified:
>>    llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h
>>    llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp
>>    llvm/trunk/lib/Transforms/Utils/LoopUnrollRuntime.cpp
>>    llvm/trunk/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll
>>    llvm/trunk/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll
>>    llvm/trunk/test/Transforms/LoopUnroll/X86/mmx.ll
>>    llvm/trunk/test/Transforms/LoopUnroll/high-cost-trip-count-computation.ll
>>    llvm/trunk/test/Transforms/LoopUnroll/runtime-loop.ll
>>    llvm/trunk/test/Transforms/LoopUnroll/runtime-loop1.ll
>>    llvm/trunk/test/Transforms/LoopUnroll/runtime-loop2.ll
>>    llvm/trunk/test/Transforms/LoopUnroll/runtime-loop4.ll
>>    llvm/trunk/test/Transforms/LoopUnroll/runtime-loop5.ll
>>    llvm/trunk/test/Transforms/LoopUnroll/tripcount-overflow.ll
>>    llvm/trunk/test/Transforms/LoopUnroll/unroll-cleanup.ll
>>    llvm/trunk/test/Transforms/LoopUnroll/unroll-pragmas.ll
>>
>> Modified: llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h (original)
>> +++ llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h Tue Apr  5 07:19:35 2016
>> @@ -34,10 +34,11 @@ bool UnrollLoop(Loop *L, unsigned Count,
>>                 LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
>>                 AssumptionCache *AC, bool PreserveLCSSA);
>>
>> -bool UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
>> -                             bool AllowExpensiveTripCount, LoopInfo *LI,
>> -                             ScalarEvolution *SE, DominatorTree *DT,
>> -                             bool PreserveLCSSA);
>> +bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
>> +                                bool AllowExpensiveTripCount,
>> +                                bool UseEpilogRemainder, LoopInfo *LI,
>> +                                ScalarEvolution *SE, DominatorTree *DT,
>> +                                bool PreserveLCSSA);
>>
>> MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
>> }
>>
>> Modified: llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp (original)
>> +++ llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp Tue Apr  5 07:19:35 2016
>> @@ -44,6 +44,11 @@ using namespace llvm;
>> STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
>> STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
>>
>> +static cl::opt<bool>
>> +UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(true), cl::Hidden,
>> +                    cl::desc("Allow runtime unrolled loops to be unrolled "
>> +                             "with epilog instead of prolog."));
>> +
>> /// Convert the instruction operands from referencing the current values into
>> /// those specified by VMap.
>> static inline void remapInstruction(Instruction *I,
>> @@ -288,12 +293,13 @@ bool llvm::UnrollLoop(Loop *L, unsigned
>>                "convergent "
>>                "operation.");
>>       });
>> -  // Don't output the runtime loop prolog if Count is a multiple of
>> -  // TripMultiple.  Such a prolog is never needed, and is unsafe if the loop
>> +  // Don't output the runtime loop remainder if Count is a multiple of
>> +  // TripMultiple.  Such a remainder is never needed, and is unsafe if the loop
>>   // contains a convergent instruction.
>>   if (RuntimeTripCount && TripMultiple % Count != 0 &&
>> -      !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, SE, DT,
>> -                               PreserveLCSSA))
>> +      !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
>> +                                  UnrollRuntimeEpilog, LI, SE, DT,
>> +                                  PreserveLCSSA))
>>     return false;
>>
>>   // Notify ScalarEvolution that the loop will be substantially changed,
>>
>> Modified: llvm/trunk/lib/Transforms/Utils/LoopUnrollRuntime.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Utils/LoopUnrollRuntime.cpp?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Transforms/Utils/LoopUnrollRuntime.cpp (original)
>> +++ llvm/trunk/lib/Transforms/Utils/LoopUnrollRuntime.cpp Tue Apr  5 07:19:35 2016
>> @@ -16,8 +16,8 @@
>> // case, we need to generate code to execute these 'left over' iterations.
>> //
>> // The current strategy generates an if-then-else sequence prior to the
>> -// unrolled loop to execute the 'left over' iterations.  Other strategies
>> -// include generate a loop before or after the unrolled loop.
>> +// unrolled loop to execute the 'left over' iterations before or after the
>> +// unrolled loop.
>> //
>> //===----------------------------------------------------------------------===//
>>
>> @@ -60,33 +60,35 @@ STATISTIC(NumRuntimeUnrolled,
>> ///   than the unroll factor.
>> ///
>> static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
>> -                          BasicBlock *LastPrologBB, BasicBlock *PrologEnd,
>> -                          BasicBlock *OrigPH, BasicBlock *NewPH,
>> -                          ValueToValueMapTy &VMap, DominatorTree *DT,
>> -                          LoopInfo *LI, bool PreserveLCSSA) {
>> +                          BasicBlock *PrologExit, BasicBlock *PreHeader,
>> +                          BasicBlock *NewPreHeader, ValueToValueMapTy &VMap,
>> +                          DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA) {
>>   BasicBlock *Latch = L->getLoopLatch();
>>   assert(Latch && "Loop must have a latch");
>> +  BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
>>
>>   // Create a PHI node for each outgoing value from the original loop
>>   // (which means it is an outgoing value from the prolog code too).
>>   // The new PHI node is inserted in the prolog end basic block.
>> -  // The new PHI name is added as an operand of a PHI node in either
>> +  // The new PHI node value is added as an operand of a PHI node in either
>>   // the loop header or the loop exit block.
>> -  for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch);
>> -       SBI != SBE; ++SBI) {
>> -    for (BasicBlock::iterator BBI = (*SBI)->begin();
>> -         PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) {
>> -
>> +  for (BasicBlock *Succ : successors(Latch)) {
>> +    for (Instruction &BBI : *Succ) {
>> +      PHINode *PN = dyn_cast<PHINode>(&BBI);
>> +      // Exit when we passed all PHI nodes.
>> +      if (!PN)
>> +        break;
>>       // Add a new PHI node to the prolog end block and add the
>>       // appropriate incoming values.
>> -      PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr",
>> -                                       PrologEnd->getTerminator());
>> +      PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
>> +                                       PrologExit->getFirstNonPHI());
>>       // Adding a value to the new PHI node from the original loop preheader.
>>       // This is the value that skips all the prolog code.
>>       if (L->contains(PN)) {
>> -        NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH);
>> +        NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader),
>> +                           PreHeader);
>>       } else {
>> -        NewPN->addIncoming(UndefValue::get(PN->getType()), OrigPH);
>> +        NewPN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
>>       }
>>
>>       Value *V = PN->getIncomingValueForBlock(Latch);
>> @@ -97,22 +99,22 @@ static void ConnectProlog(Loop *L, Value
>>       }
>>       // Adding a value to the new PHI node from the last prolog block
>>       // that was created.
>> -      NewPN->addIncoming(V, LastPrologBB);
>> +      NewPN->addIncoming(V, PrologLatch);
>>
>>       // Update the existing PHI node operand with the value from the
>>       // new PHI node.  How this is done depends on if the existing
>>       // PHI node is in the original loop block, or the exit block.
>>       if (L->contains(PN)) {
>> -        PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN);
>> +        PN->setIncomingValue(PN->getBasicBlockIndex(NewPreHeader), NewPN);
>>       } else {
>> -        PN->addIncoming(NewPN, PrologEnd);
>> +        PN->addIncoming(NewPN, PrologExit);
>>       }
>>     }
>>   }
>>
>>   // Create a branch around the original loop, which is taken if there are no
>>   // iterations remaining to be executed after running the prologue.
>> -  Instruction *InsertPt = PrologEnd->getTerminator();
>> +  Instruction *InsertPt = PrologExit->getTerminator();
>>   IRBuilder<> B(InsertPt);
>>
>>   assert(Count != 0 && "nonsensical Count!");
>> @@ -126,25 +128,152 @@ static void ConnectProlog(Loop *L, Value
>>   BasicBlock *Exit = L->getUniqueExitBlock();
>>   assert(Exit && "Loop must have a single exit block only");
>>   // Split the exit to maintain loop canonicalization guarantees
>> -  SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit));
>> +  SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
>>   SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", DT, LI,
>>                          PreserveLCSSA);
>>   // Add the branch to the exit block (around the unrolled loop)
>> -  B.CreateCondBr(BrLoopExit, Exit, NewPH);
>> +  B.CreateCondBr(BrLoopExit, Exit, NewPreHeader);
>> +  InsertPt->eraseFromParent();
>> +}
>> +
>> +/// Connect the unrolling epilog code to the original loop.
>> +/// The unrolling epilog code contains code to execute the
>> +/// 'extra' iterations if the run-time trip count modulo the
>> +/// unroll count is non-zero.
>> +///
>> +/// This function performs the following:
>> +/// - Update PHI nodes at the unrolling loop exit and epilog loop exit
>> +/// - Create PHI nodes at the unrolling loop exit to combine
>> +///   values that exit the unrolling loop code and jump around it.
>> +/// - Update PHI operands in the epilog loop by the new PHI nodes
>> +/// - Branch around the epilog loop if extra iters (ModVal) is zero.
>> +///
>> +static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
>> +                          BasicBlock *Exit, BasicBlock *PreHeader,
>> +                          BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
>> +                          ValueToValueMapTy &VMap, DominatorTree *DT,
>> +                          LoopInfo *LI, bool PreserveLCSSA)  {
>> +  BasicBlock *Latch = L->getLoopLatch();
>> +  assert(Latch && "Loop must have a latch");
>> +  BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
>> +
>> +  // Loop structure should be the following:
>> +  //
>> +  // PreHeader
>> +  // NewPreHeader
>> +  //   Header
>> +  //   ...
>> +  //   Latch
>> +  // NewExit (PN)
>> +  // EpilogPreHeader
>> +  //   EpilogHeader
>> +  //   ...
>> +  //   EpilogLatch
>> +  // Exit (EpilogPN)
>> +
>> +  // Update PHI nodes at NewExit and Exit.
>> +  for (Instruction &BBI : *NewExit) {
>> +    PHINode *PN = dyn_cast<PHINode>(&BBI);
>> +    // Exit when we passed all PHI nodes.
>> +    if (!PN)
>> +      break;
>> +    // PN should be used in another PHI located in Exit block as
>> +    // Exit was split by SplitBlockPredecessors into Exit and NewExit
>> +    // Basically it should look like:
>> +    // NewExit:
>> +    //   PN = PHI [I, Latch]
>> +    // ...
>> +    // Exit:
>> +    //   EpilogPN = PHI [PN, EpilogPreHeader]
>> +    //
>> +    // There is EpilogPreHeader incoming block instead of NewExit as
>> +    // NewExit was split 1 more time to get EpilogPreHeader.
>> +    assert(PN->hasOneUse() && "The phi should have 1 use");
>> +    PHINode *EpilogPN = cast<PHINode> (PN->use_begin()->getUser());
>> +    assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
>> +
>> +    // Add incoming PreHeader from branch around the Loop
>> +    PN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
>> +
>> +    Value *V = PN->getIncomingValueForBlock(Latch);
>> +    Instruction *I = dyn_cast<Instruction>(V);
>> +    if (I && L->contains(I))
>> +      // If value comes from an instruction in the loop add VMap value.
>> +      V = VMap[I];
>> +    // For the instruction out of the loop, constant or undefined value
>> +    // insert value itself.
>> +    EpilogPN->addIncoming(V, EpilogLatch);
>> +
>> +    assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
>> +          "EpilogPN should have EpilogPreHeader incoming block");
>> +    // Change EpilogPreHeader incoming block to NewExit.
>> +    EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
>> +                               NewExit);
>> +    // Now PHIs should look like:
>> +    // NewExit:
>> +    //   PN = PHI [I, Latch], [undef, PreHeader]
>> +    // ...
>> +    // Exit:
>> +    //   EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
>> +  }
>> +
>> +  // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
>> +  // Update corresponding PHI nodes in epilog loop.
>> +  for (BasicBlock *Succ : successors(Latch)) {
>> +    // Skip this as we already updated phis in exit blocks.
>> +    if (!L->contains(Succ))
>> +      continue;
>> +    for (Instruction &BBI : *Succ) {
>> +      PHINode *PN = dyn_cast<PHINode>(&BBI);
>> +      // Exit when we passed all PHI nodes.
>> +      if (!PN)
>> +        break;
>> +      // Add new PHI nodes to the loop exit block and update epilog
>> +      // PHIs with the new PHI values.
>> +      PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
>> +                                       NewExit->getFirstNonPHI());
>> +      // Adding a value to the new PHI node from the unrolling loop preheader.
>> +      NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), PreHeader);
>> +      // Adding a value to the new PHI node from the unrolling loop latch.
>> +      NewPN->addIncoming(PN->getIncomingValueForBlock(Latch), Latch);
>> +
>> +      // Update the existing PHI node operand with the value from the new PHI
>> +      // node.  Corresponding instruction in epilog loop should be PHI.
>> +      PHINode *VPN = cast<PHINode>(VMap[&BBI]);
>> +      VPN->setIncomingValue(VPN->getBasicBlockIndex(EpilogPreHeader), NewPN);
>> +    }
>> +  }
>> +
>> +  Instruction *InsertPt = NewExit->getTerminator();
>> +  IRBuilder<> B(InsertPt);
>> +  Value *BrLoopExit = B.CreateIsNotNull(ModVal);
>> +  assert(Exit && "Loop must have a single exit block only");
>> +  // Split the exit to maintain loop canonicalization guarantees
>> +  SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
>> +  SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI,
>> +                         PreserveLCSSA);
>> +  // Add the branch to the exit block (around the epilog loop)
>> +  B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
>>   InsertPt->eraseFromParent();
>> }
>>
>> /// Create a clone of the blocks in a loop and connect them together.
>> -/// If UnrollProlog is true, loop structure will not be cloned, otherwise a new
>> -/// loop will be created including all cloned blocks, and the iterator of it
>> -/// switches to count NewIter down to 0.
>> +/// If CreateRemainderLoop is false, loop structure will not be cloned,
>> +/// otherwise a new loop will be created including all cloned blocks, and the
>> +/// iterator of it switches to count NewIter down to 0.
>> +/// The cloned blocks should be inserted between InsertTop and InsertBot.
>> +/// If loop structure is cloned InsertTop should be new preheader, InsertBot
>> +/// new loop exit.
>> ///
>> -static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
>> +static void CloneLoopBlocks(Loop *L, Value *NewIter,
>> +                            const bool CreateRemainderLoop,
>> +                            const bool UseEpilogRemainder,
>>                             BasicBlock *InsertTop, BasicBlock *InsertBot,
>> +                            BasicBlock *Preheader,
>>                             std::vector<BasicBlock *> &NewBlocks,
>>                             LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
>>                             LoopInfo *LI) {
>> -  BasicBlock *Preheader = L->getLoopPreheader();
>> +  StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
>>   BasicBlock *Header = L->getHeader();
>>   BasicBlock *Latch = L->getLoopLatch();
>>   Function *F = Header->getParent();
>> @@ -152,7 +281,7 @@ static void CloneLoopBlocks(Loop *L, Val
>>   LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
>>   Loop *NewLoop = nullptr;
>>   Loop *ParentLoop = L->getParentLoop();
>> -  if (!UnrollProlog) {
>> +  if (CreateRemainderLoop) {
>>     NewLoop = new Loop();
>>     if (ParentLoop)
>>       ParentLoop->addChildLoop(NewLoop);
>> @@ -163,7 +292,7 @@ static void CloneLoopBlocks(Loop *L, Val
>>   // For each block in the original loop, create a new copy,
>>   // and update the value map with the newly created values.
>>   for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
>> -    BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".prol", F);
>> +    BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
>>     NewBlocks.push_back(NewBB);
>>
>>     if (NewLoop)
>> @@ -179,16 +308,17 @@ static void CloneLoopBlocks(Loop *L, Val
>>     }
>>
>>     if (Latch == *BB) {
>> -      // For the last block, if UnrollProlog is true, create a direct jump to
>> -      // InsertBot. If not, create a loop back to cloned head.
>> +      // For the last block, if CreateRemainderLoop is false, create a direct
>> +      // jump to InsertBot. If not, create a loop back to cloned head.
>>       VMap.erase((*BB)->getTerminator());
>>       BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
>>       BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
>>       IRBuilder<> Builder(LatchBR);
>> -      if (UnrollProlog) {
>> +      if (!CreateRemainderLoop) {
>>         Builder.CreateBr(InsertBot);
>>       } else {
>> -        PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, "prol.iter",
>> +        PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
>> +                                          suffix + ".iter",
>>                                           FirstLoopBB->getFirstNonPHI());
>>         Value *IdxSub =
>>             Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
>> @@ -207,9 +337,15 @@ static void CloneLoopBlocks(Loop *L, Val
>>   // cloned loop.
>>   for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
>>     PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
>> -    if (UnrollProlog) {
>> -      VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
>> -      cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
>> +    if (!CreateRemainderLoop) {
>> +      if (UseEpilogRemainder) {
>> +        unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
>> +        NewPHI->setIncomingBlock(idx, InsertTop);
>> +        NewPHI->removeIncomingValue(Latch, false);
>> +      } else {
>> +        VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
>> +        cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
>> +      }
>>     } else {
>>       unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
>>       NewPHI->setIncomingBlock(idx, InsertTop);
>> @@ -254,7 +390,7 @@ static void CloneLoopBlocks(Loop *L, Val
>>   }
>> }
>>
>> -/// Insert code in the prolog code when unrolling a loop with a
>> +/// Insert code in the prolog/epilog code when unrolling a loop with a
>> /// run-time trip-count.
>> ///
>> /// This method assumes that the loop unroll factor is total number
>> @@ -266,6 +402,7 @@ static void CloneLoopBlocks(Loop *L, Val
>> /// instruction in SimplifyCFG.cpp.  Then, the backend decides how code for
>> /// the switch instruction is generated.
>> ///
>> +/// ***Prolog case***
>> ///        extraiters = tripcount % loopfactor
>> ///        if (extraiters == 0) jump Loop:
>> ///        else jump Prol
>> @@ -277,17 +414,35 @@ static void CloneLoopBlocks(Loop *L, Val
>> /// ...
>> /// End:
>> ///
>> -bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
>> -                                   bool AllowExpensiveTripCount, LoopInfo *LI,
>> -                                   ScalarEvolution *SE, DominatorTree *DT,
>> -                                   bool PreserveLCSSA) {
>> -  // For now, only unroll loops that contain a single exit.
>> +/// ***Epilog case***
>> +///        extraiters = tripcount % loopfactor
>> +///        if (extraiters == tripcount) jump LoopExit:
>> +///        unroll_iters = tripcount - extraiters
>> +/// Loop:  LoopBody; (executes unroll_iter times);
>> +///        unroll_iter -= 1
>> +///        if (unroll_iter != 0) jump Loop:
>> +/// LoopExit:
>> +///        if (extraiters == 0) jump EpilExit:
>> +/// Epil:  LoopBody; (executes extraiters times)
>> +///        extraiters -= 1                 // Omitted if unroll factor is 2.
>> +///        if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
>> +/// EpilExit:
>> +
>> +bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
>> +                                      bool AllowExpensiveTripCount,
>> +                                      bool UseEpilogRemainder,
>> +                                      LoopInfo *LI, ScalarEvolution *SE,
>> +                                      DominatorTree *DT, bool PreserveLCSSA) {
>> +  // for now, only unroll loops that contain a single exit
>>   if (!L->getExitingBlock())
>>     return false;
>>
>>   // Make sure the loop is in canonical form, and there is a single
>>   // exit block only.
>> -  if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock())
>> +  if (!L->isLoopSimplifyForm())
>> +    return false;
>> +  BasicBlock *Exit = L->getUniqueExitBlock(); // successor out of loop
>> +  if (!Exit)
>>     return false;
>>
>>   // Use Scalar Evolution to compute the trip count. This allows more loops to
>> @@ -311,8 +466,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop
>>     return false;
>>
>>   BasicBlock *Header = L->getHeader();
>> -  BasicBlock *PH = L->getLoopPreheader();
>> -  BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator());
>> +  BasicBlock *PreHeader = L->getLoopPreheader();
>> +  BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
>>   const DataLayout &DL = Header->getModule()->getDataLayout();
>>   SCEVExpander Expander(*SE, DL, "loop-unroll");
>>   if (!AllowExpensiveTripCount &&
>> @@ -330,26 +485,75 @@ bool llvm::UnrollRuntimeLoopProlog(Loop
>>     SE->forgetLoop(ParentLoop);
>>
>>   BasicBlock *Latch = L->getLoopLatch();
>> -  // It helps to split the original preheader twice, one for the end of the
>> -  // prolog code and one for a new loop preheader.
>> -  BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI);
>> -  BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI);
>> -  PreHeaderBR = cast<BranchInst>(PH->getTerminator());
>>
>> +  // Loop structure is the following:
>> +  //
>> +  // PreHeader
>> +  //   Header
>> +  //   ...
>> +  //   Latch
>> +  // Exit
>> +
>> +  BasicBlock *NewPreHeader;
>> +  BasicBlock *NewExit = nullptr;
>> +  BasicBlock *PrologExit = nullptr;
>> +  BasicBlock *EpilogPreHeader = nullptr;
>> +  BasicBlock *PrologPreHeader = nullptr;
>> +
>> +  if (UseEpilogRemainder) {
>> +    // If epilog remainder
>> +    // Split PreHeader to insert a branch around loop for unrolling.
>> +    NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
>> +    NewPreHeader->setName(PreHeader->getName() + ".new");
>> +    // Split Exit to create phi nodes from branch above.
>> +    SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
>> +    NewExit = SplitBlockPredecessors(Exit, Preds, ".unr-lcssa",
>> +                                     DT, LI, PreserveLCSSA);
>> +    // Split NewExit to insert epilog remainder loop.
>> +    EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI);
>> +    EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
>> +  } else {
>> +    // If prolog remainder
>> +    // Split the original preheader twice to insert prolog remainder loop
>> +    PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
>> +    PrologPreHeader->setName(Header->getName() + ".prol.preheader");
>> +    PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
>> +                            DT, LI);
>> +    PrologExit->setName(Header->getName() + ".prol.loopexit");
>> +    // Split PrologExit to get NewPreHeader.
>> +    NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
>> +    NewPreHeader->setName(PreHeader->getName() + ".new");
>> +  }
>> +  // Loop structure should be the following:
>> +  //  Epilog             Prolog
>> +  //
>> +  // PreHeader         PreHeader
>> +  // *NewPreHeader     *PrologPreHeader
>> +  //   Header          *PrologExit
>> +  //   ...             *NewPreHeader
>> +  //   Latch             Header
>> +  // *NewExit            ...
>> +  // *EpilogPreHeader    Latch
>> +  // Exit              Exit
>> +
>> +  // Calculate conditions for branch around loop for unrolling
>> +  // in epilog case and around prolog remainder loop in prolog case.
>>   // Compute the number of extra iterations required, which is:
>> -  //  extra iterations = run-time trip count % (loop unroll factor + 1)
>> +  //  extra iterations = run-time trip count % loop unroll factor
>> +  PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
>>   Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
>>                                             PreHeaderBR);
>>   Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
>>                                           PreHeaderBR);
>> -
>>   IRBuilder<> B(PreHeaderBR);
>>   Value *ModVal;
>>   // Calculate ModVal = (BECount + 1) % Count.
>>   // Note that TripCount is BECount + 1.
>>   if (isPowerOf2_32(Count)) {
>> +    // When Count is power of 2 we don't need BECount for epilog case, however
>> +    // we'll need it for a branch around unrolling loop for prolog case.
>>     ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
>> -    //  1. There are no iterations to be run in the prologue loop.
>> +    //  1. There are no iterations to be run in the prolog/epilog loop.
>>     // OR
>>     //  2. The addition computing TripCount overflowed.
>>     //
>> @@ -371,18 +575,18 @@ bool llvm::UnrollRuntimeLoopProlog(Loop
>>                           ConstantInt::get(BECount->getType(), Count),
>>                           "xtraiter");
>>   }
>> -  Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod");
>> -
>> -  // Branch to either the extra iterations or the cloned/unrolled loop.
>> -  // We will fix up the true branch label when adding loop body copies.
>> -  B.CreateCondBr(BranchVal, PEnd, PEnd);
>> -  assert(PreHeaderBR->isUnconditional() &&
>> -         PreHeaderBR->getSuccessor(0) == PEnd &&
>> -         "CFG edges in Preheader are not correct");
>> +  Value *CmpOperand =
>> +      UseEpilogRemainder ? TripCount :
>> +                           ConstantInt::get(TripCount->getType(), 0);
>> +  Value *BranchVal = B.CreateICmpNE(ModVal, CmpOperand, "lcmp.mod");
>> +  BasicBlock *FirstLoop = UseEpilogRemainder ? NewPreHeader : PrologPreHeader;
>> +  BasicBlock *SecondLoop = UseEpilogRemainder ? NewExit : PrologExit;
>> +  // Branch to either remainder (extra iterations) loop or unrolling loop.
>> +  B.CreateCondBr(BranchVal, FirstLoop, SecondLoop);
>>   PreHeaderBR->eraseFromParent();
>>   Function *F = Header->getParent();
>>   // Get an ordered list of blocks in the loop to help with the ordering of the
>> -  // cloned blocks in the prolog code.
>> +  // cloned blocks in the prolog/epilog code
>>   LoopBlocksDFS LoopBlocks(L);
>>   LoopBlocks.perform(LI);
>>
>> @@ -394,17 +598,38 @@ bool llvm::UnrollRuntimeLoopProlog(Loop
>>   std::vector<BasicBlock *> NewBlocks;
>>   ValueToValueMapTy VMap;
>>
>> -  bool UnrollPrologue = Count == 2;
>> +  // For unroll factor 2 remainder loop will have 1 iteration.
>> +  // Do not create 1 iteration loop.
>> +  bool CreateRemainderLoop = (Count != 2);
>>
>>   // Clone all the basic blocks in the loop. If Count is 2, we don't clone
>>   // the loop, otherwise we create a cloned loop to execute the extra
>>   // iterations. This function adds the appropriate CFG connections.
>> -  CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks,
>> -                  VMap, LI);
>> +  BasicBlock *InsertBot = UseEpilogRemainder ? Exit : PrologExit;
>> +  BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
>> +  CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop,
>> +                  InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, LI);
>> +
>> +  // Insert the cloned blocks into the function.
>> +  F->getBasicBlockList().splice(InsertBot->getIterator(),
>> +                                F->getBasicBlockList(),
>> +                                NewBlocks[0]->getIterator(),
>> +                                F->end());
>>
>> -  // Insert the cloned blocks into the function just before the original loop.
>> -  F->getBasicBlockList().splice(PEnd->getIterator(), F->getBasicBlockList(),
>> -                                NewBlocks[0]->getIterator(), F->end());
>> +  // Loop structure should be the following:
>> +  //  Epilog             Prolog
>> +  //
>> +  // PreHeader         PreHeader
>> +  // NewPreHeader      PrologPreHeader
>> +  //   Header            PrologHeader
>> +  //   ...               ...
>> +  //   Latch             PrologLatch
>> +  // NewExit           PrologExit
>> +  // EpilogPreHeader   NewPreHeader
>> +  //   EpilogHeader      Header
>> +  //   ...               ...
>> +  //   EpilogLatch       Latch
>> +  // Exit              Exit
>>
>>   // Rewrite the cloned instruction operands to use the values created when the
>>   // clone is created.
>> @@ -415,11 +640,38 @@ bool llvm::UnrollRuntimeLoopProlog(Loop
>>     }
>>   }
>>
>> -  // Connect the prolog code to the original loop and update the
>> -  // PHI functions.
>> -  BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]);
>> -  ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, DT, LI,
>> -                PreserveLCSSA);
>> +  if (UseEpilogRemainder) {
>> +    // Connect the epilog code to the original loop and update the
>> +    // PHI functions.
>> +    ConnectEpilog(L, ModVal, NewExit, Exit, PreHeader,
>> +                  EpilogPreHeader, NewPreHeader, VMap, DT, LI,
>> +                  PreserveLCSSA);
>> +
>> +    // Update counter in loop for unrolling.
>> +    // The new iteration count (unroll_iter) should be a multiple of Count.
>> +    IRBuilder<> B2(NewPreHeader->getTerminator());
>> +    Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
>> +    BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
>> +    B2.SetInsertPoint(LatchBR);
>> +    PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
>> +                                      Header->getFirstNonPHI());
>> +    Value *IdxSub =
>> +        B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
>> +                     NewIdx->getName() + ".nsub");
>> +    Value *IdxCmp;
>> +    if (LatchBR->getSuccessor(0) == Header)
>> +      IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
>> +    else
>> +      IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
>> +    NewIdx->addIncoming(TestVal, NewPreHeader);
>> +    NewIdx->addIncoming(IdxSub, Latch);
>> +    LatchBR->setCondition(IdxCmp);
>> +  } else {
>> +    // Connect the prolog code to the original loop and update the
>> +    // PHI functions.
>> +    ConnectProlog(L, BECount, Count, PrologExit, PreHeader, NewPreHeader,
>> +                  VMap, DT, LI, PreserveLCSSA);
>> +  }
>>   NumRuntimeUnrolled++;
>>   return true;
>> }
>>
>> Modified: llvm/trunk/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll (original)
>> +++ llvm/trunk/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll Tue Apr  5 07:19:35 2016
>> @@ -1,13 +1,21 @@
>> -; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s
>> +; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s -check-prefix=EPILOG
>> +; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
>>
>> ; Tests for unrolling loops with run-time trip counts
>>
>> -; CHECK:  %xtraiter = and i32 %n
>> -; CHECK:  %lcmp.mod = icmp ne i32 %xtraiter, 0
>> -; CHECK:  br i1 %lcmp.mod, label %for.body.prol, label %for.body.preheader.split
>> +; EPILOG:  %xtraiter = and i32 %n
>> +; EPILOG:  %lcmp.mod = icmp ne i32 %xtraiter, %n
>> +; EPILOG:  br i1 %lcmp.mod, label %for.body.preheader.new, label %for.end.loopexit.unr-lcssa
>>
>> -; CHECK:  for.body.prol:
>> -; CHECK:  for.body:
>> +; PROLOG:  %xtraiter = and i32 %n
>> +; PROLOG:  %lcmp.mod = icmp ne i32 %xtraiter, 0
>> +; PROLOG:  br i1 %lcmp.mod, label %for.body.prol.preheader, label %for.body.prol.loopexit
>> +
>> +; EPILOG:  for.body:
>> +; EPILOG:  for.body.epil:
>> +
>> +; PROLOG:  for.body.prol:
>> +; PROLOG:  for.body:
>>
>> define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
>> entry:
>>
>> Modified: llvm/trunk/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll (original)
>> +++ llvm/trunk/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll Tue Apr  5 07:19:35 2016
>> @@ -1,4 +1,5 @@
>> -; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -loop-unroll | FileCheck %s
>> +; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -loop-unroll | FileCheck %s -check-prefix=EPILOG
>> +; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -loop-unroll -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
>> define void @unroll_opt_for_size() nounwind optsize {
>> entry:
>>   br label %loop
>> @@ -13,11 +14,17 @@ exit:
>>   ret void
>> }
>>
>> -; CHECK-LABEL: @unroll_opt_for_size
>> -; CHECK:      add
>> -; CHECK-NEXT: add
>> -; CHECK-NEXT: add
>> -; CHECK: icmp
>> +; EPILOG-LABEL: @unroll_opt_for_size
>> +; EPILOG:      add
>> +; EPILOG-NEXT: add
>> +; EPILOG-NEXT: add
>> +; EPILOG: icmp
>> +
>> +; PROLOG-LABEL: @unroll_opt_for_size
>> +; PROLOG:      add
>> +; PROLOG-NEXT: add
>> +; PROLOG-NEXT: add
>> +; PROLOG: icmp
>>
>> define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
>> entry:
>> @@ -40,8 +47,13 @@ for.end:
>>   ret i32 %sum.0.lcssa
>> }
>>
>> -; CHECK-LABEL: @test
>> -; CHECK: for.body.prol{{.*}}:
>> -; CHECK: for.body:
>> -; CHECK: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label %for.body
>> +; EPILOG-LABEL: @test
>> +; EPILOG: for.body:
>> +; EPILOG: br i1 %niter.ncmp.7, label %for.end.loopexit{{.*}}, label %for.body
>> +; EPILOG: for.body.epil{{.*}}:
>> +
>> +; PROLOG-LABEL: @test
>> +; PROLOG: for.body.prol{{.*}}:
>> +; PROLOG: for.body:
>> +; PROLOG: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label %for.body
>>
>>
>> Modified: llvm/trunk/test/Transforms/LoopUnroll/X86/mmx.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/X86/mmx.ll?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopUnroll/X86/mmx.ll (original)
>> +++ llvm/trunk/test/Transforms/LoopUnroll/X86/mmx.ll Tue Apr  5 07:19:35 2016
>> @@ -14,9 +14,9 @@ for.body:
>>
>> exit:                                             ; preds = %for.body
>>   %ret = phi x86_mmx [ undef, %for.body ]
>> -  ; CHECK: %[[ret_unr:.*]] = phi x86_mmx [ undef,
>> -  ; CHECK: %[[ret_ph:.*]]  = phi x86_mmx [ undef,
>> -  ; CHECK: %[[ret:.*]] = phi x86_mmx [ %[[ret_unr]], {{.*}} ], [ %[[ret_ph]]
>> +  ; CHECK: %[[ret_ph:.*]] = phi x86_mmx [ undef, %entry
>> +  ; CHECK: %[[ret_ph1:.*]]  = phi x86_mmx [ undef,
>> +  ; CHECK: %[[ret:.*]] = phi x86_mmx [ %[[ret_ph]], {{.*}} ], [ %[[ret_ph1]],
>>   ; CHECK: ret x86_mmx %[[ret]]
>>   ret x86_mmx %ret
>> }
>>
>> Modified: llvm/trunk/test/Transforms/LoopUnroll/high-cost-trip-count-computation.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/high-cost-trip-count-computation.ll?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopUnroll/high-cost-trip-count-computation.ll (original)
>> +++ llvm/trunk/test/Transforms/LoopUnroll/high-cost-trip-count-computation.ll Tue Apr  5 07:19:35 2016
>> @@ -34,7 +34,7 @@ define i32 @test2(i64* %loc, i64 %conv7)
>> ; CHECK: udiv
>> ; CHECK: udiv
>> ; CHECK-NOT: udiv
>> -; CHECK-LABEL: for.body.prol
>> +; CHECK-LABEL: for.body
>> entry:
>>   %rem0 = load i64, i64* %loc, align 8
>>   %ExpensiveComputation = udiv i64 %rem0, 42 ; <<< Extra computations are added to the trip-count expression
>>
>> Modified: llvm/trunk/test/Transforms/LoopUnroll/runtime-loop.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/runtime-loop.ll?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopUnroll/runtime-loop.ll (original)
>> +++ llvm/trunk/test/Transforms/LoopUnroll/runtime-loop.ll Tue Apr  5 07:19:35 2016
>> @@ -1,18 +1,30 @@
>> -; RUN: opt < %s -S -loop-unroll -unroll-runtime=true | FileCheck %s
>> +; RUN: opt < %s -S -loop-unroll -unroll-runtime=true | FileCheck %s -check-prefix=EPILOG
>> +; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
>>
>> target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
>>
>> ; Tests for unrolling loops with run-time trip counts
>>
>> -; CHECK: %xtraiter = and i32 %n
>> -; CHECK:  %lcmp.mod = icmp ne i32 %xtraiter, 0
>> -; CHECK:  br i1 %lcmp.mod, label %for.body.prol, label %for.body.preheader.split
>> -
>> -; CHECK: for.body.prol:
>> -; CHECK: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.preheader ]
>> -; CHECK:  %prol.iter.sub = sub i32 %prol.iter, 1
>> -; CHECK:  %prol.iter.cmp = icmp ne i32 %prol.iter.sub, 0
>> -; CHECK:  br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split, !llvm.loop !0
>> +; EPILOG: %xtraiter = and i32 %n
>> +; EPILOG:  %lcmp.mod = icmp ne i32 %xtraiter, %n
>> +; EPILOG:  br i1 %lcmp.mod, label %for.body.preheader.new, label %for.end.loopexit.unr-lcssa
>> +
>> +; PROLOG: %xtraiter = and i32 %n
>> +; PROLOG:  %lcmp.mod = icmp ne i32 %xtraiter, 0
>> +; PROLOG:  br i1 %lcmp.mod, label %for.body.prol.preheader, label %for.body.prol.loopexit
>> +
>> +; EPILOG: for.body.epil:
>> +; EPILOG: %indvars.iv.epil = phi i64 [ %indvars.iv.next.epil, %for.body.epil ],  [ %indvars.iv.unr, %for.body.epil.preheader ]
>> +; EPILOG:  %epil.iter.sub = sub i32 %epil.iter, 1
>> +; EPILOG:  %epil.iter.cmp = icmp ne i32 %epil.iter.sub, 0
>> +; EPILOG:  br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop !0
>> +
>> +; PROLOG: for.body.prol:
>> +; PROLOG: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ]
>> +; PROLOG:  %prol.iter.sub = sub i32 %prol.iter, 1
>> +; PROLOG:  %prol.iter.cmp = icmp ne i32 %prol.iter.sub, 0
>> +; PROLOG:  br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit, !llvm.loop !0
>> +
>>
>> define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
>> entry:
>> @@ -39,8 +51,11 @@ for.end:
>> ; Still try to completely unroll loops with compile-time trip counts
>> ; even if the -unroll-runtime is specified
>>
>> -; CHECK: for.body:
>> -; CHECK-NOT: for.body.prol:
>> +; EPILOG: for.body:
>> +; EPILOG-NOT: for.body.epil:
>> +
>> +; PROLOG: for.body:
>> +; PROLOG-NOT: for.body.prol:
>>
>> define i32 @test1(i32* nocapture %a) nounwind uwtable readonly {
>> entry:
>> @@ -64,7 +79,8 @@ for.end:
>> ; This is test 2007-05-09-UnknownTripCount.ll which can be unrolled now
>> ; if the -unroll-runtime option is turned on
>>
>> -; CHECK: bb72.2:
>> +; EPILOG: bb72.2:
>> +; PROLOG: bb72.2:
>>
>> define void @foo(i32 %trips) {
>> entry:
>> @@ -86,8 +102,11 @@ cond_true138:
>>
>> ; Test run-time unrolling for a loop that counts down by -2.
>>
>> -; CHECK: for.body.prol:
>> -; CHECK: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split
>> +; EPILOG: for.body.epil:
>> +; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.cond.for.end_crit_edge.epilog-lcssa
>> +
>> +; PROLOG: for.body.prol:
>> +; PROLOG: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit
>>
>> define zeroext i16 @down(i16* nocapture %p, i32 %len) nounwind uwtable readonly {
>> entry:
>> @@ -116,8 +135,11 @@ for.end:
>> }
>>
>> ; Test run-time unrolling disable metadata.
>> -; CHECK: for.body:
>> -; CHECK-NOT: for.body.prol:
>> +; EPILOG: for.body:
>> +; EPILOG-NOT: for.body.epil:
>> +
>> +; PROLOG: for.body:
>> +; PROLOG-NOT: for.body.prol:
>>
>> define zeroext i16 @test2(i16* nocapture %p, i32 %len) nounwind uwtable readonly {
>> entry:
>> @@ -148,6 +170,8 @@ for.end:
>> !0 = distinct !{!0, !1}
>> !1 = !{!"llvm.loop.unroll.runtime.disable"}
>>
>> -; CHECK: !0 = distinct !{!0, !1}
>> -; CHECK: !1 = !{!"llvm.loop.unroll.disable"}
>> +; EPILOG: !0 = distinct !{!0, !1}
>> +; EPILOG: !1 = !{!"llvm.loop.unroll.disable"}
>>
>> +; PROLOG: !0 = distinct !{!0, !1}
>> +; PROLOG: !1 = !{!"llvm.loop.unroll.disable"}
>>
>> Modified: llvm/trunk/test/Transforms/LoopUnroll/runtime-loop1.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/runtime-loop1.ll?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopUnroll/runtime-loop1.ll (original)
>> +++ llvm/trunk/test/Transforms/LoopUnroll/runtime-loop1.ll Tue Apr  5 07:19:35 2016
>> @@ -1,19 +1,35 @@
>> -; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 | FileCheck %s
>> +; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 | FileCheck %s -check-prefix=EPILOG
>> +; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
>>
>> ; This tests that setting the unroll count works
>>
>> -; CHECK: for.body.preheader:
>> -; CHECK:   br {{.*}} label %for.body.prol, label %for.body.preheader.split, !dbg [[PH_LOC:![0-9]+]]
>> -; CHECK: for.body.prol:
>> -; CHECK:   br label %for.body.preheader.split, !dbg [[BODY_LOC:![0-9]+]]
>> -; CHECK: for.body.preheader.split:
>> -; CHECK:   br {{.*}} label %for.end.loopexit, label %for.body.preheader.split.split, !dbg [[PH_LOC]]
>> -; CHECK: for.body:
>> -; CHECK:   br i1 %exitcond.1, label %for.end.loopexit.unr-lcssa, label %for.body, !dbg [[BODY_LOC]]
>> -; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body
>>
>> -; CHECK-DAG: [[PH_LOC]] = !DILocation(line: 101, column: 1, scope: !{{.*}})
>> -; CHECK-DAG: [[BODY_LOC]] = !DILocation(line: 102, column: 1, scope: !{{.*}})
>> +; EPILOG: for.body.preheader:
>> +; EPILOG:   br i1 %lcmp.mod, label %for.body.preheader.new, label %for.end.loopexit.unr-lcssa, !dbg [[PH_LOC:![0-9]+]]
>> +; EPILOG: for.body:
>> +; EPILOG:   br i1 %niter.ncmp.1, label %for.end.loopexit.unr-lcssa.loopexit, label %for.body, !dbg [[BODY_LOC:![0-9]+]]
>> +; EPILOG-NOT: br i1 %niter.ncmp.2, label %for.end.loopexit{{.*}}, label %for.body
>> +; EPILOG: for.body.epil.preheader:
>> +; EPILOG:   br label %for.body.epil, !dbg [[EXIT_LOC:![0-9]+]]
>> +; EPILOG: for.body.epil:
>> +; EPILOG:   br label %for.end.loopexit.epilog-lcssa, !dbg [[BODY_LOC:![0-9]+]]
>> +
>> +; EPILOG-DAG: [[PH_LOC]] = !DILocation(line: 101, column: 1, scope: !{{.*}})
>> +; EPILOG-DAG: [[BODY_LOC]] = !DILocation(line: 102, column: 1, scope: !{{.*}})
>> +; EPILOG-DAG: [[EXIT_LOC]] = !DILocation(line: 103, column: 1, scope: !{{.*}})
>> +
>> +; PROLOG: for.body.preheader:
>> +; PROLOG:   br {{.*}} label %for.body.prol.preheader, label %for.body.prol.loopexit, !dbg [[PH_LOC:![0-9]+]]
>> +; PROLOG: for.body.prol:
>> +; PROLOG:   br label %for.body.prol.loopexit, !dbg [[BODY_LOC:![0-9]+]]
>> +; PROLOG: for.body.prol.loopexit:
>> +; PROLOG:   br {{.*}} label %for.end.loopexit, label %for.body.preheader.new, !dbg [[PH_LOC]]
>> +; PROLOG: for.body:
>> +; PROLOG:   br i1 %exitcond.1, label %for.end.loopexit.unr-lcssa, label %for.body, !dbg [[BODY_LOC]]
>> +; PROLOG-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body
>> +
>> +; PROLOG-DAG: [[PH_LOC]] = !DILocation(line: 101, column: 1, scope: !{{.*}})
>> +; PROLOG-DAG: [[BODY_LOC]] = !DILocation(line: 102, column: 1, scope: !{{.*}})
>>
>> define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly !dbg !6 {
>> entry:
>>
>> Modified: llvm/trunk/test/Transforms/LoopUnroll/runtime-loop2.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/runtime-loop2.ll?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopUnroll/runtime-loop2.ll (original)
>> +++ llvm/trunk/test/Transforms/LoopUnroll/runtime-loop2.ll Tue Apr  5 07:19:35 2016
>> @@ -1,12 +1,18 @@
>> -; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-count=8 | FileCheck %s
>> +; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-count=8 | FileCheck %s  -check-prefix=EPILOG
>> +; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
>>
>> ; Choose a smaller, power-of-two, unroll count if the loop is too large.
>> ; This test makes sure we're not unrolling 'odd' counts
>>
>> -; CHECK: for.body.prol:
>> -; CHECK: for.body:
>> -; CHECK: br i1 %exitcond.3, label %for.end.loopexit{{.*}}, label %for.body
>> -; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body
>> +; EPILOG: for.body:
>> +; EPILOG: br i1 %niter.ncmp.3, label %for.end.loopexit.unr-lcssa.loopexit{{.*}}, label %for.body
>> +; EPILOG-NOT: br i1 %niter.ncmp.4, label %for.end.loopexit.unr-lcssa.loopexit{{.*}}, label %for.body
>> +; EPILOG: for.body.epil:
>> +
>> +; PROLOG: for.body.prol:
>> +; PROLOG: for.body:
>> +; PROLOG: br i1 %exitcond.3, label %for.end.loopexit{{.*}}, label %for.body
>> +; PROLOG-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body
>>
>> define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
>> entry:
>>
>> Modified: llvm/trunk/test/Transforms/LoopUnroll/runtime-loop4.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/runtime-loop4.ll?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopUnroll/runtime-loop4.ll (original)
>> +++ llvm/trunk/test/Transforms/LoopUnroll/runtime-loop4.ll Tue Apr  5 07:19:35 2016
>> @@ -1,13 +1,21 @@
>> -; RUN: opt < %s -S -O2 -unroll-runtime=true | FileCheck %s
>> +; RUN: opt < %s -S -O2 -unroll-runtime=true | FileCheck %s -check-prefix=EPILOG
>> +; RUN: opt < %s -S -O2 -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
>>
>> ; Check runtime unrolling prologue can be promoted by LICM pass.
>>
>> -; CHECK: entry:
>> -; CHECK: %xtraiter
>> -; CHECK: %lcmp.mod
>> -; CHECK: loop1:
>> -; CHECK: br i1 %lcmp.mod
>> -; CHECK: loop2.prol:
>> +; EPILOG: entry:
>> +; EPILOG: %xtraiter
>> +; EPILOG: %lcmp.mod
>> +; EPILOG: loop1:
>> +; EPILOG: br i1 %lcmp.mod
>> +; EPILOG: loop2.epil:
>> +
>> +; PROLOG: entry:
>> +; PROLOG: %xtraiter
>> +; PROLOG: %lcmp.mod
>> +; PROLOG: loop1:
>> +; PROLOG: br i1 %lcmp.mod
>> +; PROLOG: loop2.prol:
>>
>> define void @unroll(i32 %iter, i32* %addr1, i32* %addr2) nounwind {
>> entry:
>>
>> Modified: llvm/trunk/test/Transforms/LoopUnroll/runtime-loop5.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/runtime-loop5.ll?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopUnroll/runtime-loop5.ll (original)
>> +++ llvm/trunk/test/Transforms/LoopUnroll/runtime-loop5.ll Tue Apr  5 07:19:35 2016
>> @@ -11,9 +11,6 @@ entry:
>>   %cmp1 = icmp eq i3 %n, 0
>>   br i1 %cmp1, label %for.end, label %for.body
>>
>> -; UNROLL-16-NOT: for.body.prol:
>> -; UNROLL-4: for.body.prol:
>> -
>> for.body:                                         ; preds = %for.body, %entry
>> ; UNROLL-16-LABEL: for.body:
>> ; UNROLL-4-LABEL: for.body:
>> @@ -39,6 +36,10 @@ for.body:
>>
>> ; UNROLL-16-LABEL: for.end
>> ; UNROLL-4-LABEL: for.end
>> +
>> +; UNROLL-16-NOT: for.body.epil:
>> +; UNROLL-4: for.body.epil:
>> +
>> for.end:                                          ; preds = %for.body, %entry
>>   %sum.0.lcssa = phi i3 [ 0, %entry ], [ %add, %for.body ]
>>   ret i3 %sum.0.lcssa
>>
>> Modified: llvm/trunk/test/Transforms/LoopUnroll/tripcount-overflow.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/tripcount-overflow.ll?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopUnroll/tripcount-overflow.ll (original)
>> +++ llvm/trunk/test/Transforms/LoopUnroll/tripcount-overflow.ll Tue Apr  5 07:19:35 2016
>> @@ -13,13 +13,13 @@ target datalayout = "e-m:o-i64:64-f80:12
>> ; CHECK: entry:
>> ; CHECK-NEXT: %0 = add i32 %N, 1
>> ; CHECK-NEXT: %xtraiter = and i32 %0, 1
>> -; CHECK-NEXT: %lcmp.mod = icmp ne i32 %xtraiter, 0
>> -; CHECK-NEXT: br i1 %lcmp.mod, label %while.body.prol, label %entry.split
>> +; CHECK-NEXT: %lcmp.mod = icmp ne i32 %xtraiter, %0
>> +; CHECK-NEXT: br i1 %lcmp.mod, label %entry.new, label %while.end.unr-lcssa
>>
>> -; CHECK: while.body.prol:
>> -; CHECK: br label %entry.split
>> +; CHECK: while.body.epil:
>> +; CHECK: br label %while.end.epilog-lcssa
>>
>> -; CHECK: entry.split:
>> +; CHECK: while.end.epilog-lcssa:
>>
>> ; Function Attrs: nounwind readnone ssp uwtable
>> define i32 @foo(i32 %N) {
>>
>> Modified: llvm/trunk/test/Transforms/LoopUnroll/unroll-cleanup.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/unroll-cleanup.ll?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopUnroll/unroll-cleanup.ll (original)
>> +++ llvm/trunk/test/Transforms/LoopUnroll/unroll-cleanup.ll Tue Apr  5 07:19:35 2016
>> @@ -4,14 +4,14 @@
>> ; RUN: opt < %s -O2 -S | FileCheck %s
>>
>> ; After loop unroll:
>> -;       %dec18 = add nsw i32 %dec18.in, -1
>> +;       %niter.nsub = add nsw i32 %niter, -1
>> ;       ...
>> -;       %dec18.1 = add nsw i32 %dec18, -1
>> +;       %niter.nsub.1 = add nsw i32 %niter.nsub, -1
>> ; should be merged to:
>> -;       %dec18.1 = add nsw i32 %dec18.in, -2
>> +;       %niter.nsub.1 = add nsw i32 %niter, -2
>> ;
>> ; CHECK-LABEL: @_Z3fn1v(
>> -; CHECK: %dec18.1 = add nsw i32 %dec18.in, -2
>> +; CHECK: %niter.nsub.1 = add i32 %niter, -2
>>
>> ; ModuleID = '<stdin>'
>> target triple = "x86_64-unknown-linux-gnu"
>>
>> Modified: llvm/trunk/test/Transforms/LoopUnroll/unroll-pragmas.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/unroll-pragmas.ll?rev=265388&r1=265387&r2=265388&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopUnroll/unroll-pragmas.ll (original)
>> +++ llvm/trunk/test/Transforms/LoopUnroll/unroll-pragmas.ll Tue Apr  5 07:19:35 2016
>> @@ -171,10 +171,6 @@ for.end:
>> ; should be duplicated (original and 4x unrolled).
>> ;
>> ; CHECK-LABEL: @runtime_loop_with_count4(
>> -; CHECK: for.body.prol:
>> -; CHECK: store
>> -; CHECK-NOT: store
>> -; CHECK: br i1
>> ; CHECK: for.body
>> ; CHECK: store
>> ; CHECK: store
>> @@ -182,6 +178,10 @@ for.end:
>> ; CHECK: store
>> ; CHECK-NOT: store
>> ; CHECK: br i1
>> +; CHECK: for.body.epil:
>> +; CHECK: store
>> +; CHECK-NOT: store
>> +; CHECK: br i1
>> define void @runtime_loop_with_count4(i32* nocapture %a, i32 %b) {
>> entry:
>>   %cmp3 = icmp sgt i32 %b, 0
>> @@ -287,10 +287,6 @@ for.end:
>> ; (original and 8x).
>> ;
>> ; CHECK-LABEL: @runtime_loop_with_enable(
>> -; CHECK: for.body.prol:
>> -; CHECK: store
>> -; CHECK-NOT: store
>> -; CHECK: br i1
>> ; CHECK: for.body:
>> ; CHECK: store i32
>> ; CHECK: store i32
>> @@ -302,6 +298,10 @@ for.end:
>> ; CHECK: store i32
>> ; CHECK-NOT: store i32
>> ; CHECK: br i1
>> +; CHECK: for.body.epil:
>> +; CHECK: store
>> +; CHECK-NOT: store
>> +; CHECK: br i1
>> define void @runtime_loop_with_enable(i32* nocapture %a, i32 %b) {
>> entry:
>>   %cmp3 = icmp sgt i32 %b, 0
>> @@ -328,16 +328,16 @@ for.end:
>> ; should be duplicated (original and 3x unrolled).
>> ;
>> ; CHECK-LABEL: @runtime_loop_with_count3(
>> -; CHECK: for.body.prol:
>> -; CHECK: store
>> -; CHECK-NOT: store
>> -; CHECK: br i1
>> ; CHECK: for.body
>> ; CHECK: store
>> ; CHECK: store
>> ; CHECK: store
>> ; CHECK-NOT: store
>> ; CHECK: br i1
>> +; CHECK: for.body.epil:
>> +; CHECK: store
>> +; CHECK-NOT: store
>> +; CHECK: br i1
>> define void @runtime_loop_with_count3(i32* nocapture %a, i32 %b) {
>> entry:
>>   %cmp3 = icmp sgt i32 %b, 0
>>
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits


More information about the llvm-commits mailing list